本文共 2471 字,大约阅读时间需要 8 分钟。
接着学习wvtool,实现wvtool的分词功能,话不多说,直接上代码吧!
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileReader;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.Reader;
- import java.io.StringReader;
-
- import edu.udo.cs.wvtool.config.WVTConfiguration;
- import edu.udo.cs.wvtool.generic.inputfilter.SelectingInputFilter;
- import edu.udo.cs.wvtool.generic.loader.UniversalLoader;
- import edu.udo.cs.wvtool.generic.tokenizer.NGramTokenizer;
- import edu.udo.cs.wvtool.generic.tokenizer.SimpleTokenizer;
- import edu.udo.cs.wvtool.generic.wordfilter.StopWordFilterFile;
- import edu.udo.cs.wvtool.main.WVTDocumentInfo;
- import edu.udo.cs.wvtool.util.TokenEnumeration;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- public class Demo01_1 {
- public static String pathString="D:\\工作管理\\weka学习\\wvtool-1.1\\wvtool-1.1\\examples\\data\\";
- public static void main(String[] args) throws Exception {
-
-
-
- UniversalLoader loader=new UniversalLoader();
-
-
-
- WVTDocumentInfo info=new WVTDocumentInfo(pathString+"text.html", "html", "utf-8", "chinese");
-
-
-
- InputStream stream=loader.loadDocument(info);
-
-
-
-
-
-
-
-
-
- SelectingInputFilter filter=new SelectingInputFilter();
-
-
-
- Reader readers=filter.convertToPlainText(stream,info);
-
-
-
- BufferedReader reader=new BufferedReader(readers);
-
- String string=reader.readLine().toString();
-
-
- string=string.replace(string.valueOf((char)9), "");
-
- Reader reader2=new StringReader(string);
-
-
-
-
-
-
- NGramTokenizer tokenizer=new NGramTokenizer(2,new SimpleTokenizer());
-
- TokenEnumeration enumeration=tokenizer.tokenize(reader2, info);
-
- while(enumeration.hasMoreTokens()){
- System.out.print(enumeration.nextToken()+"|");
- }
-
- }
现在想想,其实还真不怎么难……
如果真的想实现文本的分词,我个人觉得IK Analyzer和ICTCLAS不错,特别是ICTCLAS,不仅能分词,还实现了词性标注和人名识别!
转载地址:http://lsppx.baihongyu.com/