好吧,之前用的是旧版的,现在出了个新版的,优先选择用新版的哈。
从官网下载相应的开发包,然后把以下几个文件添加到项目工程里:1. Data 文件夹 2. NLPIR_JNI.dll 3. NLPIR.jar 4. nlpir.properties
添加完上述文件后,主要需要配置的是 nlpir.properties 文件,内容大致如下:
dll_or_so_path=D\:\\Spiliter\\NLPIR_JNI.dll
data_dir_parent_path=.
主要需要配置 dll_or_so_path,其值为项目中 NLPIR_JNI.dll 的绝对路径(注意在 properties 文件中反斜杠要写成 \\,冒号可以写成 \:,如上面的示例所示)
搞定后就可以开始写代码了:
package shell;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Properties;

import kevin.zhang.NLPIR;

/**
 * Thin wrapper around the NLPIR JNI binding that segments text line by line.
 *
 * <p>Text is handed to the native library as GBK bytes and the result is decoded
 * as GBK; files are read and written as UTF-8.
 */
public class Spliter {

    /** Handle to the native NLPIR binding, created and initialised in the constructor. */
    NLPIR nlpir = null;

    /**
     * Creates the NLPIR binding and initialises it with the path {@code "././"}.
     *
     * <p>An initialisation failure is only reported on standard output; the instance
     * is still returned to the caller.
     */
    public Spliter() {
        this.nlpir = new NLPIR();
        // NOTE(review): presumably the directory that contains the NLPIR "Data" folder,
        // resolved against the working directory - confirm against the NLPIR docs.
        String argu = "././";
        try {
            // NOTE(review): the second argument (0) is presumably the encoding selector
            // and the third the licence code - confirm against the NLPIR docs.
            boolean initialised =
                    this.nlpir.NLPIR_Init(argu.getBytes("GB2312"), 0, "0".getBytes("GB2312"));
            if (!initialised) {
                System.out.println("init failed");
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }

    /**
     * Segments every line of a UTF-8 text file and writes the result to a sibling file
     * whose name has {@code _new} inserted before the extension
     * (for example {@code w.txt} becomes {@code w_new.txt}).
     *
     * <p>Each input line is echoed to standard output. Output lines are terminated with
     * {@code \r\n}. I/O errors are printed to standard error and not propagated.
     *
     * @param file path of the input file
     * @param tag  passed through unchanged to {@link #getSplitString(String, int)}
     */
    public void SliptFile(String file, int tag) {
        String encoding = "utf8";
        String of = outputPathFor(file);
        // The input is opened first so that a missing input file does not leave an empty
        // output file behind; try-with-resources closes every stream even on failure.
        try (FileInputStream in = new FileInputStream(file);
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding));
             FileOutputStream out = new FileOutputStream(new File(of));
             Writer writer = new BufferedWriter(new OutputStreamWriter(out, encoding))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
                String newLine = getSplitString(line, tag);
                // A failed segmentation yields null; emit an empty line rather than
                // the literal text "null".
                if (newLine != null) {
                    writer.write(newLine);
                }
                writer.write("\r\n");
            }
        } catch (IOException e) {
            // FileNotFoundException is an IOException and is handled here as well.
            e.printStackTrace();
        }
    }

    /**
     * Derives the output path by inserting {@code _new} before the extension of the
     * file name. Dots inside directory names are left alone, and a name without an
     * extension simply gets {@code _new} appended, so the result never equals the input.
     */
    private static String outputPathFor(String file) {
        int lastSeparator = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\'));
        int extensionDot = file.lastIndexOf('.');
        if (extensionDot > lastSeparator) {
            return file.substring(0, extensionDot) + "_new" + file.substring(extensionDot);
        }
        return file + "_new";
    }

    /**
     * Segments one piece of text with the native library and prints the result to
     * standard output.
     *
     * @param text text to segment
     * @param tag  passed unchanged as the second argument of {@code NLPIR_ParagraphProcess}
     * @return the segmented text, an empty string when the native call returns nothing,
     *         or {@code null} if the GBK charset is unavailable
     */
    public String getSplitString(String text, int tag) {
        String splitStr = null;
        try {
            byte[] nativeBytes = nlpir.NLPIR_ParagraphProcess(text.getBytes("gbk"), tag);
            int length = (nativeBytes == null) ? 0 : nativeBytes.length;
            // The segmenter is written in C++, so the result ends with a '\0' terminator;
            // drop it (only when it is really there) or the decoded string is corrupted.
            if (length > 0 && nativeBytes[length - 1] == 0) {
                length--;
            }
            splitStr = (length == 0) ? "" : new String(nativeBytes, 0, length, "gbk");
            System.out.println("分词结果为: " + splitStr);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return splitStr;
    }

    /** Demo entry point: segments {@code d:\w.txt} with tag 0. */
    public static void main(String[] args) {
        Spliter s = new Spliter();
        s.SliptFile("d:\\w.txt", 0);
    }
}