
这个工具的 GitHub 地址是:https://github.com/ansjsun/ansj_seg ,需要查看源码的读者可以自行前往。下面说明如何使用这个工具提取关键词。





1. public class Keyword implements Comparable<Keyword> {  
2. private String name;  
3. private double score;  
4. private double idf;  
5. private int freq;  
7. public Keyword(String name, int docFreq, int weight) {  
8. this.name = name;  
9. this.idf = Math.log(10000 + 10000.0 / (docFreq + 1));  
10. this.score = idf * weight;  
11.         freq++;  
12.     }  
13. public void updateWeight(int weight) {  
14. this.score += weight * idf;  
15.         freq++;  
16.     }  
17. public int compareTo(Keyword o) {  
18. if (this.score < o.score) {  
19. return 1;  
20. else {  
21. return -1;  
22.         }  
24.     }  
25. public boolean equals(Object obj) {  
26. // TODO Auto-generated method stub  
27. if (obj instanceof Keyword) {  
28.             Keyword k = (Keyword) obj;  
29. return k.name.equals(name);  
30. else {  
31. return false;  
32.         }  
33.     }  
34. public String toString() {  
35. return name;  
36.     }  
37. //look here ******************************************************  
38. public double getScore(){  
39. return score;  
40.     }  
41. public int getFreq() {  
42. return freq;  
43.     }  
45. }


[java]  view plain  copy


1. import java.util.ArrayList;  
2. import java.util.Collection;  
3. import java.util.HashMap;  
4. import java.util.List;  
5. import java.util.Map;  
6. import java.util.TreeSet;  
8. import org.ansj.app.newWord.LearnTool;  
9. import org.ansj.domain.Term;  
10. import org.ansj.recognition.NatureRecognition;  
11. import org.ansj.splitWord.analysis.NlpAnalysis;  
13. public class KeyWordComputer {  
14. private int nKeyword = 10;  
15. //default constructor keyword number=10  
16. public KeyWordComputer() {  
17. 10;  
18.     }  
19. // constructor set keyword number  
20. public KeyWordComputer(int nKeyword) {  
21. this.nKeyword = nKeyword;  
23.     }  
24. //get keywords object list  
25. private List<Keyword> computeArticleTfidf(String content, int titleLength) {  
26. new HashMap<String, Keyword>();  
27. new LearnTool();  
28.         List<Term> parse = NlpAnalysis.parse(content, learn);  
29.         parse = NlpAnalysis.parse(content, learn);  
30. for (Term term : parse) {  
31. int weight = getWeight(term, content.length(), titleLength);  
32. if (weight == 0)

1. continue;  
2.             Keyword keyword = tm.get(term.getName());  
3. if (keyword == null) {  
4. new Keyword(term.getName(), term.getNatrue().allFrequency, weight);  
5.                 tm.put(term.getName(), keyword);  
6. else {  
7. 1);  
8.             }  
9.         }  
10. new TreeSet<Keyword>(tm.values());  
11. new ArrayList<Keyword>(treeSet);  
12. if (treeSet.size() < nKeyword) {  
13. return arrayList;  
14. else {  
15. return arrayList.subList(0, nKeyword);  
16.         }  
17.     }  
18. //get keywords,need title and content  
19. public Collection<Keyword> computeArticleTfidf(String title, String content) {  
20. return computeArticleTfidf(title + "\t" + content, title.length());  
21.     }  
22. //get keywords, just need content  
23. public Collection<Keyword> computeArticleTfidf(String content) {  
24. return computeArticleTfidf(content, 0);  
25.     }  
26. //get keywords weight  
27. private int getWeight(Term term, int length, int titleLength) {  
28. if (term.getName().matches("(?s)\\d.*")) {  
29. return 0;  
30.         }  
31. if (term.getName().trim().length() < 2) {  
32. return 0;  
33.         }  
34.         String pos = term.getNatrue().natureStr;  
35. if (!pos.startsWith("n") || "num".equals(pos)) {  
36. return 0;  
37.         }  
38. int weight = 0;  
39. if (titleLength > term.getOffe()) {  
40. return 20;  
41.         }  
42. // position  
43. double position = (term.getOffe() + 0.0) / length;  
44. if (position < 0.05)  
45. return 10;  
46. 5 - 5 * position);  
47. return weight;  
48.     }  
49. }



package com.zengxiaosen;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;

/**
 * Created by zengxiaosen on 16/10/17.
 * 提取关键词语
 */
public class KeyWordtest {
    public static void main(String[] args) throws IOException{
        String filePath = "./test-utf8.TXT";
        String tt = new String();
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(filePath)));
        String str;
        while((str = in.readLine()) != null){
            tt += str;

    public static void test1(String content){
        KeyWordComputer key = new KeyWordComputer(10);
        Iterator<Keyword> it = key.computeArticleTfidf(content).iterator();
            Keyword key2 = (Keyword)it.next();

测试时有一点需要注意:ANSJ 目前只支持 UTF-8 编码的字符串,使用其他编码的文本在运行时会报 Java GC 错误。作者解释说,这是因为处理过程中文本变成了乱码,程序把乱码都当成新词,最终导致内存耗尽,希望他在以后的版本中加以改进。最后给出输出结果,结果按权重从高到低排序:

[plain]  view plain  copy


    1. init ambiguity  waring :library/ambiguity.dic because : not find that file or can not to read !  
    2. init core library ok use time :3983  
    3. init ngram ok use time :2023  
    4. 屌丝528.8693014046396  
    5. 李毅202.62858475668492  
    6. 网络174.9965471938941  
    7. 球迷110.52413506982782  
    8. 群体110.52413506982782  
    9. 人人110.52413506982782  
    10. 名号101.31379048067551  
    11. 高富帅92.10390216212956  
    12. 满屏92.10390216212954  
    13. 网友92.1034458915232  
    14. *************************  
    15. 社会主义1446.0241004969153  
    16. 社会1326.289620837935  
    17. 中国1096.0347881537828  
    18. 人民1049.9792831633652  
    19. 文化874.9827359694709  
    20. 经济874.9827359694709  
    21. 特色847.3517022020139  
    22. 制度801.2999792562523  
    23. 体系746.0379117213383  
    24. 国家598.6723982949011