Table of Contents

  • 1. Project directory and links
  • 2. Alink Chinese sentiment analysis: Weibo comment sentiment analysis
  • 3. HanLP Chinese sentiment analysis
  • 4. SparkML Chinese sentiment classification (TBD)


1. Project directory and links

GitHub: https://github.com/Zhifa-Liu/EmotionClassDemo


  • cn.edu.neu.alink: Alink Chinese sentiment analysis
  • cn.edu.neu.bayes: a naive-Bayes sentiment classifier lightly adapted from https://github.com/marwincn/pubsenti-finder; its results seemed poor, so it is not covered here
  • cn.edu.neu.hanlp: HanLP Chinese sentiment analysis
  • cn.edu.neu.sparkml: SparkML Chinese sentiment analysis, TBD
  • cn.edu.neu.zoom.data: datasets used for Chinese sentiment analysis (text classification)
  • ChnSentiCorp Chinese sentiment corpus (Tan Songbo)
  • Sogou text classification corpus, mini version
  • Weibo comment sentiment dataset: weibo_senti_100k.csv
  • cn.edu.neu.zoom.model: saved sentiment analysis models

Download links for the ChnSentiCorp corpus (Tan Songbo) and the mini Sogou text classification corpus can be found at the address below; the Weibo dataset turns up with a quick web search:
https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90
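
The code in the next two sections reads dataset and model paths from per-package cons.ClassifierConstant classes that are not shown in this post. A minimal sketch of the Alink one, assuming the project layout above (every path value here is a placeholder, not the repository's actual constant):

package cn.edu.neu.alink.cons;

/**
 * Hypothetical path constants for the Alink classifier below.
 * The real values live in the GitHub repository.
 */
public final class ClassifierConstant {
    private static final String BASE =
            System.getProperty("user.dir") + "/src/main/java/cn/edu/neu/zoom";

    public static final String DATASET_WEIBO_PATH = BASE + "/data/weibo_senti_100k.csv";
    public static final String WEIBO_NB_MODEL_PATH = BASE + "/model/weibo_nb.model";
    public static final String WEIBO_LR_MODEL_PATH = BASE + "/model/weibo_lr.model";

    private ClassifierConstant() { }
}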

2. Alink Chinese sentiment analysis: Weibo comment sentiment analysis

package cn.edu.neu.alink;

import cn.edu.neu.alink.cons.ClassifierConstant;
import com.alibaba.alink.operator.batch.BatchOperator;
import com.alibaba.alink.operator.batch.source.CsvSourceBatchOp;
import com.alibaba.alink.operator.batch.source.TextSourceBatchOp;
import com.alibaba.alink.pipeline.LocalPredictor;
import com.alibaba.alink.pipeline.Pipeline;
import com.alibaba.alink.pipeline.PipelineModel;
import com.alibaba.alink.pipeline.classification.LogisticRegression;
import com.alibaba.alink.pipeline.classification.NaiveBayesTextClassifier;
import com.alibaba.alink.pipeline.dataproc.Imputer;
import com.alibaba.alink.pipeline.nlp.DocCountVectorizer;
import com.alibaba.alink.pipeline.nlp.Segment;
import com.alibaba.alink.pipeline.nlp.StopWordsRemover;
import org.apache.flink.types.Row;

import java.io.File;
import java.util.List;

/**
 * @author 32098
 */
public class CommentClassifier {
    private static PipelineModel pipelineModel;

    public static void initNaiveBayesModel(){
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_NB_MODEL_PATH);
        if(pipelineModel==null){
            System.err.println("载入模型失败...");
            System.out.println("开始构建模型...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Fill missing values with the literal string "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    // Word segmentation
                    new Segment().setSelectedCol("featureText"),
                    // Remove stop words
                    new StopWordsRemover().setSelectedCol("featureText"),
                    /*
                     * TF, term frequency: the type of feature vector to generate
                     * https://www.yuque.com/pinshu/alink_doc/7a529b8564228c01c31f2fa58c43f782
                     */
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new NaiveBayesTextClassifier().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_NB_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; it is actually written out when BatchOperator.execute() runs
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    public static void initLogisticRegressionModel(){
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_LR_MODEL_PATH);
        if(pipelineModel==null){
            System.err.println("载入模型失败...");
            System.out.println("开始构建模型...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // Fill missing values with the literal string "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText").setStrategy("value").setFillValue("null"),
                    // Word segmentation
                    new Segment().setSelectedCol("featureText"),
                    // Remove stop words
                    new StopWordsRemover().setSelectedCol("featureText"),
                    /*
                     * TF, term frequency: the type of feature vector to generate
                     * https://www.yuque.com/pinshu/alink_doc/7a529b8564228c01c31f2fa58c43f782
                     */
                    new DocCountVectorizer().setFeatureType("TF").setSelectedCol("featureText").setOutputCol("featureVector"),
                    new LogisticRegression().setVectorCol("featureVector").setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_LR_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; it is actually written out when BatchOperator.execute() runs
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    private static BatchOperator<?> getCommentSourceOp(){
        return new CsvSourceBatchOp()
                .setFilePath(ClassifierConstant.DATASET_WEIBO_PATH)
                .setSchemaStr("label int, review string")
                .setIgnoreFirstLine(true);
    }

    public static String getClassification(String text){
        if(pipelineModel==null){
            System.err.println("As you didn't call initNaiveBayesModel() or initLogisticRegressionModel() before using getClassification(String text),\n" +
                    "we will call initNaiveBayesModel() to set value for our inner attribute (pipelineModel) to get your text's Classification");
            initNaiveBayesModel();
        }
        try {
            // 
            LocalPredictor localPredictor = pipelineModel.collectLocalPredictor("review string");
            // System.out.print(localPredictor.getOutputSchema());
            Row row = Row.of(text);
            return String.valueOf(localPredictor.map(row).getField(3));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void main(String[] args) throws Exception {
        // Batch evaluation via transform() is left commented out; LocalPredictor is used below instead
//        initNaiveBayesModel();
//        System.out.println("------------------------------");
//        TextSourceBatchOp textSourceBatchOp1 = new TextSourceBatchOp()
//                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/neg.txt".replace("/", File.separator))
//                .setTextCol("review");
//        pipelineModel.transform(textSourceBatchOp1).select(new String[]{"label", "pred", "review"}).sampleWithSize(20).print();
//
//        initLogisticRegressionModel();
//        System.out.println("------------------------------");
//        TextSourceBatchOp textSourceBatchOp2 = new TextSourceBatchOp()
//                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/pos.txt".replace("/", File.separator))
//                .setTextCol("review");
//        pipelineModel.transform(textSourceBatchOp2).select(new String[]{"label", "pred", "review"}).sampleWithSize(20).print();

        System.out.println(getClassification("你真好"));
        System.out.println(getClassification("哇哦今年的春夏季衣服不错诶"));

        TextSourceBatchOp textSourceBatchOp1 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/neg.txt".replace("/", File.separator))
                .setTextCol("review");
        TextSourceBatchOp textSourceBatchOp2 = new TextSourceBatchOp()
                .setFilePath(System.getProperty("user.dir")+"/src/main/java/cn/edu/neu/zoom/data/pos.txt".replace("/", File.separator))
                .setTextCol("review");
        List<Row> negRows = textSourceBatchOp1.getDataSet().collect();
        List<Row> posRows = textSourceBatchOp2.getDataSet().collect();

        int acc = 0;
        for (Row negRow : negRows) {
            // expected to be 0 (negative)
            String text = getClassification((String) negRow.getField(0));
            System.out.println(text);
            if("0".equals(text)){
                acc+=1;
            }
        }
        for (Row posRow : posRows) {
            // expected to be 1 (positive)
            String text = getClassification((String) posRow.getField(0));
            System.out.println(text);
            if("1".equals(text)){
                acc+=1;
            }
        }
        System.out.println("Acc: "+(double) acc/(negRows.size()+posRows.size()));
    }
}



  This classifier feels rather slow! One likely culprit: getClassification() rebuilds a LocalPredictor on every call; see the sketch below.
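
A minimal sketch of caching the predictor instead, as a drop-in addition to the CommentClassifier class above. This assumes the bottleneck really is the per-call collectLocalPredictor; the method name getClassificationCached is an addition, not code from the repository:

    private static LocalPredictor cachedPredictor;

    public static String getClassificationCached(String text) throws Exception {
        if (pipelineModel == null) {
            initNaiveBayesModel();
        }
        if (cachedPredictor == null) {
            // collectLocalPredictor() gathers the model data once; reuse it for every text
            cachedPredictor = pipelineModel.collectLocalPredictor("review string");
        }
        // Field 3 is the prediction column, as in getClassification above
        return String.valueOf(cachedPredictor.map(Row.of(text)).getField(3));
    }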

3. HanLP Chinese sentiment analysis

HanLP on GitHub: https://github.com/hankcs/HanLP/tree/doc-zh
HanLP Chinese sentiment analysis: https://github.com/hankcs/HanLP/wiki/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90#%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90

  Weibo comment sentiment analysis, hotel review sentiment analysis, and general text classification, implemented with HanLP's NaiveBayesClassifier and HanLPTokenizer:

package cn.edu.neu.hanlp;

import cn.edu.neu.hanlp.cons.ClassifierConstant;
import com.hankcs.hanlp.classification.classifiers.AbstractClassifier;
import com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier;
import com.hankcs.hanlp.classification.corpus.FileDataSet;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.models.AbstractModel;
import com.hankcs.hanlp.classification.models.NaiveBayesModel;
import com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer;

import java.io.*;
import java.util.Map;

/**
 * @author 32098
 */
public class HanLpClassifier {
    private static AbstractClassifier classifier = null;

    /**
     *
     * @param dataPath path to the training data: a CSV file or a directory of category folders
     * @param modelPath path for loading/saving the serialized model
     */
    public static void initClassifier(String dataPath, String modelPath){
        AbstractModel model = loadModel(modelPath);
        if(model==null){
            System.out.println("No model find, begin train model!");
            IDataSet dataSet = null;
            try {
                System.out.println(dataPath);

                File f = new File(dataPath);
                if(f.isFile()){
                    BufferedReader reader = new BufferedReader(new FileReader(dataPath));
                    String str;
                    dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer());
                    System.out.println("Prepare dataset!");
                    // skip the CSV header line
                    str = reader.readLine();
                    // each remaining line is "label,review" with a one-character label (0 or 1)
                    while ((str=reader.readLine())!=null){
                        dataSet.add(str.substring(0,1), str.substring(2));
                    }
                }else{
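                    // dataPath is a directory: HanLP's FileDataSet treats each subfolder
                    // name as a category label and each file inside as one document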
                    dataSet = new FileDataSet().setTokenizer(new HanLPTokenizer()).load(dataPath, "UTF-8");
                }
                System.out.println("Dataset prepared!");
            } catch (IOException e) {
                e.printStackTrace();
            }
            classifier = new NaiveBayesClassifier();
            classifier.train(dataSet);
            model = classifier.getModel();
            saveModel(modelPath, model);
        }else{
            System.out.println("NaiveBayesModel init succeeded!");
            classifier = new NaiveBayesClassifier((NaiveBayesModel) model);
        }
    }

    private static void saveModel(String modelPath, AbstractModel model){
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(modelPath))) {
            oos.writeObject(model);
            System.out.println("Save NaiveBayesModel Succeeded!");
        } catch (Exception e) {
            System.err.println("Save NaiveBayesModel Failed!");
            System.err.println(e.getMessage());
        }
    }

    private static AbstractModel loadModel(String modelPath){
        try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelPath))) {
            Object o = ois.readObject();
            return (AbstractModel) o;
        } catch (FileNotFoundException e) {
            System.err.println("Load NaiveBayesModel Failed(NaiveBayesModel file:" + modelPath+" not Found!)");
        } catch (Exception e) {
            System.err.println(e.getMessage());
        }
        return null;
    }

    public static Double getScoreOfWeiboComment(String sentence){
        if(classifier==null){
            System.err.println("Classifier is null, default using weibo comment data to init classifier");
            System.out.println("If you want to use different data to init classifier, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        Map<String, Double> map = classifier.predict(sentence);
        return map.get("1") - map.get("0");
    }

    public static String getClassification(String sentence) {
        if(classifier==null){
            System.err.println("Classifier is null, default using weibo comment data to init classifier");
            System.out.println("If you want to use different data to init classifier, call initClassifier first");
            initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        }
        Map<String, Double> map = classifier.predict(sentence);
        // System.out.println(map);
        return classifier.classify(sentence);
    }
}
A small test driver exercising the Weibo, hotel, and Sogou datasets:

package cn.edu.neu.hanlp;

import cn.edu.neu.hanlp.cons.ClassifierConstant;

/**
 * @author 32098
 *
 * Sentiment classification and Chinese text classification
 */
public class Test {
    public static void main(String[] args) {
        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_WEIBO_PATH, ClassifierConstant.WEIBO_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("天安门"));
        System.out.println(HanLpClassifier.getClassification("哇哦今年的春夏季衣服不错诶"));
        System.out.println(HanLpClassifier.getClassification("去死吧"));
        System.out.println(HanLpClassifier.getClassification("加油"));
        System.out.println(HanLpClassifier.getClassification("你真好"));
        System.out.println(HanLpClassifier.getScoreOfWeiboComment("你真好"));

        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_HOTEL_PATH, ClassifierConstant.HOTEL_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("酒店太差了"));

        HanLpClassifier.initClassifier(ClassifierConstant.DATASET_SOUGOU_PATH, ClassifierConstant.SOUGOU_MODEL_PATH);
        System.out.println(HanLpClassifier.getClassification("篮球、羽毛球"));
    }
}

  Run output: (screenshot in the original post)

4. SparkML Chinese sentiment classification (TBD)

Omitted for now; a rough sketch of what this section might look like follows.
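
A minimal sketch under stated assumptions: the same weibo_senti_100k.csv file, HanLP for word segmentation (Spark's Tokenizer only splits on whitespace), and hypothetical class and path names. This is not code from the repository:

package cn.edu.neu.sparkml;

import com.hankcs.hanlp.HanLP;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

import java.util.stream.Collectors;

import static org.apache.spark.sql.functions.callUDF;
import static org.apache.spark.sql.functions.col;

public class WeiboSentiment {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("weibo-senti").getOrCreate();

        // Segment Chinese text with HanLP and re-join with spaces so Spark's
        // whitespace-based Tokenizer can split it again downstream
        spark.udf().register("seg", (UDF1<String, String>) text ->
                HanLP.segment(text).stream().map(term -> term.word).collect(Collectors.joining(" ")),
                DataTypes.StringType);

        Dataset<Row> data = spark.read()
                .option("header", "true")
                .csv("src/main/java/cn/edu/neu/zoom/data/weibo_senti_100k.csv")  // hypothetical path
                .withColumn("label", col("label").cast("double"))
                .withColumn("segmented", callUDF("seg", col("review")));

        Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{
                new Tokenizer().setInputCol("segmented").setOutputCol("words"),
                new HashingTF().setInputCol("words").setOutputCol("features"),
                new LogisticRegression().setLabelCol("label").setFeaturesCol("features")
        });

        // Hold out 20% of the data for a rough accuracy check
        Dataset<Row>[] split = data.randomSplit(new double[]{0.8, 0.2}, 42L);
        PipelineModel model = pipeline.fit(split[0]);
        model.transform(split[1]).select("label", "prediction", "review").show(20);

        spark.stop();
    }
}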