基于Python的电影评论数据分析论文 python电影评论的情感分析

转载

mob64ca140c3859 2023-09-04 14:43:47

文章标签 python 深度学习 tensorflow rnn 算法 文章分类 Python 后端开发

情感分析涉及分析句子或文本文档所表达的想法。现用多对一的体系结构来实现多层RNN以用于情感分析。

根据输入和输出数据是否为序列，序列建模任务可以分为以下三种不同的类别：
·多对一：输入数据是一个序列，但输出数据不是序列而是固定的向量。例如，情感分析的输入基于文本，而输出是分类标签。
·一对多：输入数据是标准格式，不是序列，而输出数据是序列。一个例子是图像字幕，输入是图像，输出是英语短语。
·多对多：输入和输出数据都是序列。可以根据输入和输出是否同步来进一步划分该类别。同步多对多建模任务的例子是视频分类，即标记视频的每一帧。延迟多对多的例子是把一种语言翻译成另一种语言。例如，一个完整的英语句子必须先被机器完整地阅读和处理，然后才能翻译成德语。

import pandas as pd
import pyprind
from string import punctuation
import re
import numpy as np
import os
from collections import Counter
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
# import tensorflow as tf

# Load the movie-review dataset; the 'review' and 'sentiment' columns are
# used further down.
# The path is a raw string so the Windows-style backslash stays literal:
# '\m' is not a valid escape sequence and newer Python versions warn about it.
df = pd.read_csv(r'xxx\movie_data.csv',
                 encoding='utf-8')
print(df.head(3))

# Normalise every review and count how often each token occurs.
counts = Counter()
pbar = pyprind.ProgBar(len(df['review']), title='Counting words occurences')
# Build the translation table once: every punctuation character is wrapped
# in spaces so that str.split() later isolates it as its own token.
pad_punctuation = str.maketrans({c: ' ' + c + ' ' for c in punctuation})
cleaned_reviews = []
for review in df['review']:
    text = review.translate(pad_punctuation).lower()
    cleaned_reviews.append(text)
    pbar.update()
    # The Counter accumulates the frequency of every distinct token.
    counts.update(text.split())
# Write the cleaned text back with one column assignment instead of one
# label-based .loc write per row (slow, and only correct when the index
# labels happen to equal the row positions).
df['review'] = cleaned_reviews

## Build the vocabulary:
## every distinct token is mapped to an integer id.
# most_common() with no argument lists all (token, count) pairs from the most
# to the least frequent; only the tokens are kept.
word_counts = [token for token, _ in counts.most_common()]
print(word_counts[:5])
# Ids start at 1, so the most frequent token gets id 1.
word_to_int = dict(zip(word_counts, range(1, len(word_counts) + 1)))

# Encode each review as the list of integer ids of its tokens.
mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']), title='Map reviews to ints')
for review in df['review']:
    token_ids = [word_to_int[token] for token in review.split()]
    mapped_reviews.append(token_ids)
    pbar.update()

# Build equal-length sequences so the data fits the RNN input shape:
# short reviews are left-padded with zeros, long reviews keep only their
# last `sequence_length` ids.
sequence_length = 200  ## sequence length (or T in our formulas)
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)
for i, row in enumerate(mapped_reviews):
    tail = row[-sequence_length:]
    # An empty review stays an all-zero row. Without this guard the slice
    # `-0:` would select the whole row and the assignment would raise a
    # broadcast error.
    if tail:
        sequences[i, -len(tail):] = tail

# The dataset is taken to be already shuffled, so the first 25000 rows are
# used for training and the remaining rows for testing.
# Labels are sliced by position: a label-based `df.loc[:25000]` includes the
# end label and would return 25001 labels for the 25000 training rows.
X_train = sequences[:25000, :]
y_train = df['sentiment'].values[:25000]
X_test = sequences[25000:, :]
y_test = df['sentiment'].values[25000:]

# Mini-batches
np.random.seed(123) # for reproducibility

## Function to generate minibatches:
def create_batch_generator(x, y=None, batch_size=64):
    """Yield consecutive, full-size mini-batches of ``x`` (and ``y``).

    Only complete batches are produced: the trailing ``len(x) % batch_size``
    items are dropped.

    Args:
        x: sliceable sequence of samples.
        y: optional sliceable sequence of targets, sliced in step with ``x``.
        batch_size: number of samples per batch.

    Yields:
        ``x_batch`` when ``y`` is None, otherwise ``(x_batch, y_batch)``.
    """
    # Floor division gives the number of complete batches.
    usable = (len(x) // batch_size) * batch_size
    x = x[:usable]
    starts = range(0, len(x), batch_size)
    if y is None:
        for lo in starts:
            yield x[lo:lo + batch_size]
    else:
        y = y[:usable]
        for lo in starts:
            yield x[lo:lo + batch_size], y[lo:lo + batch_size]

# 构建一个RNN模型
class SentimentRNN(object):
    """Many-to-one LSTM sentiment classifier built with the TF1 graph API.

    Graph layout: embedding lookup -> stacked LSTM cells with output dropout
    -> one-unit dense layer on the last time step -> sigmoid.

    Tensors and ops are fed and fetched by name ('tf_x:0', 'tf_y:0',
    'tf_keepprob:0', 'cost:0', 'train_op', 'probabilities:0', 'labels:0'),
    so the names given in ``build`` must stay in sync with ``train`` and
    ``predict``.

    Args:
        n_words: number of rows of the embedding matrix (vocabulary size
            including the padding id 0).
        seq_len: length of every input sequence.
        lstm_size: number of hidden units per LSTM layer.
        num_layers: number of stacked LSTM layers.
        batch_size: fixed batch size baked into the placeholders.
        learning_rate: learning rate passed to the Adam optimizer.
        embed_size: width of each embedding vector.
    """

    def __init__(self, n_words, seq_len=200,
                 lstm_size=256, num_layers=1, batch_size=64,
                 learning_rate=0.0001, embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size  ## number of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size

        # Everything lives in a private graph so several models can coexist.
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    def build(self):
        """Create placeholders, network, loss and training op in the current graph."""
        ## Define the placeholders
        tf_x = tf.placeholder(tf.int32,
                              shape=(self.batch_size, self.seq_len),
                              name='tf_x')
        # One-element tuple: `(self.batch_size)` without the comma is a plain int.
        tf_y = tf.placeholder(tf.float32,
                              shape=(self.batch_size,),
                              name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32,
                                     name='tf_keepprob')
        ## Create the embedding layer
        # The embedding is a trainable [n_words x embed_size] matrix whose
        # entries are initialised uniformly at random in [-1, 1).
        embedding = tf.Variable(
            tf.random_uniform(
                (self.n_words, self.embed_size),
                minval=-1, maxval=1),
            name='embedding')
        # embedding_lookup fetches, for every id in tf_x, the matching row
        # of the embedding matrix.
        embed_x = tf.nn.embedding_lookup(
            embedding, tf_x,
            name='embeded_x')

        ## Define LSTM cell and stack them together
        # Each layer is a BasicLSTMCell wrapped in a DropoutWrapper (dropout
        # on the cell output); MultiRNNCell stacks the num_layers layers.
        cells = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.BasicLSTMCell(self.lstm_size),
                output_keep_prob=tf_keepprob)
                for i in range(self.num_layers)])

        ## Define the initial state:
        # All-zero initial state of the RNN cells, one (c, h) pair per layer,
        # each of shape (batch_size, lstm_size).
        self.initial_state = cells.zero_state(
            self.batch_size, tf.float32)
        # Example output for batch_size=100, lstm_size=128:
        # << initial state >>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>,
        # h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)
        print('  << initial state >> ', self.initial_state)

        # dynamic_rnn unrolls the stacked cells over the embedded sequence,
        # starting from initial_state. The returned final state is fed back
        # as the initial state of the next mini-batch by train() and
        # predict().
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            cells, embed_x,
            initial_state=self.initial_state)
        ## Note: lstm_outputs shape:
        # Reference:
        #   https://www.jianshu.com/p/79021e23d683?utm_campaign=maleskine&utm_content=note&utm_medium=seo_notes&utm_source=recommendation
        ##  [batch_size, max_time, cells.output_size]
        # << lstm_output   >>  Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)
        print('\n  << lstm_output   >> ', lstm_outputs)
        # << final state   >>  (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>,
        # h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)
        print('\n  << final state   >> ', self.final_state)

        ## Apply a FC layer after on top of RNN output:
        # Only the output of the last time step is classified (many-to-one).
        logits = tf.layers.dense(
            inputs=lstm_outputs[:, -1],  # (batch_size, lstm_size)
            units=1, activation=None,
            name='logits')

        # Drop only the trailing unit axis: (batch_size, 1) -> (batch_size,).
        # An unrestricted squeeze would also remove the batch axis when
        # batch_size == 1 and no longer match the shape of tf_y.
        logits = tf.squeeze(logits, axis=1, name='logits_squeezed')
        # << logits        >>  Tensor("logits_squeezed:0", shape=(100,), dtype=float32)
        print('\n  << logits        >> ', logits)

        # The sigmoid squashes the logits into the (0, 1) range.
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        # Class labels are the rounded probabilities cast to int32.
        predictions = {
            'probabilities': y_proba,
            'labels': tf.cast(tf.round(y_proba), tf.int32,
                              name='labels')
        }
        # << predictions   >>  {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>,
        # 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}
        print('\n  << predictions   >> ', predictions)

        ## Define the cost function
        # Mean sigmoid cross-entropy over the batch; it shrinks as the
        # predictions get closer to the labels.
        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf_y, logits=logits),
            name='cost')

        ## Define the optimizer
        # Adam: a gradient-based optimizer with adaptive moment estimates.
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    def train(self, X_train, y_train, num_epochs):
        """Train the model and write checkpoints under ``model/``.

        A checkpoint is written after every 10th epoch and after the last
        epoch, so ``predict`` always finds weights from this run.

        Args:
            X_train: integer id sequences, one row of length ``seq_len`` per sample.
            y_train: targets aligned with ``X_train``.
            num_epochs: number of passes over the training data.
        """
        # Make sure the checkpoint directory exists before training starts,
        # so a missing directory cannot break saving after hours of work.
        os.makedirs('model', exist_ok=True)
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                # Every epoch starts again from the all-zero state.
                state = sess.run(self.initial_state)

                for batch_x, batch_y in create_batch_generator(
                        X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': 0.5,
                            self.initial_state: state}
                    # The final state of this batch seeds the next batch.
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op',
                         self.final_state],
                        feed_dict=feed)

                    if iteration % 20 == 0:
                        print("Epoch: %d/%d Iteration: %d "
                              "| Train loss: %.5f" % (
                                  epoch + 1, num_epochs,
                                  iteration, loss))

                    iteration += 1
                # Also save after the final epoch: otherwise an epoch count
                # that is not a multiple of 10 leaves predict() with a stale
                # or missing checkpoint.
                if (epoch + 1) % 10 == 0 or epoch + 1 == num_epochs:
                    self.saver.save(sess,
                                    "model/sentiment-%d.ckpt" % epoch)

    def predict(self, X_data, return_proba=False):
        """Predict with the most recent checkpoint found in ``model/``.

        Like ``train``, the recurrent state is carried from one batch to the
        next. Only full batches are evaluated, so the result can be shorter
        than ``X_data`` (the trailing ``len(X_data) % batch_size`` rows are
        dropped by ``create_batch_generator``).

        Args:
            X_data: integer id sequences, one row of length ``seq_len`` per sample.
            return_proba: return sigmoid probabilities instead of 0/1 labels.

        Returns:
            1-D array with one prediction per evaluated sample.
        """
        preds = []
        with tf.Session(graph=self.g) as sess:
            # latest_checkpoint() locates the newest checkpoint in the directory.
            self.saver.restore(
                sess, tf.train.latest_checkpoint('model/'))
            test_state = sess.run(self.initial_state)
            for ii, batch_x in enumerate(
                    create_batch_generator(
                        X_data, None, batch_size=self.batch_size), 1):
                # keep_prob 1.0 disables dropout at inference time.
                feed = {'tf_x:0': batch_x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: test_state}
                if return_proba:
                    pred, test_state = sess.run(
                        ['probabilities:0', self.final_state],
                        feed_dict=feed)
                else:
                    pred, test_state = sess.run(
                        ['labels:0', self.final_state],
                        feed_dict=feed)

                preds.append(pred)

        return np.concatenate(preds)

## Train:
# n_words is the largest word id + 1; the extra slot is id 0, which pads
# sequences shorter than 200.
n_words = max(word_to_int.values()) + 1

model_config = dict(
    n_words=n_words,
    seq_len=sequence_length,
    embed_size=256,
    lstm_size=128,      # number of hidden units in each RNN layer
    num_layers=1,       # a single-layer RNN
    batch_size=100,
    learning_rate=0.001,
)
rnn = SentimentRNN(**model_config)

# Train the model for 40 epochs.
rnn.train(X_train, y_train, num_epochs=40)

## Test:
preds = rnn.predict(X_test)
# predict() may return fewer rows than it was given, so trim the labels to match.
y_true = y_test[:len(preds)]
accuracy = np.mean(preds == y_true)
print('Test Acc.: %.3f' % accuracy)

## Get probabilities:
proba = rnn.predict(X_test, return_proba=True)

运行时间较长，因为IMDb电影评论数据集很大。

运行结果：
review sentiment
0 In 1974, the teenager Martha Moxley (Maggie Gr… 1
1 OK… so… I really like Kris Kristofferson a… 0
2 SPOILER Do not read this, if you think a… 0

Counting words occurences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:35
['the', '.', ',', 'and', 'a']
Map reviews to ints
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02

<< initial state >> (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)

<< lstm_output >> Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)

<< final state >> (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)

<< logits >> Tensor("logits_squeezed:0", shape=(100,), dtype=float32)

<< predictions >> {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}

Epoch: 1/40 Iteration: 20 | Train loss: 0.70443
Epoch: 1/40 Iteration: 40 | Train loss: 0.58803
Epoch: 1/40 Iteration: 60 | Train loss: 0.64603
Epoch: 1/40 Iteration: 80 | Train loss: 0.55665
Epoch: 1/40 Iteration: 100 | Train loss: 0.53824
Epoch: 1/40 Iteration: 120 | Train loss: 0.54014
Epoch: 1/40 Iteration: 140 | Train loss: 0.60872
Epoch: 1/40 Iteration: 160 | Train loss: 0.54468
Epoch: 1/40 Iteration: 180 | Train loss: 0.56579
Epoch: 1/40 Iteration: 200 | Train loss: 0.46205
Epoch: 1/40 Iteration: 220 | Train loss: 0.36559
Epoch: 1/40 Iteration: 240 | Train loss: 0.46353
…
Epoch: 40/40 Iteration: 9760 | Train loss: 0.00081
Epoch: 40/40 Iteration: 9780 | Train loss: 0.00019
Epoch: 40/40 Iteration: 9800 | Train loss: 0.00124
Epoch: 40/40 Iteration: 9820 | Train loss: 0.00006
Epoch: 40/40 Iteration: 9840 | Train loss: 0.00017
Epoch: 40/40 Iteration: 9860 | Train loss: 0.00005
Epoch: 40/40 Iteration: 9880 | Train loss: 0.00036
Epoch: 40/40 Iteration: 9900 | Train loss: 0.00004
Epoch: 40/40 Iteration: 9920 | Train loss: 0.00032
Epoch: 40/40 Iteration: 9940 | Train loss: 0.00011
Epoch: 40/40 Iteration: 9960 | Train loss: 0.00173
Epoch: 40/40 Iteration: 9980 | Train loss: 0.00032
Epoch: 40/40 Iteration: 10000 | Train loss: 0.00009