# Train the model and find its best number of training epochs (4 epochs turned out to be best).
import pickle
from tensorflow.keras.layers import Flatten,Activation,Dense, SpatialDropout1D,Embedding,LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import jieba #用来分词
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the tokenization dictionary (maps a word to its integer index).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load 'word_dict.pickle' if it comes from a trusted source.
with open('word_dict.pickle', 'rb') as handle:
    word2index = pickle.load(handle)
### Prepare the data
MAX_FEATURES = 40002  # vocabulary size (passed to the Embedding layer as input_dim)
MAX_SENTENCE_LENGTH = 100  # every sentence is padded/truncated to this length

# Read the corpus in a single pass. Each non-empty line is "<comment>,<label>".
X = []  # one list of word indices per sample
labels = []  # integer class label per sample
with open("Corpus.txt", "r", encoding="utf-8", errors='ignore') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        # Split on the LAST comma only, so commas inside the comment text
        # do not break the two-value unpacking.
        comment, label = line.rsplit(",", 1)
        sentence = comment.replace(' ', '')
        # Words missing from the dictionary are mapped to the "UNK" index.
        seqs = [
            word2index[word] if word in word2index else word2index["UNK"]
            for word in jieba.cut(sentence)
        ]
        X.append(seqs)
        labels.append(int(label))
num_recs = len(X)  # number of samples
y = np.array(labels, dtype=float)

# Turn the index lists into a fixed-width matrix: longer sentences are
# truncated, shorter ones are padded with 0.
X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)
# One-hot encode the labels with pandas. The dtype is requested explicitly
# because pandas >= 2.0 would otherwise return boolean columns.
y1 = pd.get_dummies(y, dtype="float32").values
print(X.shape)
print(y1.shape)
# Train/test split (30% held out, fixed seed for reproducibility)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y1, test_size=0.3, random_state=0)
## Build the network
EMBEDDING_SIZE = 256  # word-vector dimension
HIDDEN_LAYER_SIZE = 128  # units in the first LSTM layer
BATCH_SIZE = 64  # samples per batch
NUM_EPOCHS = 10  # number of training epochs

# Create the model instance
model = Sequential()
# Word-embedding layer
model.add(Embedding(MAX_FEATURES, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH))
# Two stacked LSTMs; both return the full sequence so it can be flattened below
model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.1, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(Flatten())
model.add(Dense(2))  # two outputs matching the one-hot labels: [0, 1] or [1, 0]
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train using the NUM_EPOCHS constant (previously a hard-coded literal that
# could silently drift from the constant); the held-out split is used for
# validation after every epoch.
history = model.fit(
    Xtrain,
    ytrain,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(Xtest, ytest),
)
model.save('my_model.h5')
# Pull the per-epoch training/validation curves recorded by model.fit
metrics = history.history
acc, val_acc = metrics['accuracy'], metrics['val_accuracy']
loss, val_loss = metrics['loss'], metrics['val_loss']
epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

# One figure per metric: 'bo' markers for training, 'b' line for validation.
panels = (
    (acc, val_acc, 'Training acc', 'Validation acc',
     'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss',
     'Training and validation loss'),
)
for panel_no, (train_curve, val_curve, train_label, val_label, title) in enumerate(panels):
    if panel_no > 0:
        plt.figure()  # the first panel is drawn on pyplot's implicit figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(title)
    plt.legend()
plt.show()