train_model
# -*- coding: utf-8 -*- from pandas import read_csv import re import datetime import jieba_fast as jieba import pickle import numpy as np from keras.utils import np_utils from keras.layers import Embedding size=120 embedding_dim=size max_len=47 path_of_fstop='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt' path_of_train_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/train_set_450000.csv" path_of_test_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" path_of_add_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/add.csv" path_of_ciku='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/easy_ciku5.txt' '''=================================================读取停用词=======================================''' stpwrdlst = {} fstop = open(path_of_fstop, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================导入训练数据和验证数据模块============================''' root1=path_of_train_data root2=path_of_test_data traindata1=read_csv(root1).values[:,1:] testdata=read_csv(root2).values[:,:] adddata=read_csv(path_of_add_data).values[:,:] traindata2=np.vstack((traindata1,testdata)) add_data=[] for line in adddata: if len(line[0])<90: ind=[] ind.append(line[0]) ind.append(line[1]) ind.append(line[2]) ind.append(line[3]) add_data.append(ind) add_data=np.array(add_data) traindata2=np.vstack((traindata2,add_data)) print("导入数据成功!!!") '''============================================分词模块============================''' '''jieba分词载入词库''' jieba.load_userdict(path_of_ciku) #载入词库 pre_time=datetime.datetime.now() trainci=[] for line in traindata2: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) wenben=jieba.lcut(results,cut_all=False) for ci in wenben: if ci in stpwrdlst or ci.isspace(): wenben.remove(ci) trainci.append(wenben) post_time=datetime.datetime.now() print("训练集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) pre_time=datetime.datetime.now() testci=[] for line in testdata: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) wenben=jieba.lcut(results,cut_all=False) for ci in wenben: if ci in stpwrdlst or ci.isspace(): wenben.remove(ci) testci.append(wenben) post_time=datetime.datetime.now() print("验证集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) print("分词结束!!!") '''=======================导入W2V的model和ci_dic,准备输入数据============================''' pre_time=datetime.datetime.now() from gensim.models import word2vec w2v_model = word2vec.Word2Vec.load("save_model/W2V_CI.model") with open('save_model/CI_dic.pkl', 'rb') as f: ci_dic=pickle.load(f) post_time=datetime.datetime.now() print("导入w2v完毕,时间为: ",(post_time-pre_time).seconds*1.0) '''=======================建立训练数据和测试数据============================''' pre_time=datetime.datetime.now() train_x=[] test_x=[] for line in trainci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) if len(ls)==(max_len-1): break train_x.append(ls) for line in testci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) test_x.append(ls) train_x2=[] for ls in train_x: train_x2.append([0]*(max_len - len(ls))+ls) train_x=np.array(train_x2) test_x2=[] for ls in test_x: test_x2.append([0]*(max_len - len(ls))+ls) test_x=np.array(test_x2) 
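# The stop-word loops above call wenben.remove(ci) while iterating over wenben, which
# skips the token that follows each removed one, and the manual left padding
# [0]*(max_len - len(ls))+ls produces ragged rows whenever a test sequence is longer
# than max_len. A minimal sketch of the same preprocessing without those two pitfalls,
# assuming stpwrdlst, ci_dic and max_len as defined in this script:
from keras.preprocessing.sequence import pad_sequences

def filter_tokens(tokens, stopwords):
    # list comprehension: nothing is removed from the list while iterating over it
    return [t for t in tokens if t not in stopwords and not t.isspace()]

def to_padded_ids(token_lists, ci_dic, max_len):
    seqs = [[ci_dic[t] for t in tokens if t in ci_dic] for tokens in token_lists]
    # left-pad with 0 and left-truncate so every row has exactly max_len ids
    return pad_sequences(seqs, maxlen=max_len, padding='pre', truncating='pre', value=0)

# e.g. train_x = to_padded_ids([filter_tokens(t, stpwrdlst) for t in trainci], ci_dic, max_len)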
post_time=datetime.datetime.now() print("输入数据处理完毕,时间为: ",(post_time-pre_time).seconds*1.0) embedding_matrix = np.zeros((len(ci_dic) + 1, embedding_dim)) for word, i in ci_dic.items(): if word in w2v_model: embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32') embedding_layer = Embedding(len(ci_dic) + 1, embedding_dim, weights=[embedding_matrix],# 表示直接使用预训练的词向量 input_length=max_len, trainable=True)# False表示不对词向量微调 '''=========================导入label,处理标签===============================''' with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) k=len(Label) pre_time=datetime.datetime.now() trainy=traindata2[:,1:] testy=testdata[:,1:] train_y=[] for line in trainy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) train_y.append(ls) test_y=[] for line in testy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) test_y.append(ls) with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) k=len(Label) train_y2=[] for la in train_y: la=la[0] train_y2.append(Label[la]) test_y2=[] for la in test_y: la=la[0] test_y2.append(Label[la]) train_y2=np.array(train_y2) test_y2=np.array(test_y2) '''把标签变为k个长度,若为1,则在1处为1,剩下的都标为0,k为标签个数''' train_y = np_utils.to_categorical(train_y2, num_classes=k) test_y = np_utils.to_categorical(test_y2,num_classes=k) post_time=datetime.datetime.now() print("标签处理完毕,时间为: ",(post_time-pre_time).seconds*1.0) '''=========================★★★★★★★建立五个模型★★★★★★★===============================''' from keras.models import Sequential,Model#按层 from keras.layers import Dense,LSTM,GlobalAveragePooling1D,Bidirectional,Activation,BatchNormalization, Flatten,Dropout,Conv1D,MaxPooling1D,concatenate,Input import matplotlib.pyplot as plt import keras from keras.callbacks import ReduceLROnPlateau,EarlyStopping from keras import regularizers def Fasttext(): #loss: 0.0412 - acc: 0.9935 - val_loss: 0.6146 - val_acc: 0.8718 #test accuracy: 0.87298 model = Sequential() model.add(embedding_layer) model.add(GlobalAveragePooling1D()) model.add(Dense(k, activation='softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) return model def Bi_lstm(): L2=1 model = Sequential() model.add(embedding_layer) model.add(Dropout(0.2)) model.add(Bidirectional(LSTM(256, return_sequences=True), merge_mode='concat')) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Flatten()) model.add(Dense(256,activation='relu',kernel_regularizer=regularizers.l2(L2))) model.add(BatchNormalization()) model.add(Dense(k)) model.add(Activation('softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) #Adamax return model def TextCNN(): L2=0.01 seq = Input(shape=[max_len], name='x_seq') emb_comment = embedding_layer(seq) convs = [] filter_sizes=[7,5,3,1] for fsz in filter_sizes: l_conv = Conv1D(filters=256, kernel_size=fsz)(emb_comment) l_re = Activation('relu')(l_conv) l_pool = MaxPooling1D(max_len - fsz + 1)(l_re) l_pool = Flatten()(l_pool) convs.append(l_pool) merge = concatenate(convs, axis=1) merge=BatchNormalization()(merge) out= Dense(k,activation='softmax',kernel_regularizer=regularizers.l2(L2))(merge) model =Model([seq],out) model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001),metrics=['acc']) return model def Lstm(): Adam_lr=0.001 l2=0.01 model = Sequential() model.add(embedding_layer) model.add(BatchNormalization()) model.add(LSTM(512, input_shape=(max_len,size))) 
model.add(Dense(512,activation='relu')) model.add(Dense(256,activation='relu')) model.add(BatchNormalization()) model.add(Dense(k,activation='softmax',kernel_regularizer=regularizers.l2(l2))) model.summary() model.compile(loss = 'categorical_crossentropy',optimizer=keras.optimizers.Adam(lr=Adam_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False), metrics=['accuracy'],) return model def model_evaluate(model,test_x,test_y): print("\nTesting~~~~~~~~~~") loss,accuracy = model.evaluate(test_x,test_y) print('\ntest loss:',loss) print('\ntest accuracy:', accuracy) def model_fit(model): hist=model.fit(train_x,train_y, epochs=50, batch_size=128,validation_split=0.1,shuffle=True,callbacks=[reduce_lr,early_stop]) #训练2大批,每批32个 plt.plot(hist.history['loss'], label='train_loss') plt.plot(hist.history['val_loss'], label='val_loss') plt.legend() plt.show() return model Adam_lr=0.001 Min_delta=Adam_lr*1.0/10 reduce_lr =ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=0, mode='auto',min_delta=Min_delta, cooldown=0, min_lr=0.000001) early_stop=EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto') model_E_Fasttext = Fasttext() model_name="save_model/model_E_Fasttext" model_E_Fasttext=model_fit(model_E_Fasttext) model_E_Fasttext.save(model_name,include_optimizer=False) model_evaluate(model_E_Fasttext,test_x,test_y) model_E_Bi_lstm = Bi_lstm() model_name="model_E_Bi_lstm" model_E_Bi_lstm=model_fit(model_E_Bi_lstm) model_E_Bi_lstm.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_Bi_lstm,test_x,test_y) print("Bi_lstm 保存成功") model_E_Lstm =Lstm() model_name="model_E_lstm" model_E_Lstm=model_fit(model_E_Lstm) model_E_Lstm.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_Lstm,test_x,test_y) print("Lstm 保存成功") model_E_TextCNN = TextCNN() model_name="model_E_TextCNN" model_E_TextCNN=model_fit(model_E_TextCNN) model_E_TextCNN.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_TextCNN,test_x,test_y) print("TextCNN 保存成功")
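# The four networks above are saved with include_optimizer=False, so a reloaded copy
# carries no training configuration and has to be compiled again before evaluate()
# can be called (predict() alone works on an uncompiled model). A minimal sketch,
# assuming the same test_x/test_y built earlier in this script:
from keras.models import load_model

reloaded = load_model("save_model/model_E_Fasttext")
reloaded.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
loss, acc = reloaded.evaluate(test_x, test_y)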
train_zi_model
# -*- coding: utf-8 -*- from pandas import read_csv import re import datetime import jieba_fast as jieba import pickle import numpy as np from keras.utils import np_utils from keras.layers import Embedding size=100 embedding_dim=size max_len=88 path_of_fstop='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt' path_of_train_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/train_set_450000.csv" path_of_test_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" '''=================================================读取停用词=======================================''' stpwrdlst = {} fstop = open(path_of_fstop, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================导入训练数据和验证数据模块============================''' root1=path_of_train_data root2=path_of_test_data traindata1=read_csv(root1).values[:,1:] testdata=read_csv(root2).values[:,:] traindata2=np.vstack((traindata1,testdata)) print("导入数据成功!!!") '''============================================分字模块============================''' pre_time=datetime.datetime.now() trainci=[] for line in traindata2: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) ls=[] for zi in results: if (zi!=' ') and (zi not in stpwrdlst) and zi.isspace()==False: ls.append(zi) trainci.append(ls) post_time=datetime.datetime.now() print("训练集分字结束,时间为: ",(post_time-pre_time).seconds*1.0) pre_time=datetime.datetime.now() testci=[] for line in testdata: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) ls=[] for zi in results: if (zi!=' ') and (zi not in stpwrdlst) and zi.isspace()==False: ls.append(zi) testci.append(ls) post_time=datetime.datetime.now() print("验证集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) print("分词结束!!!") '''=======================导入W2V的model和ci_dic,准备输入数据============================''' pre_time=datetime.datetime.now() from gensim.models import word2vec w2v_model = word2vec.Word2Vec.load("save_model/W2V_ZI.model") with open('save_model/ZI_dic.pkl', 'rb') as f: ci_dic=pickle.load(f) post_time=datetime.datetime.now() print("导入w2v完毕,时间为: ",(post_time-pre_time).seconds*1.0) '''=======================建立训练数据和测试数据============================''' pre_time=datetime.datetime.now() train_x=[] test_x=[] for line in trainci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) train_x.append(ls) for line in testci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) test_x.append(ls) train_x2=[] for ls in train_x: train_x2.append([0]*(max_len - len(ls))+ls) train_x=np.array(train_x2) test_x2=[] for ls in test_x: test_x2.append([0]*(max_len - len(ls))+ls) test_x=np.array(test_x2) post_time=datetime.datetime.now() print("输入数据处理完毕,时间为: ",(post_time-pre_time).seconds*1.0) embedding_matrix = np.zeros((len(ci_dic) + 1, embedding_dim)) for word, i in ci_dic.items(): if word in w2v_model: embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32') embedding_layer = Embedding(len(ci_dic) + 1, embedding_dim, weights=[embedding_matrix],# 表示直接使用预训练的词向量 input_length=max_len, trainable=True)# False表示不对词向量微调 '''=========================导入label,处理标签===============================''' with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) k=len(Label) 
pre_time=datetime.datetime.now() trainy=traindata2[:,1:] testy=testdata[:,1:] train_y=[] for line in trainy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) train_y.append(ls) test_y=[] for line in testy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) test_y.append(ls) with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) k=len(Label) train_y2=[] for la in train_y: la=la[0] train_y2.append(Label[la]) test_y2=[] for la in test_y: la=la[0] test_y2.append(Label[la]) train_y2=np.array(train_y2) test_y2=np.array(test_y2) '''把标签变为k个长度,若为1,则在1处为1,剩下的都标为0,k为标签个数''' train_y = np_utils.to_categorical(train_y2, num_classes=k) test_y = np_utils.to_categorical(test_y2,num_classes=k) post_time=datetime.datetime.now() print("标签处理完毕,时间为: ",(post_time-pre_time).seconds*1.0) '''=========================★★★★★★★建立五个模型★★★★★★★===============================''' from keras.models import Sequential,Model#按层 from keras.layers import Dense,LSTM,GlobalAveragePooling1D,Bidirectional,Activation,BatchNormalization, Flatten,Dropout,Conv1D,MaxPooling1D,concatenate,Input import matplotlib.pyplot as plt import keras from keras.callbacks import ReduceLROnPlateau,EarlyStopping from keras import regularizers def Fasttext(): model = Sequential() model.add(embedding_layer) model.add(GlobalAveragePooling1D()) model.add(Dense(k, activation='softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) return model def Bi_lstm(): L2=1 model = Sequential() model.add(embedding_layer) model.add(Dropout(0.2)) model.add(Bidirectional(LSTM(256, return_sequences=True), merge_mode='concat')) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Flatten()) model.add(Dense(256,activation='relu',kernel_regularizer=regularizers.l2(L2))) model.add(BatchNormalization())#he_uniform model.add(Dense(k)) model.add(Activation('softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) #Adamax return model def TextCNN(): L2=0.01 seq = Input(shape=[max_len], name='x_seq') emb_comment = embedding_layer(seq) convs = [] filter_sizes=[7,5,3,1] for fsz in filter_sizes: l_conv = Conv1D(filters=256, kernel_size=fsz)(emb_comment) l_re = Activation('relu')(l_conv) l_pool = MaxPooling1D(max_len - fsz + 1)(l_re) l_pool = Flatten()(l_pool) convs.append(l_pool) merge = concatenate(convs, axis=1) merge=BatchNormalization()(merge) out= Dense(k,activation='softmax',kernel_regularizer=regularizers.l2(L2))(merge) model =Model([seq],out) model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001),metrics=['acc']) return model def Lstm(): Adam_lr=0.001 l2=0.01 model = Sequential() model.add(embedding_layer) model.add(BatchNormalization()) model.add(LSTM(512, input_shape=(max_len,size))) model.add(Dense(512,activation='relu')) model.add(Dense(256,activation='relu')) model.add(BatchNormalization()) model.add(Dense(k,activation='softmax',kernel_regularizer=regularizers.l2(l2))) model.summary() model.compile(loss = 'categorical_crossentropy',optimizer=keras.optimizers.Adam(lr=Adam_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False), metrics=['accuracy'],) return model def model_evaluate(model,test_x,test_y): print("\nTesting~~~~~~~~~~") loss,accuracy = model.evaluate(test_x,test_y) print('\ntest loss:',loss) print('\ntest accuracy:', accuracy) def model_fit(model): hist=model.fit(train_x,train_y, epochs=100, 
batch_size=128,validation_split=0.1,shuffle=True,callbacks=[reduce_lr,early_stop]) #训练2大批,每批32个 plt.plot(hist.history['loss'], label='train_loss') plt.plot(hist.history['val_loss'], label='val_loss') plt.legend() plt.show() return model Adam_lr=0.001 Min_delta=Adam_lr*1.0/10 reduce_lr =ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=0, mode='auto',min_delta=Min_delta, cooldown=0, min_lr=0) early_stop=EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto') model_E_Fasttext = Fasttext() model_name="model_E_Fasttext_zi" model_E_Fasttext=model_fit(model_E_Fasttext) model_E_Fasttext.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_Fasttext,test_x,test_y) print("E_Fasttext 分类器保存成功") model_E_TextCNN = TextCNN() model_name="model_E_TextCNN_zi" model_E_TextCNN=model_fit(model_E_TextCNN) model_E_TextCNN.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_TextCNN,test_x,test_y) print("TextCNN 保存成功")
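# Both training scripts above fill embedding_matrix through `word in w2v_model` /
# `w2v_model[word]`, which goes through gensim's deprecated top-level lookup (removed
# in gensim 4.x). A minimal sketch of the same lookup through the KeyedVectors
# interface, assuming ci_dic ids start at 1 so that row 0 stays reserved for padding:
import numpy as np

embedding_matrix = np.zeros((len(ci_dic) + 1, embedding_dim))
for word, i in ci_dic.items():
    if word in w2v_model.wv:                 # works on gensim 3.x and 4.x
        embedding_matrix[i] = w2v_model.wv[word]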
get_dic_model
# -*- coding: utf-8 -*- from pandas import read_csv import re import datetime import jieba_fast as jieba import pickle '''============================================导入训练数据\验证数据\测试数据============================''' root1="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/train_set_450000.csv" root2="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" root3="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/test.tsv" traindata=read_csv(root1).values[:,:] traindata=traindata[:,1:] traindata_x=traindata[:,0] valdata=read_csv(root2).values[:,:] valdata=valdata[:,:] valdata_x=valdata[:,0] f = open(root3,'r',encoding='UTF-8') testdata_x = list() for line in open(root3,'r',encoding='UTF-8'): line = f.readline() line=line.replace("\n","") testdata_x.append(line) f.close() #合并数据成500w,进行下面的训练。 dataset_x_all=[] for i in range(len(testdata_x)): dataset_x_all.append(testdata_x[i]) #dataset_x_all=testdata_x for i in range(len(traindata_x)): dataset_x_all.append(traindata_x[i]) for i in range(len(valdata_x)): dataset_x_all.append(valdata_x[i]) all_ci=[] size=120 embedding_dim=size '''=================================================读取停用词=======================================''' stpwrdlst = {} stopword_path="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt" fstop = open(stopword_path, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================分词模块============================''' jieba.load_userdict('D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/easy_ciku5.txt') #载入词库 pre_time=datetime.datetime.now() train_all_x_ci=[] for line in dataset_x_all: wenben=line results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) wenben=jieba.lcut(results,cut_all=False) for ci in wenben: if ci in stpwrdlst or ci.isspace() or ci==' ' or ci=='\x08': wenben.remove(ci) train_all_x_ci.append(wenben) all_ci.extend(wenben) post_time=datetime.datetime.now() print("训练集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) # 337.0s '''=======================================建立词库===========================================''' all_ci=list(set(all_ci)) # 手动建立字典 ci_dic={} kk=1 for ci in all_ci: ci_dic[ci]=kk kk+=1 # 把字典存到本地 with open('save_model/CI_dic.pkl', 'wb') as f: pickle.dump(ci_dic, f, pickle.HIGHEST_PROTOCOL) print("词库字典保存成功!") '''====================================word2vec模型构建模块====================================''' pre_time=datetime.datetime.now() from gensim.models import word2vec model = word2vec.Word2Vec(sentences=train_all_x_ci, #sentences可以是分词列表,也可以是大语料 size=size,#特征向量的维度 alpha=0.04,#学习率 window=35,#一个句子内,当前词和预测词之间的最大距离 文本(window)大小:skip-gram通常在10附近,CBOW通常在5附近 min_count=0,#最低词频 没有大的变化 max_vocab_size=None, sample=0.0001, #随机下采样的阈值 seed=1,#随机数种子 workers=10,#进程数 min_alpha=0.00001,#学习率下降的最小值 sg=1, #训练算法的选择,sg=1,采用skip-gram,sg=0,采用CBOW---skip-gram(慢、对罕见字有利)vs CBOW(快) hs=1,# hs=1,采用hierarchica·softmax,hs=0,采用negative sampling #分层softmax(对罕见字有利)vs 负采样(对常见词和低纬向量有利) negative=0,#这个值大于0,使用negative sampling去掉'noise words'的个数(通常设置5-20);为0,不使用negative sampling #cbow_mean=1,#为0,使用词向量的和,为1,使用均值;只适用于cbow的情况 iter = 80,#迭代次数 null_word = 0, trim_rule = None, #裁剪词汇规则,使用None(会使用最小min_count) sorted_vocab =1,#对词汇降序排序 batch_words = 8192,#训练时,每一批次的单词数量 compute_loss = False, callbacks = ()) 
model.save("save_model/W2V_CI.model") # 保存模型 post_time=datetime.datetime.now() print("word2vec模型训练保存结束,时间为: ",(post_time-pre_time).seconds*1.0)#1106.0s '''======================================得到max_len===========================================''' max_len=0 train_x=[] for line in train_all_x_ci: ls=[] for ci in line: ls.append(ci_dic[ci]) max_len=max(max_len,len(ls)) train_x.append(ls) print("max_len是 ",max_len) '''===================================保存标签Label_dic和Re_Label_ci_dic========================''' pre_time=datetime.datetime.now() train_y=[] for i in range(len(traindata)): ls=[] label=str(traindata[i][1])+"--"+str(traindata[i][2])+"--"+str(traindata[i][3]) ls.append(label) train_y.append(ls) for i in range(len(valdata)): ls=[] label=str(valdata[i][1])+"--"+str(valdata[i][2])+"--"+str(valdata[i][3]) ls.append(label) train_y.append(ls) Label={} label_number=[] k=0 for la in train_y: la=la[0] if la not in Label: Label[la]=k k+=1 label_number.append(Label[la]) k=len(Label) with open('save_model/Label.pkl', 'wb') as f: pickle.dump(Label, f, pickle.HIGHEST_PROTOCOL) print("Label字典保存成功!") Re_Label=dict((map(reversed, Label.items()))) with open('save_model/Re_Label.pkl', 'wb') as f: pickle.dump(Re_Label, f, pickle.HIGHEST_PROTOCOL) print("Re_Label字典保存成功!") print("max_len是 ",max_len)
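# The test.tsv loop above opens the file twice and calls readline() on one handle while
# iterating over the other; it happens to read every line, but it is easy to confuse
# with the classic pattern of readline() on the handle being iterated (which drops
# every other line), and the same double-open appears in get_zi_dic_model below. A
# minimal sketch of an equivalent single-pass read, assuming the same root3 path:
testdata_x = []
with open(root3, 'r', encoding='UTF-8') as f:
    for line in f:
        testdata_x.append(line.rstrip("\n"))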
get_zi_dic_model
# -*- coding: utf-8 -*- from pandas import read_csv import re import datetime import pickle '''============================================导入训练数据\验证数据\测试数据============================''' root1="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/train_set_450000.csv" root2="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" root3="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/test.tsv" traindata=read_csv(root1).values[:,:] traindata=traindata[:,1:] traindata_x=traindata[:,0] valdata=read_csv(root2).values[:,:] valdata=valdata[:,:] valdata_x=valdata[:,0] f = open(root3,'r',encoding='UTF-8') testdata_x = list() for line in open(root3,'r',encoding='UTF-8'): line = f.readline() line=line.replace("\n","") testdata_x.append(line) f.close() #合并数据成500w,进行下面的训练。 dataset_x_all=[] for i in range(len(testdata_x)): dataset_x_all.append(testdata_x[i]) for i in range(len(traindata_x)): dataset_x_all.append(traindata_x[i]) for i in range(len(valdata_x)): dataset_x_all.append(valdata_x[i]) all_ci=[] size=100 embedding_dim=size '''=================================================读取停用词=======================================''' stpwrdlst = {} stopword_path="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt" fstop = open(stopword_path, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================分字模块============================''' pre_time=datetime.datetime.now() train_all_x_ci=[] for line in dataset_x_all: wenben=line results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) ls=[] for zi in results: if (zi!=' ') and (zi not in stpwrdlst) and zi.isspace()==False: ls.append(zi) train_all_x_ci.append(ls) all_ci.extend(ls) post_time=datetime.datetime.now() print("训练集分字结束,时间为: ",(post_time-pre_time).seconds*1.0) '''=======================================建立ZI库===========================================''' all_ci=list(set(all_ci)) # 手动建立字典 ci_dic={} kk=1 for ci in all_ci: ci_dic[ci]=kk kk+=1 # 把字典存到本地 with open('save_model/ZI_dic.pkl', 'wb') as f: pickle.dump(ci_dic, f, pickle.HIGHEST_PROTOCOL) print("词库字典保存成功!") '''====================================word2vec模型构建模块====================================''' pre_time=datetime.datetime.now() from gensim.models import word2vec model = word2vec.Word2Vec(sentences=train_all_x_ci, #sentences可以是分词列表,也可以是大语料 size=size,#特征向量的维度 alpha=0.025,#学习率 window=14,#一个句子内,当前词和预测词之间的最大距离 min_count=0,#最低词频 没有大的变化 max_vocab_size=None,# sample=0.0001, #随机下采样的阈值 seed=1,#随机数种子 workers=10,#进程数 min_alpha=0.0001,#学习率下降的最小值 sg=1, #训练算法的选择,sg=1,采用skip-gram,sg=0,采用CBOW hs=1,# hs=1,采用hierarchica·softmax,hs=0,采用negative sampling negative=10,#这个值大于0,使用negative sampling去掉'noise words'的个数(通常设置5-20);为0,不使用negative sampling #cbow_mean=1,#为0,使用词向量的和,为1,使用均值;只适用于cbow的情况 iter = 5,#迭代次数 null_word = 0, trim_rule = None, #裁剪词汇规则,使用None(会使用最小min_count) sorted_vocab =1,#对词汇降序排序 batch_words = 10000,#训练时,每一批次的单词数量 compute_loss = False, callbacks = ()) model.save("save_model/W2V_ZI.model") # 保存模型 post_time=datetime.datetime.now() print("word2vec模型训练保存结束,时间为: ",(post_time-pre_time).seconds*1.0) '''======================================得到max_len===========================================''' max_len=0 train_x=[] for line in train_all_x_ci: ls=[] for ci in line: ls.append(ci_dic[ci]) 
max_len=max(max_len,len(ls)) train_x.append(ls) print("max_len是 ",max_len)
out_put_result
# -*- coding: utf-8 -*-
# output_result.py -- exports the per-class probabilities of one word-level model;
# the saved files are consumed by DE_Ensemble.py
from pandas import read_csv
import re
import datetime
import jieba_fast as jieba
from keras.utils import np_utils
import numpy as np
import pandas as pd
import pickle
import keras
from keras.models import load_model

pre_time = datetime.datetime.now()
size = 120
max_len = 47
Adam_lr = 0.001  # learning rate used when the LSTM model was trained; needed to recompile it
path_of_fstop = 'D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt'
path_of_test_data = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv"
path_of_ciku = 'D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/easy_ciku5.txt'
#path_of_500_test = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/test.tsv"

# Load the word dictionary and the label dictionaries
with open('save_model/CI_dic.pkl', 'rb') as f:
    ci_dic = pickle.load(f)
with open('save_model/Re_Label.pkl', 'rb') as f:
    Re_Label = pickle.load(f)
with open('save_model/Label.pkl', 'rb') as f:
    Label = pickle.load(f)

# Load the network to export. The models were saved with include_optimizer=False, so
# each one must be compiled again; the last assignment to model/model_name is the one
# evaluated and exported below -- comment out the others as needed.
model = load_model('save_model/model_E_Bi_lstm')
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
model_name = "model_E_Bi_lstm"

model = load_model('save_model/model_E_Fasttext')
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
model_name = "model_E_Fasttext"

model = load_model('save_model/model_E_lstm')
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(lr=Adam_lr, beta_1=0.9, beta_2=0.999,
                                              epsilon=None, decay=0.0, amsgrad=False),
              metrics=['accuracy'])
model_name = "model_E_lstm"

model = load_model('save_model/model_E_TextCNN')
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001), metrics=['acc'])
model_name = "model_E_TextCNN"

'''=================================================Load stop words======================================='''
stpwrdlst = {}
fstop = open(path_of_fstop, encoding='utf-8', errors='ignore')
for eachWord in fstop:
    eachWord = eachWord.strip("\n")
    stpwrdlst[eachWord] = eachWord
fstop.close()
print("Stop words loaded.")

'''============================================Load the validation data============================'''
root = path_of_test_data
testdata = read_csv(root).values[:, :]
print("Data loaded.")

'''============================================Word segmentation============================'''
jieba.load_userdict(path_of_ciku)  # load the custom dictionary
pre_time = datetime.datetime.now()
testci = []
for line in testdata:
    wenben = line[0]
    results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+', ' ', wenben)
    wenben = jieba.lcut(results, cut_all=False)
    # filter stop words without mutating the list while iterating over it
    wenben = [ci for ci in wenben if ci not in stpwrdlst and not ci.isspace()]
    testci.append(wenben)
post_time = datetime.datetime.now()
print("Validation set segmented, time: ", (post_time - pre_time).seconds * 1.0)

# Build the one-hot labels for the validation data
k = len(Label)
testy = testdata[:, 1:]
test_y = []
for line in testy:
    # the "--" separator must match the one used when Label.pkl was built
    label = str(line[0]) + "--" + str(line[1]) + "--" + str(line[2])
    test_y.append([label])
test_y2 = []
for la in test_y:
    test_y2.append(Label[la[0]])
test_y2 = np.array(test_y2)
test_y = np_utils.to_categorical(test_y2, num_classes=k)

# Build the padded input sequences
test_x = []
for line in testci:
    ls = []
    for ci in line:
        if ci in ci_dic:
            ls.append(ci_dic[ci])
    ls = ls[:max_len]  # truncate, otherwise over-long rows would break the padding below
    test_x.append(ls)
test_x2 = []
for ls in test_x:
    test_x2.append([0] * (max_len - len(ls)) + ls)
test_x = np.array(test_x2)

loss, accuracy = model.evaluate(test_x, test_y)
print(accuracy)
predict = model.predict(test_x)
output = pd.DataFrame(predict)
# DE_Ensemble.py reads these files as model_E_*.csv, so save them with a .csv name
output.to_csv(model_name + ".csv")
out_put_result_zi
# -*- coding: utf-8 -*- output_result.py DE_Ensemble.py from pandas import read_csv import re import datetime import jieba_fast as jieba from keras.utils import np_utils import numpy as np import pandas as pd import pickle pre_time=datetime.datetime.now() import keras size=100 max_len=88 path_of_fstop='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt' path_of_test_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" path_of_ciku='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/easy_ciku5.txt' # 载入字典 with open('save_model/ZI_dic.pkl', 'rb') as f: ci_dic=pickle.load(f) # 加载模型 with open('save_model/Re_Label.pkl', 'rb') as f: Re_Label=pickle.load(f) with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) from keras.models import load_model model = load_model('save_model/model_E_TextCNN_zi') model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001),metrics=['acc']) model_name="model_E_TextCNN_zi" model = load_model('save_model/model_E_Fasttext_zi') model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) model_name="model_E_Fasttext_zi" '''=================================================读取停用词=======================================''' stpwrdlst = {} fstop = open(path_of_fstop, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================导入训练数据和验证数据模块============================''' root=path_of_test_data testdata=read_csv(root).values[:,:] print("导入数据成功!!!") '''============================================分词模块============================''' '''jieba分词载入词库''' jieba.load_userdict(path_of_ciku) #载入词库 pre_time=datetime.datetime.now() testci=[] for line in testdata: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) ls=[] for zi in results: if (zi!=' ') and (zi not in stpwrdlst) and zi.isspace()==False: ls.append(zi) testci.append(ls) post_time=datetime.datetime.now() print("训练集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) #建立测试数据的标签对照 k=len(Label) testy=testdata[:,1:] test_y=[] for line in testy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) test_y.append(ls) test_y2=[] for la in test_y: la=la[0] test_y2.append(Label[la]) test_y2=np.array(test_y2) test_y = np_utils.to_categorical(test_y2,num_classes=k) # 建立测试输入数据 test_x=[] for line in testci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) test_x.append(ls) test_x2=[] for ls in test_x: test_x2.append([0]*(max_len-len(ls))+ls) test_x=np.array(test_x2) loss,accuracy = model.evaluate(test_x,test_y) print(accuracy) predict = model.predict(test_x) output=pd.DataFrame(predict) output.to_csv(model_name+".csv")
DE_Ensemble
# -*- coding: utf-8 -*-
# DE_Ensemble.py -- differential evolution over the six ensemble weights, using the
# per-class probability files exported by output_result.py
from pandas import read_csv
import numpy as np
import pickle

path_of_true_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv"
path_of_fasttext_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_Fasttext.csv"
path_of_bi_lstm_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_Bi_lstm.csv"
path_of_lstm_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_lstm.csv"
path_of_textcnn_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_TextCNN.csv"
path_of_fasttext_zi_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_Fasttext_zi.csv"
path_of_textcnn_zi_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_TextCNN_zi.csv"

# Ground-truth labels of the validation set
true_result = read_csv(path_of_true_result).values[:, 1:]
test_y = []
for line in true_result:
    # the "--" separator must match the label strings stored in Re_Label
    test_y.append([str(line[0]) + "--" + str(line[1]) + "--" + str(line[2])])

# Per-class probabilities of the six base models
fasttext_result = np.array(read_csv(path_of_fasttext_result).values[:, 1:])
fasttext_zi_result = np.array(read_csv(path_of_fasttext_zi_result).values[:, 1:])
textcnn_result = np.array(read_csv(path_of_textcnn_result).values[:, 1:])
textcnn_zi_result = np.array(read_csv(path_of_textcnn_zi_result).values[:, 1:])
bi_lstm_result = np.array(read_csv(path_of_bi_lstm_result).values[:, 1:])
lstm_result = np.array(read_csv(path_of_lstm_result).values[:, 1:])

# index -> "level1--level2--level3" mapping
with open('save_model/Re_Label.pkl', 'rb') as f:
    Re_Label = pickle.load(f)


def evaluation(truedata, foredata):
    k = 0
    length = len(truedata)
    for i in range(length):
        if truedata[i][0] == foredata[i]:
            k += 1
    return k * 1.0 / length


def fitness(weights):
    """Accuracy of the weighted ensemble; weight order:
    fasttext, textcnn, bi_lstm, lstm, fasttext_zi, textcnn_zi."""
    blended = (weights[0] * fasttext_result + weights[1] * textcnn_result +
               weights[2] * bi_lstm_result + weights[3] * lstm_result +
               weights[4] * fasttext_zi_result + weights[5] * textcnn_zi_result)
    pred_ids = np.argmax(blended, axis=1)
    Y_pre = [Re_Label[idx] for idx in pred_ids]
    return evaluation(test_y, Y_pre)


population = []


def DE_function():
    inv_num = 50   # population size
    iter_num = 30  # number of generations
    pop_acc = []
    # Initialise inv_num individuals, one random weight per base model
    for i in range(inv_num):
        population.append([np.random.random() for _ in range(6)])
    # Fitness of the initial population
    for i in range(inv_num):
        res = fitness(population[i])
        print(res)
        pop_acc.append(res)
    # Differential evolution: mutation, crossover, selection
    while iter_num > 0:
        iter_num -= 1
        # independent copies, so mutation does not overwrite the parent generation
        population2 = [list(ind) for ind in population]  # mutant vectors
        population3 = [list(ind) for ind in population]  # trial vectors
        # Mutation (DE/rand/1 with F = 0.1)
        for i in range(inv_num):
            r1 = r2 = r3 = i
            while r1 == i or r2 == i or r3 == i or r2 == r1 or r3 == r1 or r3 == r2:
                r1 = np.random.randint(0, inv_num)  # random integer in [0, inv_num-1]
                r2 = np.random.randint(0, inv_num)
                r3 = np.random.randint(0, inv_num)
            for j in range(6):
                population2[i][j] = population[r1][j] + 0.1 * (population[r2][j] - population[r3][j])
                if population2[i][j] > 1 or population2[i][j] < 0:  # keep the weights inside [0, 1]
                    population2[i][j] = np.random.random()
        # Crossover (CR = 0.5)
        for i in range(inv_num):
            for j in range(6):
                r1 = np.random.randint(0, 6)
                r2 = np.random.random()
                if r2 < 0.5 or r1 == j:
                    population3[i][j] = population2[i][j]
                else:
                    population3[i][j] = population[i][j]
        print("Generation " + str(iter_num) + ": crossover finished")
        # Selection
        print("Starting selection")
        for i in range(inv_num):
            acc1 = fitness(population3[i])  # evaluate the trial vector, not the parent
            print(acc1, population3[i])
            if acc1 > pop_acc[i]:
                pop_acc[i] = acc1
                for j in range(6):
                    population[i][j] = population3[i][j]
    # Best individual
    best_id = int(np.argmax(pop_acc))
    acc2 = pop_acc[best_id]
    print(acc2)
    return (acc2, population[best_id][0], population[best_id][1], population[best_id][2],
            population[best_id][3], population[best_id][4], population[best_id][5])


acc, w1, w2, w3, w4, w5, w6 = DE_function()
print("last acc")
print(acc)
print(w1, w2, w3, w4, w5, w6)
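# The loop above hand-rolls DE/rand/1 (F = 0.1, CR = 0.5) over the six ensemble weights.
# An off-the-shelf alternative is scipy.optimize.differential_evolution; a minimal
# sketch, assuming the probability matrices, test_y, Re_Label and evaluation() defined
# in this script (SciPy minimizes, so the objective returns the negative accuracy):
from scipy.optimize import differential_evolution
import numpy as np

probs = [fasttext_result, textcnn_result, bi_lstm_result,
         lstm_result, fasttext_zi_result, textcnn_zi_result]

def neg_accuracy(weights):
    blended = sum(w * p for w, p in zip(weights, probs))
    pred_ids = np.argmax(blended, axis=1)
    return -evaluation(test_y, [Re_Label[i] for i in pred_ids])

result = differential_evolution(neg_accuracy, bounds=[(0, 1)] * 6,
                                maxiter=30, popsize=10, seed=1)
print(-result.fun, result.x)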
test_function
import re
import datetime
#import jieba_fast as jieba
import jieba
import numpy as np
import pickle
import keras
from keras.models import load_model

pre_time = datetime.datetime.now()
size = 120
max_len = 47
path_of_fstop = 'static/other_need_file/hlt_stop_words.txt'
path_of_ciku = 'static/other_need_file/easy_ciku5.txt'

# Load the word dictionary
with open('static/other_need_file/CI_dic.pkl', 'rb') as f:
    ci_dic = pickle.load(f)
# Load the label dictionaries
with open('static/other_need_file/Re_Label.pkl', 'rb') as f:
    Re_Label = pickle.load(f)
with open('static/other_need_file/Label.pkl', 'rb') as f:
    Label = pickle.load(f)

'''=================================================Load stop words======================================='''
stpwrdlst = {}
fstop = open(path_of_fstop, encoding='utf-8', errors='ignore')
for eachWord in fstop:
    eachWord = eachWord.strip("\n")
    stpwrdlst[eachWord] = eachWord
fstop.close()
print("Stop words loaded.")

'''============================================Segmentation and prediction============================'''
def class_good(testdata_x):
    """Classify a list of product titles; returns {title: predicted label}."""
    jieba.load_userdict(path_of_ciku)  # load the custom dictionary
    pre_time = datetime.datetime.now()
    testci = []
    for line in testdata_x:
        wenben = line  # each element of testdata_x is assumed to be a title string
        results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+', ' ', wenben)
        wenben = jieba.lcut(results, cut_all=False)
        # filter stop words without mutating the list while iterating over it
        wenben = [ci for ci in wenben if ci not in stpwrdlst and not ci.isspace()]
        testci.append(wenben)
    post_time = datetime.datetime.now()
    print("Segmentation finished, time: ", (post_time - pre_time).seconds * 1.0)

    # Build the padded input sequences
    pre_time = datetime.datetime.now()
    test_x = []
    for line in testci:
        ls = []
        for ci in line:
            if ci in ci_dic:
                ls.append(ci_dic[ci])
        ls = ls[:max_len]  # truncate, otherwise over-long rows would break the padding below
        test_x.append(ls)
    test_x2 = []
    for ls in test_x:
        test_x2.append([0] * (max_len - len(ls)) + ls)
    test_x = np.array(test_x2)
    post_time = datetime.datetime.now()
    print("Input data built, time: ", (post_time - pre_time).seconds * 1.0)

    # The models are loaded here rather than at module level, because once loaded they
    # must be used for prediction within the same Keras session.
    Adam_lr = 0.001
    keras.backend.clear_session()
    model_E_Fasttext = load_model('save_model/model_E_Fasttext')
    model_E_Fasttext.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    model_E_Bi_lstm = load_model('save_model/model_E_Bi_lstm')
    model_E_Bi_lstm.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    model_E_lstm = load_model('save_model/model_E_lstm')
    model_E_lstm.compile(loss='categorical_crossentropy',
                         optimizer=keras.optimizers.Adam(lr=Adam_lr, beta_1=0.9, beta_2=0.999,
                                                         epsilon=None, decay=0.0, amsgrad=False),
                         metrics=['accuracy'])
    model_E_TextCNN = load_model('save_model/model_E_TextCNN')
    model_E_TextCNN.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001), metrics=['acc'])
    # NOTE: the two character-level ("zi") models were trained on ZI_dic ids padded to
    # length 88; feeding them the word-level test_x only works if they were exported
    # without a fixed input length -- see the sketch after this script.
    model_E_Fasttext_zi = load_model('save_model/model_E_Fasttext_zi')
    model_E_Fasttext_zi.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    model_E_TextCNN_zi = load_model('save_model/model_E_TextCNN_zi')
    model_E_TextCNN_zi.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001), metrics=['acc'])

    # Per-model predictions
    predict_model_E_Fasttext = model_E_Fasttext.predict(test_x)
    predict_model_E_Bi_lstm = model_E_Bi_lstm.predict(test_x)
    predict_model_E_lstm = model_E_lstm.predict(test_x)
    predict_model_E_TextCNN = model_E_TextCNN.predict(test_x)
    predict_model_E_Fasttext_zi = model_E_Fasttext_zi.predict(test_x)
    predict_model_E_TextCNN_zi = model_E_TextCNN_zi.predict(test_x)  # the model object, not its name string

    # Weighted ensemble, using the weights found by DE_Ensemble.py
    predict = []
    for i in range(len(predict_model_E_Fasttext)):
        pred = (predict_model_E_Fasttext[i] * 0.8726803 +
                predict_model_E_TextCNN[i] * 0.9982083 +
                predict_model_E_Bi_lstm[i] * 0.1621914 +
                predict_model_E_lstm[i] * 0.9603058 +
                predict_model_E_Fasttext_zi[i] * 0.9507958 +
                predict_model_E_TextCNN_zi[i] * 0.7029668)
        predict.append(pred.tolist())

    # Map the argmax indices back to label strings
    predict2 = np.argmax(predict, axis=1)
    Y_pre = [Re_Label[idx] for idx in predict2]
    print("Prediction finished; results are in Y_pre")

    ind = {}
    for i in range(len(testdata_x)):
        if i < 5:
            print(Y_pre[i])
        ind[testdata_x[i]] = Y_pre[i]
    return ind
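# The two character-level ("zi") models used in class_good() were trained on ZI_dic ids
# padded to length 88, not on the word-level test_x. A minimal sketch of building the
# matching character-level input; the ZI_dic.pkl path below is hypothetical (assumed to
# be copied next to CI_dic.pkl), and max_len_zi mirrors train_zi_model:
with open('static/other_need_file/ZI_dic.pkl', 'rb') as f:  # hypothetical path
    zi_dic = pickle.load(f)
max_len_zi = 88

def build_zi_input(titles):
    rows = []
    for title in titles:
        cleaned = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+', ' ', title)
        ids = [zi_dic[z] for z in cleaned
               if z in zi_dic and z not in stpwrdlst and not z.isspace()]
        ids = ids[:max_len_zi]
        rows.append([0] * (max_len_zi - len(ids)) + ids)
    return np.array(rows)

# e.g. inside class_good():  test_x_zi = build_zi_input(testdata_x)
#                            predict_model_E_Fasttext_zi = model_E_Fasttext_zi.predict(test_x_zi)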
train_model
# -*- coding: utf-8 -*- from pandas import read_csv import re import datetime import jieba_fast as jieba import pickle import numpy as np from keras.utils import np_utils from keras.layers import Embedding size=120 embedding_dim=size max_len=47 path_of_fstop='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt' path_of_train_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/train_set_450000.csv" path_of_test_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" path_of_add_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/add.csv" path_of_ciku='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/easy_ciku5.txt' '''=================================================读取停用词=======================================''' stpwrdlst = {} fstop = open(path_of_fstop, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================导入训练数据和验证数据模块============================''' root1=path_of_train_data root2=path_of_test_data traindata1=read_csv(root1).values[:,1:] testdata=read_csv(root2).values[:,:] adddata=read_csv(path_of_add_data).values[:,:] traindata2=np.vstack((traindata1,testdata)) add_data=[] for line in adddata: if len(line[0])<90: ind=[] ind.append(line[0]) ind.append(line[1]) ind.append(line[2]) ind.append(line[3]) add_data.append(ind) add_data=np.array(add_data) traindata2=np.vstack((traindata2,add_data)) print("导入数据成功!!!") '''============================================分词模块============================''' '''jieba分词载入词库''' jieba.load_userdict(path_of_ciku) #载入词库 pre_time=datetime.datetime.now() trainci=[] for line in traindata2: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) wenben=jieba.lcut(results,cut_all=False) for ci in wenben: if ci in stpwrdlst or ci.isspace(): wenben.remove(ci) trainci.append(wenben) post_time=datetime.datetime.now() print("训练集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) pre_time=datetime.datetime.now() testci=[] for line in testdata: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) wenben=jieba.lcut(results,cut_all=False) for ci in wenben: if ci in stpwrdlst or ci.isspace(): wenben.remove(ci) testci.append(wenben) post_time=datetime.datetime.now() print("验证集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) print("分词结束!!!") '''=======================导入W2V的model和ci_dic,准备输入数据============================''' pre_time=datetime.datetime.now() from gensim.models import word2vec w2v_model = word2vec.Word2Vec.load("save_model/W2V_CI.model") with open('save_model/CI_dic.pkl', 'rb') as f: ci_dic=pickle.load(f) post_time=datetime.datetime.now() print("导入w2v完毕,时间为: ",(post_time-pre_time).seconds*1.0) '''=======================建立训练数据和测试数据============================''' pre_time=datetime.datetime.now() train_x=[] test_x=[] for line in trainci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) if len(ls)==(max_len-1): break train_x.append(ls) for line in testci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) test_x.append(ls) train_x2=[] for ls in train_x: train_x2.append([0]*(max_len - len(ls))+ls) train_x=np.array(train_x2) test_x2=[] for ls in test_x: test_x2.append([0]*(max_len - len(ls))+ls) test_x=np.array(test_x2) 
post_time=datetime.datetime.now() print("输入数据处理完毕,时间为: ",(post_time-pre_time).seconds*1.0) embedding_matrix = np.zeros((len(ci_dic) + 1, embedding_dim)) for word, i in ci_dic.items(): if word in w2v_model: embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32') embedding_layer = Embedding(len(ci_dic) + 1, embedding_dim, weights=[embedding_matrix],# 表示直接使用预训练的词向量 input_length=max_len, trainable=True)# False表示不对词向量微调 '''=========================导入label,处理标签===============================''' with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) k=len(Label) pre_time=datetime.datetime.now() trainy=traindata2[:,1:] testy=testdata[:,1:] train_y=[] for line in trainy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) train_y.append(ls) test_y=[] for line in testy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) test_y.append(ls) with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) k=len(Label) train_y2=[] for la in train_y: la=la[0] train_y2.append(Label[la]) test_y2=[] for la in test_y: la=la[0] test_y2.append(Label[la]) train_y2=np.array(train_y2) test_y2=np.array(test_y2) '''把标签变为k个长度,若为1,则在1处为1,剩下的都标为0,k为标签个数''' train_y = np_utils.to_categorical(train_y2, num_classes=k) test_y = np_utils.to_categorical(test_y2,num_classes=k) post_time=datetime.datetime.now() print("标签处理完毕,时间为: ",(post_time-pre_time).seconds*1.0) '''=========================★★★★★★★建立五个模型★★★★★★★===============================''' from keras.models import Sequential,Model#按层 from keras.layers import Dense,LSTM,GlobalAveragePooling1D,Bidirectional,Activation,BatchNormalization, Flatten,Dropout,Conv1D,MaxPooling1D,concatenate,Input import matplotlib.pyplot as plt import keras from keras.callbacks import ReduceLROnPlateau,EarlyStopping from keras import regularizers def Fasttext(): #loss: 0.0412 - acc: 0.9935 - val_loss: 0.6146 - val_acc: 0.8718 #test accuracy: 0.87298 model = Sequential() model.add(embedding_layer) model.add(GlobalAveragePooling1D()) model.add(Dense(k, activation='softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) return model def Bi_lstm(): L2=1 model = Sequential() model.add(embedding_layer) model.add(Dropout(0.2)) model.add(Bidirectional(LSTM(256, return_sequences=True), merge_mode='concat')) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Flatten()) model.add(Dense(256,activation='relu',kernel_regularizer=regularizers.l2(L2))) model.add(BatchNormalization()) model.add(Dense(k)) model.add(Activation('softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) #Adamax return model def TextCNN(): L2=0.01 seq = Input(shape=[max_len], name='x_seq') emb_comment = embedding_layer(seq) convs = [] filter_sizes=[7,5,3,1] for fsz in filter_sizes: l_conv = Conv1D(filters=256, kernel_size=fsz)(emb_comment) l_re = Activation('relu')(l_conv) l_pool = MaxPooling1D(max_len - fsz + 1)(l_re) l_pool = Flatten()(l_pool) convs.append(l_pool) merge = concatenate(convs, axis=1) merge=BatchNormalization()(merge) out= Dense(k,activation='softmax',kernel_regularizer=regularizers.l2(L2))(merge) model =Model([seq],out) model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001),metrics=['acc']) return model def Lstm(): Adam_lr=0.001 l2=0.01 model = Sequential() model.add(embedding_layer) model.add(BatchNormalization()) model.add(LSTM(512, input_shape=(max_len,size))) 
model.add(Dense(512,activation='relu')) model.add(Dense(256,activation='relu')) model.add(BatchNormalization()) model.add(Dense(k,activation='softmax',kernel_regularizer=regularizers.l2(l2))) model.summary() model.compile(loss = 'categorical_crossentropy',optimizer=keras.optimizers.Adam(lr=Adam_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False), metrics=['accuracy'],) return model def model_evaluate(model,test_x,test_y): print("\nTesting~~~~~~~~~~") loss,accuracy = model.evaluate(test_x,test_y) print('\ntest loss:',loss) print('\ntest accuracy:', accuracy) def model_fit(model): hist=model.fit(train_x,train_y, epochs=50, batch_size=128,validation_split=0.1,shuffle=True,callbacks=[reduce_lr,early_stop]) #训练2大批,每批32个 plt.plot(hist.history['loss'], label='train_loss') plt.plot(hist.history['val_loss'], label='val_loss') plt.legend() plt.show() return model Adam_lr=0.001 Min_delta=Adam_lr*1.0/10 reduce_lr =ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=0, mode='auto',min_delta=Min_delta, cooldown=0, min_lr=0.000001) early_stop=EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto') model_E_Fasttext = Fasttext() model_name="save_model/model_E_Fasttext" model_E_Fasttext=model_fit(model_E_Fasttext) model_E_Fasttext.save(model_name,include_optimizer=False) model_evaluate(model_E_Fasttext,test_x,test_y) model_E_Bi_lstm = Bi_lstm() model_name="model_E_Bi_lstm" model_E_Bi_lstm=model_fit(model_E_Bi_lstm) model_E_Bi_lstm.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_Bi_lstm,test_x,test_y) print("Bi_lstm 保存成功") model_E_Lstm =Lstm() model_name="model_E_lstm" model_E_Lstm=model_fit(model_E_Lstm) model_E_Lstm.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_Lstm,test_x,test_y) print("Lstm 保存成功") model_E_TextCNN = TextCNN() model_name="model_E_TextCNN" model_E_TextCNN=model_fit(model_E_TextCNN) model_E_TextCNN.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_TextCNN,test_x,test_y) print("TextCNN 保存成功")
train_zi_model
# -*- coding: utf-8 -*- from pandas import read_csv import re import datetime import jieba_fast as jieba import pickle import numpy as np from keras.utils import np_utils from keras.layers import Embedding size=100 embedding_dim=size max_len=88 path_of_fstop='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt' path_of_train_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/train_set_450000.csv" path_of_test_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" '''=================================================读取停用词=======================================''' stpwrdlst = {} fstop = open(path_of_fstop, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================导入训练数据和验证数据模块============================''' root1=path_of_train_data root2=path_of_test_data traindata1=read_csv(root1).values[:,1:] testdata=read_csv(root2).values[:,:] traindata2=np.vstack((traindata1,testdata)) print("导入数据成功!!!") '''============================================分字模块============================''' pre_time=datetime.datetime.now() trainci=[] for line in traindata2: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) ls=[] for zi in results: if (zi!=' ') and (zi not in stpwrdlst) and zi.isspace()==False: ls.append(zi) trainci.append(ls) post_time=datetime.datetime.now() print("训练集分字结束,时间为: ",(post_time-pre_time).seconds*1.0) pre_time=datetime.datetime.now() testci=[] for line in testdata: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) ls=[] for zi in results: if (zi!=' ') and (zi not in stpwrdlst) and zi.isspace()==False: ls.append(zi) testci.append(ls) post_time=datetime.datetime.now() print("验证集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) print("分词结束!!!") '''=======================导入W2V的model和ci_dic,准备输入数据============================''' pre_time=datetime.datetime.now() from gensim.models import word2vec w2v_model = word2vec.Word2Vec.load("save_model/W2V_ZI.model") with open('save_model/ZI_dic.pkl', 'rb') as f: ci_dic=pickle.load(f) post_time=datetime.datetime.now() print("导入w2v完毕,时间为: ",(post_time-pre_time).seconds*1.0) '''=======================建立训练数据和测试数据============================''' pre_time=datetime.datetime.now() train_x=[] test_x=[] for line in trainci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) train_x.append(ls) for line in testci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) test_x.append(ls) train_x2=[] for ls in train_x: train_x2.append([0]*(max_len - len(ls))+ls) train_x=np.array(train_x2) test_x2=[] for ls in test_x: test_x2.append([0]*(max_len - len(ls))+ls) test_x=np.array(test_x2) post_time=datetime.datetime.now() print("输入数据处理完毕,时间为: ",(post_time-pre_time).seconds*1.0) embedding_matrix = np.zeros((len(ci_dic) + 1, embedding_dim)) for word, i in ci_dic.items(): if word in w2v_model: embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32') embedding_layer = Embedding(len(ci_dic) + 1, embedding_dim, weights=[embedding_matrix],# 表示直接使用预训练的词向量 input_length=max_len, trainable=True)# False表示不对词向量微调 '''=========================导入label,处理标签===============================''' with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) k=len(Label) 
pre_time=datetime.datetime.now() trainy=traindata2[:,1:] testy=testdata[:,1:] train_y=[] for line in trainy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) train_y.append(ls) test_y=[] for line in testy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) test_y.append(ls) with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) k=len(Label) train_y2=[] for la in train_y: la=la[0] train_y2.append(Label[la]) test_y2=[] for la in test_y: la=la[0] test_y2.append(Label[la]) train_y2=np.array(train_y2) test_y2=np.array(test_y2) '''把标签变为k个长度,若为1,则在1处为1,剩下的都标为0,k为标签个数''' train_y = np_utils.to_categorical(train_y2, num_classes=k) test_y = np_utils.to_categorical(test_y2,num_classes=k) post_time=datetime.datetime.now() print("标签处理完毕,时间为: ",(post_time-pre_time).seconds*1.0) '''=========================★★★★★★★建立五个模型★★★★★★★===============================''' from keras.models import Sequential,Model#按层 from keras.layers import Dense,LSTM,GlobalAveragePooling1D,Bidirectional,Activation,BatchNormalization, Flatten,Dropout,Conv1D,MaxPooling1D,concatenate,Input import matplotlib.pyplot as plt import keras from keras.callbacks import ReduceLROnPlateau,EarlyStopping from keras import regularizers def Fasttext(): model = Sequential() model.add(embedding_layer) model.add(GlobalAveragePooling1D()) model.add(Dense(k, activation='softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) return model def Bi_lstm(): L2=1 model = Sequential() model.add(embedding_layer) model.add(Dropout(0.2)) model.add(Bidirectional(LSTM(256, return_sequences=True), merge_mode='concat')) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Flatten()) model.add(Dense(256,activation='relu',kernel_regularizer=regularizers.l2(L2))) model.add(BatchNormalization())#he_uniform model.add(Dense(k)) model.add(Activation('softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) #Adamax return model def TextCNN(): L2=0.01 seq = Input(shape=[max_len], name='x_seq') emb_comment = embedding_layer(seq) convs = [] filter_sizes=[7,5,3,1] for fsz in filter_sizes: l_conv = Conv1D(filters=256, kernel_size=fsz)(emb_comment) l_re = Activation('relu')(l_conv) l_pool = MaxPooling1D(max_len - fsz + 1)(l_re) l_pool = Flatten()(l_pool) convs.append(l_pool) merge = concatenate(convs, axis=1) merge=BatchNormalization()(merge) out= Dense(k,activation='softmax',kernel_regularizer=regularizers.l2(L2))(merge) model =Model([seq],out) model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001),metrics=['acc']) return model def Lstm(): Adam_lr=0.001 l2=0.01 model = Sequential() model.add(embedding_layer) model.add(BatchNormalization()) model.add(LSTM(512, input_shape=(max_len,size))) model.add(Dense(512,activation='relu')) model.add(Dense(256,activation='relu')) model.add(BatchNormalization()) model.add(Dense(k,activation='softmax',kernel_regularizer=regularizers.l2(l2))) model.summary() model.compile(loss = 'categorical_crossentropy',optimizer=keras.optimizers.Adam(lr=Adam_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False), metrics=['accuracy'],) return model def model_evaluate(model,test_x,test_y): print("\nTesting~~~~~~~~~~") loss,accuracy = model.evaluate(test_x,test_y) print('\ntest loss:',loss) print('\ntest accuracy:', accuracy) def model_fit(model): hist=model.fit(train_x,train_y, epochs=100, 
batch_size=128,validation_split=0.1,shuffle=True,callbacks=[reduce_lr,early_stop]) #训练2大批,每批32个 plt.plot(hist.history['loss'], label='train_loss') plt.plot(hist.history['val_loss'], label='val_loss') plt.legend() plt.show() return model Adam_lr=0.001 Min_delta=Adam_lr*1.0/10 reduce_lr =ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=0, mode='auto',min_delta=Min_delta, cooldown=0, min_lr=0) early_stop=EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto') model_E_Fasttext = Fasttext() model_name="model_E_Fasttext_zi" model_E_Fasttext=model_fit(model_E_Fasttext) model_E_Fasttext.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_Fasttext,test_x,test_y) print("E_Fasttext 分类器保存成功") model_E_TextCNN = TextCNN() model_name="model_E_TextCNN_zi" model_E_TextCNN=model_fit(model_E_TextCNN) model_E_TextCNN.save("save_model/"+model_name,include_optimizer=False) model_evaluate(model_E_TextCNN,test_x,test_y) print("TextCNN 保存成功")
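The manual left-padding used above (prepending zeros up to max_len) silently misbehaves when a sequence is longer than max_len: nothing gets truncated and the rows end up with unequal lengths. A minimal sketch of the same step with Keras' own padding utility, assuming the integer-indexed train_x/test_x lists and max_len built above:

# Sketch: left-pad and left-truncate the integer sequences with Keras instead of
# the manual [0]*(max_len-len(ls))+ls loop.
from keras.preprocessing.sequence import pad_sequences

def pad_ids(seqs, length):
    # padding='pre' reproduces the zeros-in-front layout; truncating='pre' also
    # guards against sequences longer than `length`, which the manual loop does not.
    return pad_sequences(seqs, maxlen=length, padding='pre', truncating='pre', value=0)

# usage with the lists built above:
# train_x = pad_ids(train_x, max_len)
# test_x  = pad_ids(test_x,  max_len)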
get_dic_model
# -*- coding: utf-8 -*- from pandas import read_csv import re import datetime import jieba_fast as jieba import pickle '''============================================导入训练数据\验证数据\测试数据============================''' root1="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/train_set_450000.csv" root2="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" root3="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/test.tsv" traindata=read_csv(root1).values[:,:] traindata=traindata[:,1:] traindata_x=traindata[:,0] valdata=read_csv(root2).values[:,:] valdata=valdata[:,:] valdata_x=valdata[:,0] f = open(root3,'r',encoding='UTF-8') testdata_x = list() for line in open(root3,'r',encoding='UTF-8'): line = f.readline() line=line.replace("\n","") testdata_x.append(line) f.close() #合并数据成500w,进行下面的训练。 dataset_x_all=[] for i in range(len(testdata_x)): dataset_x_all.append(testdata_x[i]) #dataset_x_all=testdata_x for i in range(len(traindata_x)): dataset_x_all.append(traindata_x[i]) for i in range(len(valdata_x)): dataset_x_all.append(valdata_x[i]) all_ci=[] size=120 embedding_dim=size '''=================================================读取停用词=======================================''' stpwrdlst = {} stopword_path="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt" fstop = open(stopword_path, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================分词模块============================''' jieba.load_userdict('D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/easy_ciku5.txt') #载入词库 pre_time=datetime.datetime.now() train_all_x_ci=[] for line in dataset_x_all: wenben=line results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) wenben=jieba.lcut(results,cut_all=False) for ci in wenben: if ci in stpwrdlst or ci.isspace() or ci==' ' or ci=='\x08': wenben.remove(ci) train_all_x_ci.append(wenben) all_ci.extend(wenben) post_time=datetime.datetime.now() print("训练集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) # 337.0s '''=======================================建立词库===========================================''' all_ci=list(set(all_ci)) # 手动建立字典 ci_dic={} kk=1 for ci in all_ci: ci_dic[ci]=kk kk+=1 # 把字典存到本地 with open('save_model/CI_dic.pkl', 'wb') as f: pickle.dump(ci_dic, f, pickle.HIGHEST_PROTOCOL) print("词库字典保存成功!") '''====================================word2vec模型构建模块====================================''' pre_time=datetime.datetime.now() from gensim.models import word2vec model = word2vec.Word2Vec(sentences=train_all_x_ci, #sentences可以是分词列表,也可以是大语料 size=size,#特征向量的维度 alpha=0.04,#学习率 window=35,#一个句子内,当前词和预测词之间的最大距离 文本(window)大小:skip-gram通常在10附近,CBOW通常在5附近 min_count=0,#最低词频 没有大的变化 max_vocab_size=None, sample=0.0001, #随机下采样的阈值 seed=1,#随机数种子 workers=10,#进程数 min_alpha=0.00001,#学习率下降的最小值 sg=1, #训练算法的选择,sg=1,采用skip-gram,sg=0,采用CBOW---skip-gram(慢、对罕见字有利)vs CBOW(快) hs=1,# hs=1,采用hierarchica·softmax,hs=0,采用negative sampling #分层softmax(对罕见字有利)vs 负采样(对常见词和低纬向量有利) negative=0,#这个值大于0,使用negative sampling去掉'noise words'的个数(通常设置5-20);为0,不使用negative sampling #cbow_mean=1,#为0,使用词向量的和,为1,使用均值;只适用于cbow的情况 iter = 80,#迭代次数 null_word = 0, trim_rule = None, #裁剪词汇规则,使用None(会使用最小min_count) sorted_vocab =1,#对词汇降序排序 batch_words = 8192,#训练时,每一批次的单词数量 compute_loss = False, callbacks = ()) 
model.save("save_model/W2V_CI.model") # 保存模型 post_time=datetime.datetime.now() print("word2vec模型训练保存结束,时间为: ",(post_time-pre_time).seconds*1.0)#1106.0s '''======================================得到max_len===========================================''' max_len=0 train_x=[] for line in train_all_x_ci: ls=[] for ci in line: ls.append(ci_dic[ci]) max_len=max(max_len,len(ls)) train_x.append(ls) print("max_len是 ",max_len) '''===================================保存标签Label_dic和Re_Label_ci_dic========================''' pre_time=datetime.datetime.now() train_y=[] for i in range(len(traindata)): ls=[] label=str(traindata[i][1])+"--"+str(traindata[i][2])+"--"+str(traindata[i][3]) ls.append(label) train_y.append(ls) for i in range(len(valdata)): ls=[] label=str(valdata[i][1])+"--"+str(valdata[i][2])+"--"+str(valdata[i][3]) ls.append(label) train_y.append(ls) Label={} label_number=[] k=0 for la in train_y: la=la[0] if la not in Label: Label[la]=k k+=1 label_number.append(Label[la]) k=len(Label) with open('save_model/Label.pkl', 'wb') as f: pickle.dump(Label, f, pickle.HIGHEST_PROTOCOL) print("Label字典保存成功!") Re_Label=dict((map(reversed, Label.items()))) with open('save_model/Re_Label.pkl', 'wb') as f: pickle.dump(Re_Label, f, pickle.HIGHEST_PROTOCOL) print("Re_Label字典保存成功!") print("max_len是 ",max_len)
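A quick sanity check of the artifacts this script saves can catch index/vector mismatches before the training scripts use them. A sketch; the probe word '手机' is only an example and may not exist in your vocabulary:

# Sketch: verify that W2V_CI.model and CI_dic.pkl agree.
import pickle
from gensim.models import word2vec

w2v = word2vec.Word2Vec.load("save_model/W2V_CI.model")
with open('save_model/CI_dic.pkl', 'rb') as f:
    ci_dic = pickle.load(f)

probe = '手机'                          # example word, assumed to be in the corpus
if probe in w2v.wv and probe in ci_dic:
    print(ci_dic[probe])                # integer index used by the Embedding layer
    print(w2v.wv[probe].shape)          # should be (size,), i.e. (120,)
    print(w2v.wv.most_similar(probe, topn=5))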
get_zi_dic_model
# -*- coding: utf-8 -*- from pandas import read_csv import re import datetime import pickle '''============================================导入训练数据\验证数据\测试数据============================''' root1="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/train_set_450000.csv" root2="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" root3="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/test.tsv" traindata=read_csv(root1).values[:,:] traindata=traindata[:,1:] traindata_x=traindata[:,0] valdata=read_csv(root2).values[:,:] valdata=valdata[:,:] valdata_x=valdata[:,0] f = open(root3,'r',encoding='UTF-8') testdata_x = list() for line in open(root3,'r',encoding='UTF-8'): line = f.readline() line=line.replace("\n","") testdata_x.append(line) f.close() #合并数据成500w,进行下面的训练。 dataset_x_all=[] for i in range(len(testdata_x)): dataset_x_all.append(testdata_x[i]) for i in range(len(traindata_x)): dataset_x_all.append(traindata_x[i]) for i in range(len(valdata_x)): dataset_x_all.append(valdata_x[i]) all_ci=[] size=100 embedding_dim=size '''=================================================读取停用词=======================================''' stpwrdlst = {} stopword_path="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt" fstop = open(stopword_path, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================分字模块============================''' pre_time=datetime.datetime.now() train_all_x_ci=[] for line in dataset_x_all: wenben=line results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) ls=[] for zi in results: if (zi!=' ') and (zi not in stpwrdlst) and zi.isspace()==False: ls.append(zi) train_all_x_ci.append(ls) all_ci.extend(ls) post_time=datetime.datetime.now() print("训练集分字结束,时间为: ",(post_time-pre_time).seconds*1.0) '''=======================================建立ZI库===========================================''' all_ci=list(set(all_ci)) # 手动建立字典 ci_dic={} kk=1 for ci in all_ci: ci_dic[ci]=kk kk+=1 # 把字典存到本地 with open('save_model/ZI_dic.pkl', 'wb') as f: pickle.dump(ci_dic, f, pickle.HIGHEST_PROTOCOL) print("词库字典保存成功!") '''====================================word2vec模型构建模块====================================''' pre_time=datetime.datetime.now() from gensim.models import word2vec model = word2vec.Word2Vec(sentences=train_all_x_ci, #sentences可以是分词列表,也可以是大语料 size=size,#特征向量的维度 alpha=0.025,#学习率 window=14,#一个句子内,当前词和预测词之间的最大距离 min_count=0,#最低词频 没有大的变化 max_vocab_size=None,# sample=0.0001, #随机下采样的阈值 seed=1,#随机数种子 workers=10,#进程数 min_alpha=0.0001,#学习率下降的最小值 sg=1, #训练算法的选择,sg=1,采用skip-gram,sg=0,采用CBOW hs=1,# hs=1,采用hierarchica·softmax,hs=0,采用negative sampling negative=10,#这个值大于0,使用negative sampling去掉'noise words'的个数(通常设置5-20);为0,不使用negative sampling #cbow_mean=1,#为0,使用词向量的和,为1,使用均值;只适用于cbow的情况 iter = 5,#迭代次数 null_word = 0, trim_rule = None, #裁剪词汇规则,使用None(会使用最小min_count) sorted_vocab =1,#对词汇降序排序 batch_words = 10000,#训练时,每一批次的单词数量 compute_loss = False, callbacks = ()) model.save("save_model/W2V_ZI.model") # 保存模型 post_time=datetime.datetime.now() print("word2vec模型训练保存结束,时间为: ",(post_time-pre_time).seconds*1.0) '''======================================得到max_len===========================================''' max_len=0 train_x=[] for line in train_all_x_ci: ls=[] for ci in line: ls.append(ci_dic[ci]) 
max_len=max(max_len,len(ls)) train_x.append(ls) print("max_len是 ",max_len)
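The printed max_len here is the absolute longest character sequence, whereas the character-level training script hard-codes max_len=88. A hedged sketch of deriving such a cap from the length distribution instead; the 99th percentile is an illustrative choice, not a value from the original code:

# Sketch: choose the padding length from the length distribution rather than the
# absolute maximum, so a handful of very long titles do not inflate the input width.
import numpy as np

lengths = [len(ls) for ls in train_x]                        # train_x built in the loop above
print("absolute max:", max(lengths))
print("99th percentile:", int(np.percentile(lengths, 99)))   # illustrative cut-off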
out_put_result
# -*- coding: utf-8 -*-
# out_put_result.py: run the word-level models on the validation set and dump each
# model's class probabilities to a csv file for DE_Ensemble.py to consume.
from pandas import read_csv
import re
import datetime
import jieba_fast as jieba
from keras.utils import np_utils
import numpy as np
import pandas as pd
import pickle
import keras
from keras.models import load_model

pre_time = datetime.datetime.now()
size = 120
max_len = 47
Adam_lr = 0.001  # learning rate used when compiling model_E_lstm below
path_of_fstop = 'D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt'
path_of_test_data = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv"
path_of_ciku = 'D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/easy_ciku5.txt'
#path_of_500_test = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/test.tsv"

# Load the word dictionary
with open('save_model/CI_dic.pkl', 'rb') as f:
    ci_dic = pickle.load(f)
# Load the label mappings
with open('save_model/Re_Label.pkl', 'rb') as f:
    Re_Label = pickle.load(f)
with open('save_model/Label.pkl', 'rb') as f:
    Label = pickle.load(f)

# Each load_model call overwrites `model`, so only the model loaded LAST is evaluated
# and exported below; put the model you want last (or comment out the others) and rerun.
model = load_model('save_model/model_E_Bi_lstm')
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])  # Adamax
model_name = "model_E_Bi_lstm"
model = load_model('save_model/model_E_Fasttext')
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
model_name = "model_E_Fasttext"
model = load_model('save_model/model_E_lstm')
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(lr=Adam_lr, beta_1=0.9, beta_2=0.999,
                                              epsilon=None, decay=0.0, amsgrad=False),
              metrics=['accuracy'])
model_name = "model_E_lstm"
model = load_model('save_model/model_E_TextCNN')
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001), metrics=['acc'])
model_name = "model_E_TextCNN"

'''================================================= Load stop words ======================================='''
stpwrdlst = {}
fstop = open(path_of_fstop, encoding='utf-8', errors='ignore')
for eachWord in fstop:
    eachWord = eachWord.strip("\n")
    stpwrdlst[eachWord] = eachWord
fstop.close()
print("导入停用词成功!!!")

'''============================================ Load the validation data ============================'''
root = path_of_test_data
testdata = read_csv(root).values[:, :]
print("导入数据成功!!!")

'''============================================ Word segmentation ============================'''
jieba.load_userdict(path_of_ciku)  # load the user dictionary for jieba
pre_time = datetime.datetime.now()
testci = []
for line in testdata:
    wenben = line[0]
    results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+', ' ', wenben)
    # filter stop words with a rebuilt list (removing items while iterating skips neighbours)
    wenben = [ci for ci in jieba.lcut(results, cut_all=False)
              if ci not in stpwrdlst and not ci.isspace()]
    testci.append(wenben)
post_time = datetime.datetime.now()
print("验证集分词结束,时间为: ", (post_time - pre_time).seconds * 1.0)

# Build the ground-truth labels for the validation set.
# The separator must be the "--" used when Label.pkl was built; "-" raises a KeyError.
k = len(Label)
testy = testdata[:, 1:]
test_y = []
for line in testy:
    label = str(line[0]) + "--" + str(line[1]) + "--" + str(line[2])
    test_y.append([label])
test_y2 = []
for la in test_y:
    test_y2.append(Label[la[0]])
test_y2 = np.array(test_y2)
test_y = np_utils.to_categorical(test_y2, num_classes=k)

# Build the model input
test_x = []
for line in testci:
    ls = []
    for ci in line:
        if ci in ci_dic:
            ls.append(ci_dic[ci])
    test_x.append(ls)
test_x2 = []
for ls in test_x:
    test_x2.append([0] * (max_len - len(ls)) + ls)
test_x = np.array(test_x2)

loss, accuracy = model.evaluate(test_x, test_y)
print(accuracy)
predict = model.predict(test_x)
output = pd.DataFrame(predict)
output.to_csv(model_name + ".csv")  # DE_Ensemble.py reads these files as .csv
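With the script above, getting all four probability files still means editing which load_model call runs last. A sketch of looping over the word-level models instead; file names as saved by train_model, compile arguments as used there:

# Sketch: evaluate and export every word-level model in one pass.
import keras
import pandas as pd
from keras.models import load_model

word_models = {
    "model_E_Fasttext": dict(optimizer='nadam'),
    "model_E_Bi_lstm":  dict(optimizer='nadam'),
    "model_E_lstm":     dict(optimizer=keras.optimizers.Adam(lr=0.001)),
    "model_E_TextCNN":  dict(optimizer=keras.optimizers.Adam(lr=0.0001)),
}

for name, opts in word_models.items():
    m = load_model('save_model/' + name)
    m.compile(loss='categorical_crossentropy', metrics=['acc'], **opts)
    loss, acc = m.evaluate(test_x, test_y)
    print(name, acc)
    pd.DataFrame(m.predict(test_x)).to_csv(name + ".csv")  # DE_Ensemble reads these .csv files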
out_put_result_zi
# -*- coding: utf-8 -*- output_result.py DE_Ensemble.py from pandas import read_csv import re import datetime import jieba_fast as jieba from keras.utils import np_utils import numpy as np import pandas as pd import pickle pre_time=datetime.datetime.now() import keras size=100 max_len=88 path_of_fstop='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/tool_packet_and_content/hlt_stop_words.txt' path_of_test_data="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv" path_of_ciku='D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/easy_ciku5.txt' # 载入字典 with open('save_model/ZI_dic.pkl', 'rb') as f: ci_dic=pickle.load(f) # 加载模型 with open('save_model/Re_Label.pkl', 'rb') as f: Re_Label=pickle.load(f) with open('save_model/Label.pkl', 'rb') as f: Label=pickle.load(f) from keras.models import load_model model = load_model('save_model/model_E_TextCNN_zi') model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0001),metrics=['acc']) model_name="model_E_TextCNN_zi" model = load_model('save_model/model_E_Fasttext_zi') model.compile(loss='categorical_crossentropy', optimizer='nadam',metrics=['acc']) model_name="model_E_Fasttext_zi" '''=================================================读取停用词=======================================''' stpwrdlst = {} fstop = open(path_of_fstop, encoding='utf-8', errors='ignore') for eachWord in fstop: eachWord=eachWord.strip("\n") stpwrdlst[eachWord] = eachWord fstop.close() print("导入停用词成功!!!") '''============================================导入训练数据和验证数据模块============================''' root=path_of_test_data testdata=read_csv(root).values[:,:] print("导入数据成功!!!") '''============================================分词模块============================''' '''jieba分词载入词库''' jieba.load_userdict(path_of_ciku) #载入词库 pre_time=datetime.datetime.now() testci=[] for line in testdata: wenben=line[0] results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+',' ',wenben) ls=[] for zi in results: if (zi!=' ') and (zi not in stpwrdlst) and zi.isspace()==False: ls.append(zi) testci.append(ls) post_time=datetime.datetime.now() print("训练集分词结束,时间为: ",(post_time-pre_time).seconds*1.0) #建立测试数据的标签对照 k=len(Label) testy=testdata[:,1:] test_y=[] for line in testy: ls=[] label=str(line[0])+"--"+str(line[1])+"--"+str(line[2]) ls.append(label) test_y.append(ls) test_y2=[] for la in test_y: la=la[0] test_y2.append(Label[la]) test_y2=np.array(test_y2) test_y = np_utils.to_categorical(test_y2,num_classes=k) # 建立测试输入数据 test_x=[] for line in testci: ls=[] for ci in line: if ci in ci_dic: ls.append(ci_dic[ci]) test_x.append(ls) test_x2=[] for ls in test_x: test_x2.append([0]*(max_len-len(ls))+ls) test_x=np.array(test_x2) loss,accuracy = model.evaluate(test_x,test_y) print(accuracy) predict = model.predict(test_x) output=pd.DataFrame(predict) output.to_csv(model_name+".csv")
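out_put_result, out_put_result_zi and DE_Ensemble all rebuild the '类别1--类别2--类别3' key by hand, which makes it easy for a stray single '-' to slip in and break the lookup against Label.pkl. A small shared helper keeps the separator in one place; the helper name is not from the original code:

# Sketch: build the label key in one place so every script uses the same "--" separator
# that Label.pkl was created with.
def make_label_key(row):
    # row carries the three category columns, e.g. line[0], line[1], line[2]
    return "--".join(str(x) for x in row[:3])

# usage: test_y.append([make_label_key(line)]) instead of concatenating by hand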
DE_Ensemble
# -*- coding: utf-8 -*-
# DE_Ensemble.py: use differential evolution (DE) to search, on the validation set,
# for the blending weights of the six models' probability outputs.
from pandas import read_csv
import numpy as np
import pickle

path_of_true_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/original_data/validation_set_50000.csv"
path_of_fasttext_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_Fasttext.csv"
path_of_bi_lstm_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_Bi_lstm.csv"
path_of_lstm_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_lstm.csv"
path_of_textcnn_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_TextCNN.csv"
path_of_fasttext_zi_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_Fasttext_zi.csv"
path_of_textcnn_zi_result = "D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/Competition/model_E_TextCNN_zi.csv"

# Ground-truth labels; the separator must match the "--" used in Label/Re_Label.
true_result = read_csv(path_of_true_result).values[:, 1:]
test_y = []
for line in true_result:
    label = str(line[0]) + "--" + str(line[1]) + "--" + str(line[2])
    test_y.append([label])

# Per-model probability outputs on the validation set (first column is the DataFrame index).
fasttext_result = np.array(read_csv(path_of_fasttext_result).values[:, 1:])
fasttext_zi_result = np.array(read_csv(path_of_fasttext_zi_result).values[:, 1:])
textcnn_result = np.array(read_csv(path_of_textcnn_result).values[:, 1:])
textcnn_zi_result = np.array(read_csv(path_of_textcnn_zi_result).values[:, 1:])
bi_lstm_result = np.array(read_csv(path_of_bi_lstm_result).values[:, 1:])
lstm_result = np.array(read_csv(path_of_lstm_result).values[:, 1:])

# Reverse label dictionary (index -> "类别1--类别2--类别3")
with open('save_model/Re_Label.pkl', 'rb') as f:
    Re_Label = pickle.load(f)


def evaluation(truedata, foredata):
    k = 0
    length = len(truedata)
    for i in range(length):
        if truedata[i][0] == foredata[i]:
            k += 1
    return k * 1.0 / length


def blend_predict(weights):
    '''Weighted soft vote over the six models, then argmax; returns the accuracy.'''
    spring_population = []
    for j in range(len(fasttext_result)):
        spring = (weights[0] * fasttext_result[j]
                  + weights[1] * textcnn_result[j]
                  + weights[2] * bi_lstm_result[j]
                  + weights[3] * lstm_result[j]
                  + weights[4] * fasttext_zi_result[j]
                  + weights[5] * textcnn_zi_result[j])
        spring_population.append(spring.tolist())
    spring_population = np.argmax(spring_population, axis=1)
    Y_pre = [Re_Label[idx] for idx in spring_population]
    return evaluation(test_y, Y_pre)


population = []


def DE_function():
    # Initialise inv_num individuals, each a vector of 6 weights in [0, 1)
    population2 = []
    population3 = []
    pop_acc = []
    inv_num = 50
    iter_num = 30  # number of DE generations
    for i in range(inv_num):
        ind = [np.random.random() for _ in range(6)]
        population.append(ind)
        population2.append(list(ind))
        population3.append(list(ind))

    # Fitness of every individual = accuracy of the weighted blend
    for i in range(inv_num):
        res = blend_predict(population[i])
        print(res)
        pop_acc.append(res)

    # ************************************************************************
    # Differential evolution: mutation -> crossover -> selection, iter_num generations
    while iter_num > 0:
        population2 = []
        population3 = []
        for i in range(inv_num):
            inv = [population[i][j] for j in range(6)]
            population2.append(list(inv))
            population3.append(list(inv))
        iter_num -= 1

        # Mutation
        for i in range(inv_num):
            r1 = r2 = r3 = 0
            while r1 == i or r2 == i or r3 == i or r2 == r1 or r3 == r1 or r3 == r2:
                r1 = np.random.randint(0, inv_num)  # random integer in [0, inv_num-1]
                r2 = np.random.randint(0, inv_num)
                r3 = np.random.randint(0, inv_num)
            for j in range(6):
                population2[i][j] = population2[r1][j] + 0.1 * (population2[r2][j] - population2[r3][j])
                if population2[i][j] > 1 or population2[i][j] < 0:  # keep weights inside [0, 1]
                    population2[i][j] = np.random.random()

        # Crossover
        for i in range(inv_num):
            for j in range(6):
                r1 = np.random.randint(0, 6)
                r2 = np.random.random()
                if r2 < 0.5 or r1 == j:
                    population3[i][j] = population2[i][j]
                else:
                    population3[i][j] = population[i][j]
        print("第" + str(iter_num) + "代交叉操作结束")

        # Selection: evaluate the trial vector population3[i] and keep it if it beats the parent
        print("开始进行择优操作")
        for i in range(inv_num):
            acc1 = blend_predict(population3[i])
            print(acc1, population3[i])
            if acc1 > pop_acc[i]:
                pop_acc[i] = acc1
                for j in range(6):
                    population[i][j] = population3[i][j]

    # Pick the fittest individual
    best_id = 0
    acc2 = -100000
    for i in range(inv_num):
        if pop_acc[i] > acc2:
            acc2 = pop_acc[i]
            best_id = i
    print(acc2)
    return (acc2, population[best_id][0], population[best_id][1], population[best_id][2],
            population[best_id][3], population[best_id][4], population[best_id][5])


acc, w1, w2, w3, w4, w5, w6 = DE_function()
print("last acc")
print(acc)
print(w1)
print(w2)
print(w3)
print(w4)
print(w5)
print(w6)
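The fitness used by DE_function is a weighted soft vote: each model's probability row is scaled by its weight, the rows are summed, and argmax picks the class. A tiny worked example with two models and three classes; the numbers are made up, purely illustrative:

# Illustrative only: weighted soft-voting over two models and three classes.
import numpy as np

p_model_a = np.array([0.2, 0.5, 0.3])   # made-up probability rows
p_model_b = np.array([0.6, 0.1, 0.3])
w_a, w_b = 0.9, 0.4                      # candidate weights from one DE individual

blended = w_a * p_model_a + w_b * p_model_b
print(blended)                  # [0.42, 0.49, 0.39]
print(int(np.argmax(blended)))  # class 1 wins under these weights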
test_function
import re
import datetime
#import jieba_fast as jieba
import jieba
import numpy as np
import pickle
import keras
from keras.models import load_model

pre_time = datetime.datetime.now()
size = 120
max_len = 47        # input length of the word-level models
max_len_zi = 88     # input length of the character-level models (same value as train_model_zi)
path_of_fstop = 'static/other_need_file/hlt_stop_words.txt'
path_of_ciku = 'static/other_need_file/easy_ciku5.txt'

# Load the dictionaries
with open('static/other_need_file/CI_dic.pkl', 'rb') as f:
    ci_dic = pickle.load(f)
# Character dictionary for the *_zi models; the path is an assumption, mirroring
# where this script keeps its other artifacts.
with open('static/other_need_file/ZI_dic.pkl', 'rb') as f:
    zi_dic = pickle.load(f)
# Load the label mappings
with open('static/other_need_file/Re_Label.pkl', 'rb') as f:
    Re_Label = pickle.load(f)
with open('static/other_need_file/Label.pkl', 'rb') as f:
    Label = pickle.load(f)

'''================================================= Load stop words ======================================='''
stpwrdlst = {}
fstop = open(path_of_fstop, encoding='utf-8', errors='ignore')
for eachWord in fstop:
    eachWord = eachWord.strip("\n")
    stpwrdlst[eachWord] = eachWord
fstop.close()
print("导入停用词成功!!!")

'''============================================ Segmentation and prediction ============================'''
def class_good(testdata_x):
    '''testdata_x: a list of raw product-title strings; returns {title: predicted label}.'''
    jieba.load_userdict(path_of_ciku)  # load the user dictionary for jieba
    pre_time = datetime.datetime.now()
    testci = []      # word-level sequences
    testzi = []      # character-level sequences
    for line in testdata_x:
        wenben = line  # each element is the full title string
        results = re.sub('[’!"【】★#$%&\'!!+/·、()一“”,。()!《》!!!*+,-./\::;<=>??@[][\\]^_`{|}~]+', ' ', wenben)
        # word level: segment and filter stop words with a rebuilt list
        # (removing items while iterating skips neighbours)
        cis = [ci for ci in jieba.lcut(results, cut_all=False)
               if ci not in stpwrdlst and not ci.isspace()]
        testci.append(cis)
        # character level: filter character by character
        zis = [zi for zi in results if zi != ' ' and zi not in stpwrdlst and not zi.isspace()]
        testzi.append(zis)
    post_time = datetime.datetime.now()
    print("分词结束,时间为: ", (post_time - pre_time).seconds * 1.0)

    # Build the model inputs (word indices for the word models, char indices for the *_zi models)
    pre_time = datetime.datetime.now()
    test_x = []
    for seq in testci:
        ls = [ci_dic[ci] for ci in seq if ci in ci_dic]
        test_x.append([0] * (max_len - len(ls)) + ls)
    test_x = np.array(test_x)
    test_x_zi = []
    for seq in testzi:
        ls = [zi_dic[zi] for zi in seq if zi in zi_dic]
        test_x_zi.append([0] * (max_len_zi - len(ls)) + ls)
    test_x_zi = np.array(test_x_zi)
    post_time = datetime.datetime.now()
    print("建立测试输入数据,时间为: ", (post_time - pre_time).seconds * 1.0)

    # The models are loaded here so that predict() is called right after load_model.
    Adam_lr = 0.001
    keras.backend.clear_session()
    model_E_Fasttext = load_model('save_model/model_E_Fasttext')
    model_E_Fasttext.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    model_E_Bi_lstm = load_model('save_model/model_E_Bi_lstm')
    model_E_Bi_lstm.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    model_E_lstm = load_model('save_model/model_E_lstm')
    model_E_lstm.compile(loss='categorical_crossentropy',
                         optimizer=keras.optimizers.Adam(lr=Adam_lr, beta_1=0.9, beta_2=0.999,
                                                         epsilon=None, decay=0.0, amsgrad=False),
                         metrics=['accuracy'])
    model_E_TextCNN = load_model('save_model/model_E_TextCNN')
    model_E_TextCNN.compile(loss='categorical_crossentropy',
                            optimizer=keras.optimizers.Adam(lr=0.0001), metrics=['acc'])
    model_E_Fasttext_zi = load_model('save_model/model_E_Fasttext_zi')
    model_E_Fasttext_zi.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    model_E_TextCNN_zi = load_model('save_model/model_E_TextCNN_zi')
    model_E_TextCNN_zi.compile(loss='categorical_crossentropy',
                               optimizer=keras.optimizers.Adam(lr=0.0001), metrics=['acc'])

    # Per-model predictions: word models take test_x, character models take test_x_zi
    predict_model_E_Fasttext = model_E_Fasttext.predict(test_x)
    predict_model_E_Bi_lstm = model_E_Bi_lstm.predict(test_x)
    predict_model_E_lstm = model_E_lstm.predict(test_x)
    predict_model_E_TextCNN = model_E_TextCNN.predict(test_x)
    predict_model_E_Fasttext_zi = model_E_Fasttext_zi.predict(test_x_zi)
    predict_model_E_TextCNN_zi = model_E_TextCNN_zi.predict(test_x_zi)

    # Weighted blend with the weights found by DE_Ensemble
    predict = []
    for i in range(len(predict_model_E_Fasttext)):
        pre1 = predict_model_E_Fasttext[i] * 0.8726803
        pre2 = predict_model_E_TextCNN[i] * 0.9982083
        pre3 = predict_model_E_Bi_lstm[i] * 0.1621914
        pre4 = predict_model_E_lstm[i] * 0.9603058
        pre5 = predict_model_E_Fasttext_zi[i] * 0.9507958
        pre6 = predict_model_E_TextCNN_zi[i] * 0.7029668
        pred = pre1 + pre2 + pre3 + pre4 + pre5 + pre6
        predict.append(pred.tolist())

    # Map the argmax indices back to label strings
    predict2 = np.argmax(predict, axis=1)
    Y_pre = []
    for j in range(len(testdata_x)):
        Y_pre.append(Re_Label[predict2[j]])
    print("预测完成!结果保存在Y_pre里")
    ind = {}
    for i in range(len(testdata_x)):
        if i < 5:
            print(Y_pre[i])
        ind[testdata_x[i]] = Y_pre[i]
    return ind
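A usage sketch for class_good, assuming it is fed an iterable of raw product-title strings; the sample titles are invented. It returns a dict mapping each title to its predicted '类别1--类别2--类别3' label:

# Usage sketch (titles are invented examples):
titles = ["苹果 iPhone 8 Plus 64GB 深空灰", "南极人 男士纯棉短袖T恤"]
result = class_good(titles)
for title, label in result.items():
    print(title, "->", label)   # label looks like "类别1--类别2--类别3"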