import jieba

# Load the stopword list from chinsesstoptxt.txt
def stopwordslist():
    stopwords = [line.strip() for line in open('chinsesstoptxt.txt', encoding='UTF-8').readlines()]
    return stopwords

# Segment a sentence into Chinese words and drop stopwords
def seg_depart(sentence):
    # Segment one line of the document with jieba
    print("Segmenting...")
    sentence_depart = jieba.cut(sentence.strip())
    # Load the stopword list
    stopwords = stopwordslist()
    # Accumulate the result in outstr
    outstr = ''
    # Remove stopwords
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr

# Input and output file paths
filename = "Init.txt"
outfilename = "out.txt"
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')

# Write the processed result to out.txt
for line in inputs:
    line_seg = seg_depart(line)
    outputs.write(line_seg + '\n')
    print("------------------- Segmenting and removing stopwords -----------")
outputs.close()
inputs.close()
print("Stopword removal and segmentation finished!")