A quick note to self.
Just run the code in the order 1, 2, 3, 4:
1.py (corpus_segment.py)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: corpus_segment.py
@time: 2017/2/5 15:28
@software: PyCharm
"""
import sys
import os
import jieba
# Configure a UTF-8 output environment (Python 2 only)
reload(sys)
sys.setdefaultencoding('utf-8')
# Save content to a file
def savefile(savepath, content):
    with open(savepath, "wb") as fp:
        fp.write(content)
'''
The two lines above use the with statement, added in Python 2.6, which saves
the tedious close() and try handling for files.
Python 2.5 needs: from __future__ import with_statement
Newcomers can learn more from this link: http://zhoutall.com/archives/325
'''
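# For reference (an illustrative aside, not in the original script), the with
# statement above is shorthand for roughly this:
#   fp = open(savepath, "wb")
#   try:
#       fp.write(content)
#   finally:
#       fp.close()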
# Read a file
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path is the path of the unsegmented corpus
    seg_path is the path where the segmented corpus is stored
    '''
    catelist = os.listdir(corpus_path)  # get all subdirectories under corpus_path
    '''
    The subdirectory names are the category names, e.g. in
    train_corpus/art/21.txt, 'train_corpus/' is corpus_path and 'art' is one
    member of catelist
    '''

    # Get all files under each directory (category)
    for mydir in catelist:
        '''
        Here mydir is the 'art' in train_corpus/art/21.txt (i.e. one of the categories in catelist)
        '''
        class_path = corpus_path + mydir + "/"  # path of the category subdirectory, e.g. train_corpus/art/
        seg_dir = seg_path + mydir + "/"  # corresponding output directory for the segmented text, e.g. train_corpus_seg/art/

        if not os.path.exists(seg_dir):  # create the segmentation directory if it does not exist yet
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)  # all texts of one category in the unsegmented corpus
        '''
        For the files in train_corpus/art/, i.e.
        21.txt,
        22.txt,
        23.txt
        ...
        this gives file_list = ['21.txt', '22.txt', ...]
        '''
        for file_path in file_list:  # iterate over all files in the category directory
            fullname = class_path + file_path  # full file path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # read the file content
            '''At this point content holds every character of the original text, including
            extra spaces, blank lines, carriage returns and so on. Next we strip all of
            these harmless characters, leaving compact text separated only by punctuation.
            '''
            content = content.replace("\r\n", "")  # remove line breaks
            content = content.replace(" ", "")  # remove blank lines and extra spaces
            content_seg = jieba.cut(content)  # segment the file content
            savefile(seg_dir + file_path, " ".join(content_seg))  # save the processed file to the segmented-corpus directory

    print "Finished segmenting the Chinese corpus!!!"

'''
If you do not understand the line if __name__ == "__main__":, you can read
this article: http://imoyao.lofter.com/post/3492bc_bd0c4ce
In short: when other Python files call the functions of this file, or when
this file is imported as a module into your project, the code below will not
execute. Only when you run this file on its own from the command line, or from
an IDE such as PyCharm, does the code below run. In other words, this part of
the code amounts to a functional test.
'''
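# Illustrative sketch (not part of the original script): after
#   import corpus_segment
# from another module, __name__ inside this file is "corpus_segment" and the
# block below is skipped; running "python corpus_segment.py" directly makes
# __name__ == "__main__", so the block executes.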
if __name__ == "__main__":
    # Segment the training set
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train/"  # unsegmented corpus path
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"  # segmented corpus path, output of this program
    corpus_segment(corpus_path, seg_path)

    # Segment the test set
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/answer/"  # unsegmented corpus path
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"  # segmented corpus path, output of this program
    corpus_segment(corpus_path, seg_path)
2.py (corpus2Bunch.py)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: corpus2Bunch.py
@time: 2017/2/7 7:41
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import os  # built-in Python package for file and directory operations; we will use os.listdir
import cPickle as pickle  # import the cPickle package under the alias pickle
'''
Python actually has another package that is also called pickle; the name clash
does not matter here. On cPickle versus pickle, see the author's other post:
"python核心模块之pickle和cPickle讲解"

The code in this file uses cPickle's dump function below.
'''
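# Note (added for context, not in the original post): in Python 3 this would
# simply be "import pickle"; the C implementation was folded into the standard
# pickle module there.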
from sklearn.datasets.base import Bunch
# You need not study this deeply; just remember that this is how the Bunch data
# structure is imported. Later posts will cover sklearn in more detail.
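# Illustrative aside (not in the original post): Bunch is essentially a dict
# that also allows attribute access, e.g.
#   b = Bunch(label=[], contents=[])
#   b.label.append("art")   # b.label is the same object as b["label"]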

def _readfile(path):
    '''Read a file'''
    # The leading underscore marks the function as private.
    # This is only a naming convention, nothing is enforced:
    # code outside the module can still call it.
    # It merely improves readability.
    with open(path, "rb") as fp:  # the with/as syntax was covered earlier and will not be commented on again
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # get all subdirectories under seg_path, i.e. the category information
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    '''
    extend(addlist) is a Python list method that extends the original list
    with a new list (addlist)
    '''
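    # Example (illustrative, not in the original post): given a = [1, 2],
    #   a.extend([3, 4])  ->  a becomes [1, 2, 3, 4]    (the elements are merged in)
    #   a.append([3, 4])  ->  a becomes [1, 2, [3, 4]]  (the whole list is added as one element)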
    # Get all files under each directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # path of the category subdirectory
        file_list = os.listdir(class_path)  # get all files under class_path
        for file_path in file_list:  # iterate over the files in the category directory
            fullname = class_path + file_path  # full file path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))  # read the file content
            '''append(element) is a Python list method that appends element to the original list; note the difference from extend() shown above'''
    # Store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print "Finished building the text objects!!!"

if __name__ == "__main__":  # explained above; it will not be commented on again
    # Bunch-ify the training set:
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"  # Bunch storage path, program output
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"  # segmented corpus path, program input
    corpus2Bunch(wordbag_path, seg_path)

    # Bunch-ify the test set:
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"  # Bunch storage path, program output
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"  # segmented corpus path, program input
    corpus2Bunch(wordbag_path, seg_path)
3.py (TFIDF_space.py)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: TFIDF_space.py
@time: 2017/2/8 11:39
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from sklearn.datasets.base import Bunch
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):

    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})

    if train_tfidf_path is not None:
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    _writebunchobj(space_path, tfidfspace)
    print "tf-idf vector space instance created!!!"

if __name__ == '__main__':

    # stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"  # input file
    # bunch_path = "train_word_bag/train_set.dat"  # input file
    # space_path = "train_word_bag/tfdifspace.dat"  # output file
    # vector_space(stopword_path,bunch_path,space_path)
    #
    # bunch_path = "test_word_bag/test_set.dat"  # input file
    # space_path = "test_word_bag/testspace.dat"
    # train_tfidf_path="train_word_bag/tfdifspace.dat"
    # vector_space(stopword_path,bunch_path,space_path,train_tfidf_path)

    stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"  # input file

    train_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"  # input file
    space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # output file
    vector_space(stopword_path, train_bunch_path, space_path)

    train_tfidf_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # input file, generated above
    test_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"  # input file
    test_space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/testspace.dat"  # output file

    vector_space(stopword_path, test_bunch_path, test_space_path, train_tfidf_path)
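Why pass the training vocabulary when vectorizing the test set? A toy sketch (my own made-up documents, not from the original post) shows that reusing the fitted vocabulary keeps the columns of the test matrix aligned with those of the training matrix:

from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["cat sat mat", "dog sat log"]  # hypothetical pre-segmented texts
test_docs = ["cat dog mat"]

train_vec = TfidfVectorizer(sublinear_tf=True)
train_tdm = train_vec.fit_transform(train_docs)

# Reuse the fitted vocabulary so each test column refers to the same term as in training.
test_vec = TfidfVectorizer(sublinear_tf=True, vocabulary=train_vec.vocabulary_)
test_tdm = test_vec.fit_transform(test_docs)

assert train_tdm.shape[1] == test_tdm.shape[1]  # same feature dimension

Note that, as in vector_space above, fit_transform on the test set recomputes the idf weights from the test documents; calling train_vec.transform(test_docs) instead would reuse the training idf weights.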
4.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import cPickle as pickle
from sklearn.naive_bayes import MultinomialNB  # import the multinomial naive Bayes algorithm

# Read a bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Import the training set
trainpath = "../train_word_bag/tfidfspace.dat"
train_set = _readbunchobj(trainpath)

# Import the test set
testpath = "../test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

# Train the classifier on the bag-of-words vectors and class labels.
# alpha is the additive (Laplace/Lidstone) smoothing parameter of naive Bayes:
# smaller alpha means less smoothing and a closer fit to the training data.
# clf = MultinomialNB(alpha=0.1).fit(train_set.tdm, train_set.label)
######################################################
from sklearn.ensemble import RandomForestClassifier
print '*************************Random forest classifier***********************'
clf = RandomForestClassifier(oob_score=True, random_state=10)
clf.fit(train_set.tdm, train_set.label)
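# Because oob_score=True was set, an out-of-bag accuracy estimate is available
# after fitting (illustrative addition, not in the original script):
#   print clf.oob_score_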
# Predict the classification results
print '*************************Starting prediction************************'
predicted = clf.predict(test_set.tdm)
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    if flabel != expct_cate:
        print file_name, ": actual:", flabel, " --> predicted:", expct_cate
print "Prediction finished!!!"

# Compute the classification metrics:
from sklearn import metrics
def metrics_result(actual, predict):
    print 'precision:{0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted'))
    print 'recall:{0:0.3f}'.format(metrics.recall_score(actual, predict, average='weighted'))
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted'))
metrics_result(test_set.label, predicted)
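For per-class precision and recall, sklearn's classification_report could be appended after metrics_result (a small illustrative addition using the same variables, not part of the original script):

from sklearn.metrics import classification_report
print classification_report(test_set.label, predicted)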
This still uses the Fudan University news dataset.
Run output (a partial copy):
../test_corpus_seg/C37-Military/C37-Military008.txt : actual: C37-Military  --> predicted: C31-Enviornment
../test_corpus_seg/C37-Military/C37-Military031.txt : actual: C37-Military  --> predicted: C38-Politics
../test_corpus_seg/C37-Military/C37-Military105.txt : actual: C37-Military  --> predicted: C39-Sports
../test_corpus_seg/C37-Military/C37-Military101.txt : actual: C37-Military  --> predicted: C38-Politics
../test_corpus_seg/C37-Military/C37-Military006.txt : actual: C37-Military  --> predicted: C38-Politics
../test_corpus_seg/C37-Military/C37-Military125.txt : actual: C37-Military  --> predicted: C38-Politics
Prediction finished!!!
precision:0.786
recall:0.790
f1-score:0.773