Overview
- Preface
- Functions and demonstrations
- Reading a local txt file
- Loading stop words into the list stopwords
- Tokenizing and counting word frequencies
- Tokenization results
- Word cloud
- Changing the word cloud shape
- Chapter-by-chapter statistics
- Counting occurrences of 玄德 in each chapter
- Counting "曹贼", "大耳贼", "美髯公", and "汉贼"
- The rise and fall of the Three Kingdoms
- Average paragraph and character counts
- Social network
- An alternative layout
- Getting the code
Preface
This project uses Python's jieba library to run a word-frequency analysis on Romance of the Three Kingdoms (三国演义), and visualizes the results with the WordCloud, networkx, and matplotlib libraries.
Functions and demonstrations
Reading a local txt file
def getText(filepath):
    f = open(filepath, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    return text  # return the full text
Loading stop words into the list stopwords
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
Tokenizing and counting word frequencies
import jieba

def wordFreq(filepath, text, topn):
    # tokenize the text with jieba's lcut()
    words = jieba.lcut(text.strip())
    count = {}  # word -> frequency
    stopwords = stopwordslist(r'C:\Users\Hasee\Desktop/my_stop_words.txt')
    for word in words:
        if len(word) == 1 or word in stopwords:  # skip single characters and stop words
            continue
        count[word] = count.get(word, 0) + 1
    # sort by frequency, descending, and write the top n words to <name>_词频统计.txt
    # (the counting loop and file output were missing from the excerpt and are reconstructed here)
    items = list(count.items())
    items.sort(key=lambda x: x[1], reverse=True)
    with open(filepath[:-4] + '_词频统计.txt', 'w', encoding='utf-8') as f:
        for word, freq in items[:topn]:
            f.write('{}\t{}\n'.format(word, freq))
Tokenization results
filepath = r'C:\Users\Hasee\Desktop/三国演义.txt'
wordFreq(filepath, getText(filepath), 200)
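A quick way to inspect the output (this assumes wordFreq wrote tab-separated word/count lines, as sketched above):

with open(r'C:\Users\Hasee\Desktop/三国演义_词频统计.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines()[:10]:  # the ten most frequent words
        print(line.strip())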
As you can see, 曹操 (Cao Cao) is without a doubt the real protagonist (tongue in cheek).
Word cloud
import wordcloud

f1 = open(r'C:\Users\Hasee\Desktop/三国演义_词频统计.txt', 'r', encoding='utf-8')
text = f1.read()
f1.close()
wcloud = wordcloud.WordCloud(background_color='white', width=1000, max_words=500, height=400, margin=2, font_path=r'C:\Windows\Fonts/SIMHEI.TTF').generate(text)
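The original post shows the rendered image here; a minimal way to display or save the cloud with matplotlib (the output filename is just an example):

import matplotlib.pyplot as plt

plt.imshow(wcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
wcloud.to_file(r'C:\Users\Hasee\Desktop/三国演义_词云.png')  # example output path

The shaped cloud in the next section can be displayed the same way.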
Changing the word cloud shape
# build the word cloud with a custom mask shape
from imageio import imread  # assumption: the excerpt never shows where imread comes from; imageio works here (scipy.misc.imread was the old choice)

f1 = open(r'C:\Users\Hasee\Desktop/三国演义_词频统计.txt', 'r', encoding='utf-8')
text = f1.read()
f1.close()
bg_pic = imread(r'C:\Users\Hasee\Desktop/star.png')
wcloud = wordcloud.WordCloud(mask=bg_pic, background_color='white', width=800, max_words=800, height=400, margin=2, font_path=r'C:\Windows\Fonts/SIMHEI.TTF').generate(text)
Chapter-by-chapter statistics
# split the text into chapters
import re

f = open(r'C:\Users\Hasee\Desktop/三国演义.txt', 'r', encoding='utf-8')
s = f.read()
f.close()
# extract the chapter titles ("第...回")
lst_chapter = []
chapter = re.findall(r"第[\u4E00-\u9FA5]+回", s)
# deduplicate the titles while keeping their order
for x in chapter:
    if x not in lst_chapter:
        lst_chapter.append(x)
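The loops below index into lst_chapterindex, which this excerpt never builds; a minimal reconstruction that pairs each chapter's start offset with the next chapter's start (the pairing logic is an assumption, but it is what the slicing below expects):

# start offset of each chapter title in the full text
lst_start = [s.index(x) for x in lst_chapter]
# each chapter ends where the next one begins; the last runs to the end of the text
lst_chapterindex = list(zip(lst_start, lst_start[1:] + [len(s)]))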
Counting occurrences of 玄德 in each chapter
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # so matplotlib can render the Chinese labels

cnt_xuande1 = []
cnt_xuande2 = []
for ii in range(120):
    start = lst_chapterindex[ii][0]
    end = lst_chapterindex[ii][1]
    cnt_xuande1.append(s[start:end].count('玄德'))
    cnt_xuande2.append(s[start:end].count('刘备'))
# plot the combined counts of both names per chapter
plt.figure(figsize=(20, 10))
plt.plot(list(range(len(cnt_xuande1))), np.array(cnt_xuande1) + np.array(cnt_xuande2), 'r')
plt.xticks(list(range(len(cnt_xuande1))), rotation=-90)
plt.xlabel('章节数')
plt.ylabel('出现次数')
plt.title('刘备出现次数')
plt.show()
It seems that after entrusting his heir at Baidicheng (白帝城托孤), Liu Bei is seldom mentioned again. Truly, "the surging Yangtze rolls ever east."
Counting "曹贼", "大耳贼", "美髯公", and "汉贼"
cnt_cz, cnt_dez, cnt_sb, cnt_hz = [], [], [], []
for ii in range(120):
    start = lst_chapterindex[ii][0]
    end = lst_chapterindex[ii][1]
    cnt_cz.append(s[start:end].count('曹贼'))
    cnt_dez.append(s[start:end].count('大耳贼'))
    cnt_sb.append(s[start:end].count('美髯公'))
    cnt_hz.append(s[start:end].count('汉贼'))
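The excerpt omits the plot that the next remark refers to; a minimal sketch that overlays the four per-chapter counts (colors and labels are my choice):

plt.figure(figsize=(20, 10))
plt.plot(cnt_cz, 'r', label='曹贼')
plt.plot(cnt_dez, 'g', label='大耳贼')
plt.plot(cnt_sb, 'b', label='美髯公')
plt.plot(cnt_hz, 'y', label='汉贼')
plt.xlabel('章节数')
plt.ylabel('出现次数')
plt.legend()
plt.show()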
曹贼 and 汉贼 are still the most common.
The rise and fall of the Three Kingdoms
# renamed the lists: the original reused cnt_cz/cnt_dez/... from the previous section without clearing them
cnt_shu, cnt_wei, cnt_wu, cnt_han = [], [], [], []
for ii in range(120):
    start = lst_chapterindex[ii][0]
    end = lst_chapterindex[ii][1]
    cnt_shu.append(s[start:end].count('蜀'))
    cnt_wei.append(s[start:end].count('魏'))
    cnt_wu.append(s[start:end].count('东吴'))
    cnt_han.append(s[start:end].count('汉'))
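As before, the plot itself is missing from the excerpt; the same matplotlib pattern applies (a sketch):

plt.figure(figsize=(20, 10))
for cnt, label in [(cnt_shu, '蜀'), (cnt_wei, '魏'), (cnt_wu, '东吴'), (cnt_han, '汉')]:
    plt.plot(cnt, label=label)
plt.xlabel('章节数')
plt.ylabel('出现次数')
plt.legend()
plt.show()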
Boss Cao's foundation really was the strongest.
Average paragraph and character counts
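The original leaves this section's code out; a minimal sketch of one way to compute it, assuming paragraphs are newline-separated (both that definition and the variable names are my assumptions):

cnt_para = []  # paragraphs per chapter
cnt_char = []  # characters per chapter
for start, end in lst_chapterindex:
    chap = s[start:end]
    cnt_para.append(len([p for p in chap.split('\n') if p.strip()]))
    cnt_char.append(len(chap))
print('average paragraphs per chapter:', sum(cnt_para) / len(cnt_para))
print('average characters per chapter:', sum(cnt_char) / len(cnt_char))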
Social network
Here we take the 30 characters that appear most often (one way to build the Names list is sketched below).
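Names is used below but never defined in the excerpt; a plausible reconstruction from the frequency file, assuming its top entries have been hand-cleaned so that they really are person names:

Names = []
with open(r'C:\Users\Hasee\Desktop/三国演义_词频统计.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines()[:30]:
        Names.append(line.split()[0])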
relations = {}
lst_para = s.split('\n')  # split into paragraphs; assume names appearing in the same paragraph co-occur
for text in lst_para:
    for name1 in Names:
        for name2 in Names:
            if name1 in text and name2 in text and name1 != name2 and (name2, name1) not in relations:
                relations[(name1, name2)] = relations.get((name1, name2), 0) + 1
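The graph G and the edge groups elarge/emidle/esmall used below are built somewhere outside the excerpt; a minimal reconstruction (the weight thresholds 100 and 30 are arbitrary illustrative values):

import networkx as nx

G = nx.Graph()
for (name1, name2), weight in relations.items():
    G.add_edge(name1, name2, weight=weight)
# bucket the edges by co-occurrence count so they can be styled differently
elarge = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] > 100]
emidle = [(u, v) for u, v, d in G.edges(data=True) if 30 < d['weight'] <= 100]
esmall = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] <= 30]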
An alternative layout
pos = nx.circular_layout(G)
# node and edge styles
nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=800)
nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9, edge_color='g')
nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6, edge_color='y')
nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4, edge_color='b', style='dashed')
nx.draw_networkx_labels(G, pos, font_family='SimHei')  # label nodes with the character names
plt.axis('off')
plt.show()