python对文本进行分析和数据可视化,主要运用到了jieba,worldcloudmatplotlib,nxwworkx,pandas库,其他库在代码中给出。
1.首先准备好这三本名著
2.准备好停词词库

代码如下:

import matplotlib.pyplot as plt  
import matplotlib  
import networkx as nx  
import tkinter as tk  
import tkinter.ttk as ttk  
import pandas as pd  
matplotlib.rcParams['font.sans-serif']=['SimHei']

class Text:
###################获取词频###########################################
#读取文本文件数据
def __getText(self):
with open(self.path,“r”,encoding=“UTF-8”) as f:
self.__text=f.read()

#获取停用词  
def __stopwordslist(self):                     
    self.stopwords=[line.strip() for line in open(self.stoppath,'r',encoding="UTF-8").readlines()]  
      
#获取词频                                 
def __wordFreq(self,filepath,topn,text):        
    words=jieba.lcut(text.strip())  
    counts={}  
    self.__stopwordslist()     
    for word in words:  
        if len(word)==1:  #删除长度为1的字符  
            continue  
        elif word not in self.stopwords:  
            if word=="凤姐儿":  
                word="凤姐"  
            elif word=="林黛玉" or word=="林妹妹" or word=="黛玉笑":  
                word="黛玉"  
            elif word=="宝二爷":  
                word="宝玉"  
            elif word=="袭道人":  
                word="袭人"  
            counts[word]=counts.get(word,0)+1  
    items=list(counts.items())  
    items.sort(key=lambda x:x[1],reverse=True)           #获得词频  
    with open(filepath[:-4]+"_词频.txt","w") as f:  
        for i in range(topn):  
            word,count=items[i]  
            f.writelines("{}\t{}\n".format(word, count))  
              
##################画人物出场频率图############################  
def __drawbar(self):  
    with open(self.path1,"r") as f:  
        self.name=f.readlines()  
    self.dic={}  
    for item in self.name:  
        item=item.split("\t")  
        self.dic[item[0]]=int(item[1])  
    x=[]          #用来存储人物名称  
    y=[]          #用来存储出场次数  
    for item in self.dic:             
        x.append(item)  
        y.append(self.dic[item])  
  ##用pandas库画图###  
    dic={}  
    dic["名字"]=x  
    dic["次数"]=y  
    df=pd.DataFrame(dic)  
    df=df.set_index("名字")   
    df.plot(kind='bar',title=self.val1.get()+"人物出场频率图",figsize=(15,15))  
    plt.savefig(self.val1.get()+"人物出场频率图.jpg")  
    plt.show()  
      
####################################开始画词云############   
def __wordCloud(self,path):      
    plt.figure(figsize=(15,15))               
    with open (path,"r") as f:  
        text=f.read()  
        wcloud=wordcloud.WordCloud(background_color="white",width=1000,max_words=500,font_path=r"C:\Windows\Fonts\simhei.ttf",  
                                   height=860,margin=2).generate(text)  
        wcloud.to_file("{}cloud.png".format(self.val1.get()))  
        plt.imshow(wcloud)  
        plt.axis('off')  
        plt.show()  
                          
###################找到文本中的人物关系################################  
def __getrelations(self):  
      
    if self.val1.get()=='红楼梦':  
        self.Names=["宝玉","凤姐","贾母","王夫人","老太太","袭人","贾琏","平儿","宝钗","薛姨妈",  
                "探春","鸳鸯","贾政","晴雯","湘云","刘姥姥","邢夫人","贾珍","紫鹃","香菱",  
                "尤氏","薛蟠","贾赦"]  
    if self.val1.get()=='水浒传':  
        self.Names=["宋江","李逵","吴用","公孙胜","关胜","林冲","秦明","呼延","花荣","柴进",  
                    "燕青","朱仝","鲁智深","武松","董平","秦明","李俊","卢俊义","晁盖","戴宗"]  
    if self.val1.get()=="三国演义":  
        self.Names=["刘备","刘禅","关公","张飞","赵云","诸葛亮","徐遮","马良","黄忠","玄德","曹丕","孙权"  
                    ,"司马懿","周瑜","孔明","卢布","周瑜","袁绍","马超","魏延","姜维","马岱","庞德"]  
          
    f=open(self.path,'r',encoding="UTF-8")  
    s=f.read()  
    self.relations={}  
    self.lst_para=s.split('\n')  #安段落划分  
    for text in self.lst_para:  
        for name1 in self.Names:  
            if name1 in text:  
                for name2 in self.Names:  
                    if name2 in text and name1 !=name2 and (name2,name1) not in self.relations:  
                        self.relations[(name1,name2)]=self.relations.get((name1,name2),0)+1  
    self.maxRela=max([v for k,v in self.relations.items()]) #取最大共现次数  
    self.relations={k:v/self.maxRela for k,v in self.relations.items()}      
      
###########################画出人物关系图###################################################  
def __getmap(self):  
    plt.figure(figsize=(15,15))  
    self.G=nx.Graph()  
    #根据relations的数据想G中添加边  
    for k,v in self.relations.items():  
        self.G.add_edge(k[0],k[1],weight=v)  
    #筛选权重大于0.6的边  
    self.elarge=[(u,v) for (u,v,d) in self.G.edges(data=True) if d['weight']>0.6]  
    #筛选权重大于0.3但小于0.6的边  
    self.emidle=[(u,v) for (u,v,d) in self.G.edges(data=True) if (d['weight']>0.3) & (d['weight']<=0.6)]  
    #筛选权重小于0.3的边  
    self.esmall=[(u,v) for (u,v,d) in self.G.edges(data=True) if d['weight']<=0.3]  
    #设置图形布局  
    self.pos=nx.circular_layout(self.G)  
    nx.draw_networkx_nodes(self.G,self.pos,alpha=0.6,node_size=800)  
    #alpha是透明度,width是连接线的宽度  
    nx.draw_networkx(self.G, self.pos,edgelist=self.elarge,width=2.5,alpha=0.9,edge_color='g')  
    nx.draw_networkx(self.G, self.pos,edgelist=self.emidle,width=1.5,alpha=0.6,edge_color='y')  
    nx.draw_networkx(self.G, self.pos,edgelist=self.esmall,width=1,alpha=0.2,edge_color='b',style='dashed')  
    nx.draw_networkx_labels(self.G,self.pos,font_size=12)  
    plt.axis('off')  
    plt.title("{}主要人物关系网络".format(self.val1.get()))  
    plt.show()  
      
 ################上面函数的调用#####################     
def __getvar(self,event):  
    self.path=self.val1.get()+".txt"            #文本路径  
    self.__getText()     #读取文本文件数据   
    self.__wordFreq(self.path,20,self.__text)  #获取20个词的词频  
    self.path1=self.val1.get()+'_词频.txt'      #词频路径  
    self.__wordCloud(self.path1)  #画词频云图  
    self.__getrelations()                       #获得文本中人物关系  
    self.__getmap()                             #画出人物关系图  
    self.__drawbar()  
      
  #####################构造函数,GUI界面的设计#####################    
def __init__(self):          
    self.window=tk.Tk()  
    self.window.geometry("400x400")  
    self.window.title("文本选择器")  
    self.txt=["红楼梦","水浒传","三国演义"]  
    self.val1=tk.StringVar()  
    self.cb=ttk.Combobox(self.window,textvariable=self.val1)  
    self.cb['value']=self.txt  
    self.cb.current()          
    self.cb.place(x=100,y=60)  
    self.cb.bind("<<ComboboxSelected>>",self.__getvar)  
    self.stoppath="C:/Users/Alison/Desktop/PyExp3/stop_words.txt"   #停词文本路径  
    self.lab1=tk.Label(self.window,text="请选择你想处理的文本",font="楷体").place(x=100,y=30)  
    self.window.mainloop()

def main():
txt=Text()

if “name”==main():
main()



  • 运行结果如下:
  • python 红楼么 python红楼梦文本分析_python


  • python 红楼么 python红楼梦文本分析_python 红楼么_02


  • python 红楼么 python红楼梦文本分析_爬虫_03


  • python 红楼么 python红楼梦文本分析_python_04

  • 因为需要分析整个文本,分析,话云图等问题,其实程序运行得还是比较慢的,这是一个没有改进的地方,希望大家多多指教!!



python对文本进行分析和数据可视化,主要运用到了jieba,worldcloudmatplotlib,nxwworkx,pandas库,其他库在代码中给出。
1.首先准备好这三本名著
2.准备好停词词库

代码如下:

import matplotlib.pyplot as plt  
import matplotlib  
import networkx as nx  
import tkinter as tk  
import tkinter.ttk as ttk  
import pandas as pd  
matplotlib.rcParams['font.sans-serif']=['SimHei']