# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------------------------------------------------
__Author__ = 'assasin'
__DateTime__ = '2020/1/4 19:23'
#-----------------------------------------------------------------------------------------------------------------------

from stop_words import readFile,seg_doc
# pip install scikit-learn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# 利用sklearn 计算tfidf值特征

def sklearn_tfidf_feature(corpus=None):
    """Compute and print TF-IDF weights for every document in *corpus*.

    Parameters
    ----------
    corpus : list[str] | None
        Documents given as whitespace-separated token strings, the input
        format ``CountVectorizer`` expects. ``None`` or empty is a no-op.

    For each document i, prints every vocabulary word together with its
    TF-IDF weight ``weight[i][j]``.
    """
    # Guard the documented default: sklearn would raise on an empty corpus.
    if not corpus:
        return
    # Build the vocabulary and the raw term-count matrix.
    vectorize = CountVectorizer()
    counts = vectorize.fit_transform(corpus)
    # Turn raw counts into TF-IDF weights.
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(counts)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement get_feature_names_out(), falling back for old versions.
    try:
        words = vectorize.get_feature_names_out()
    except AttributeError:  # scikit-learn < 1.0
        words = vectorize.get_feature_names()
    # Dense matrix: element [i][j] is the weight of word j in document i.
    weight = tfidf.toarray()
    for i in range(len(weight)):
        print(u"-----这里输出第",i,u"类文本的词语tf-idf权重")
        for j in range(len(words)):
            print(words[j],weight[i][j])

if __name__ == '__main__':
    # Segment two sample documents and join their tokens with spaces,
    # producing the whitespace-delimited strings CountVectorizer expects.
    doc_paths = [
        r'./datas/体育/11.txt',
        r'./datas/时政/339764.txt',
    ]
    corpus = [' '.join(seg_doc(readFile(path))) for path in doc_paths]
    # print(corpus)
    sklearn_tfidf_feature(corpus)