jieba fine-grained segmentation / add_word has no effect / forcing a split
import jieba

def fenci(one_string):
    one_string = one_string.replace(" ", "")  # str.replace removes every space in one call

    def isAllZh(s):  # True if every character is a CJK ideograph
        for c in s:
            if not ('\u4e00' <= c <= '\u9fa5'):
                return False
        return True

    final_result = []
    for word in jieba.lcut(one_string):
        if not isAllZh(word):  # drop tokens that are not pure Chinese
            continue
        # A multi-character token that the dictionary does not know
        # (get_FREQ returns None or 0), or any token longer than three
        # characters, is deleted from the in-memory dictionary so that
        # re-cutting it is forced to produce finer pieces.
        if (len(word) > 1 and not jieba.get_FREQ(word)) or len(word) > 3:
            jieba.del_word(word)  # force: mutates jieba's global dictionary
            final_result.extend(jieba.lcut(word))
        else:
            final_result.append(word)
    return final_result
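A quick usage sketch, following the definition above (the sample sentence is arbitrary, and since fenci calls jieba.del_word, the in-memory dictionary stays modified for the rest of the process):

sentence = '我们中出了一个叛徒'
print(jieba.lcut(sentence))  # default cut, for comparison
print(fenci(sentence))       # forced fine-grained cut; exact tokens depend on your dictionary version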
In practice, the result seems about the same as simply cutting with HMM=False:
print(jieba.lcut('丰田太省了', HMM=False))
print(jieba.lcut('我们中出了一个叛徒', HMM=False))
print(jieba.lcut('丰田太省了', HMM=True))
print(jieba.lcut('我们中出了一个叛徒', HMM=True))
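For completeness: the knob jieba itself documents for forcing a split is suggest_freq, which retunes the joined word's frequency instead of deleting the entry outright. A minimal sketch reusing the demo sentence above (whether the output actually changes depends on the dictionary shipped with your jieba version):

import jieba

jieba.suggest_freq(('中', '出'), tune=True)  # discourage joining 中 and 出 into one word
print(jieba.lcut('我们中出了一个叛徒', HMM=False))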