Problems to solve?
Downloading a plain-text document from the web
Tokenization
Creating an nltk.Text object
Locating the start and end of the content
Downloading HTML
Parsing HTML
Filtering out irrelevant content
Reading local files
Regular expressions
Finding words ending in ed
Word puzzle: 8 letters, 3rd letter j, 6th letter t
Phone-keypad (T9) input matching
The + operator in regular expressions
Extracting character blocks
Finding word stems
Searching tokenized text
Normalizing text
Stemmers
Lemmatization
Segmentation
Lists and strings
Strings and formatting
Lining things up
--------------------------------------------------------------------------------------------------------------------------
# coding: utf-8
# In[1]:
from __future__ import division # Python 2 users only
import nltk, re, pprint
from nltk import word_tokenize
from nltk.data import PathPointer, ZipFilePathPointer, find
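# word_tokenize (used throughout) relies on the Punkt tokenizer models; if
# NLTK raises a LookupError, download them once first (network access assumed):
# nltk.download('punkt')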
# In[2]:
# Download a plain-text document from the web
from urllib import urlopen   # Python 3: from urllib.request import urlopen
url = "http://www.gutenberg.org/files/2554/2554.txt"
response = urlopen(url)
raw = response.read().decode('utf8')
print type(raw)
print len(raw)
print raw[:75]
# In[3]:
# Tokenization
tokens = word_tokenize(raw)
print type(tokens)
print len(tokens)
print tokens[:10]
# In[4]:
# Create an nltk.Text object
text = nltk.Text(tokens)
print type(text)
print text[1024:1062]
text.collocations()   # collocations() prints its result and returns None
# In[10]:
# Locate the start and end of the actual content
print raw.find("PART I")
print raw.rfind("End of Project Gutenberg's Crime")
raw = raw[5338:1157746]
# raw=raw[raw.find("PART I"):raw.rfind("End of Project Gutenberg's Crime")]
print raw.find("PART I")
# In[5]:
# Download HTML
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read().decode('utf8')
html[:60]
# In[13]:
print html
# In[7]:
# Parse HTML
from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'lxml').get_text()
tokens = word_tokenize(raw)
tokens
# In[35]:
bs = BeautifulSoup(html, 'lxml')
print bs.find("div",class_='bodytext').get_text()
# In[8]:
# Filter out irrelevant content
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')
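# In[ ]:
# concordance() also takes width and lines keyword arguments (part of the
# standard nltk.Text API), handy for narrower output:
text.concordance('gene', width=60, lines=5)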
# In[9]:
print text
# In[14]:
# Read a local file
f = open('document.txt')   # raises IOError if document.txt is not in the current directory
# In[15]:
f = open('d:/data/document.txt')
f.read()
# In[16]:
f = open('d:/data/document.txt','rU')
for line in f:
    print(line.strip())
# In[13]:
raw = open('d:/data/document.txt').read()
print type(raw)
tokens = word_tokenize(raw)
print type(tokens)
words = [w.lower() for w in tokens]
print type(words)
vocab = sorted(set(words))
print type(vocab)
# In[28]:
vocab.append('blog')
raw.append('blog')   # fails: strings have no append() method (AttributeError)
# In[29]:
query = 'Who knows?'
beatles = ['john', 'paul', 'george', 'ringo']
query + beatles   # fails: cannot concatenate a string and a list (TypeError)
# In[24]:
# Unicode characters
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
f = path.open(encoding='latin2')
for line in f:
    line = line.strip()
    print(line)
# In[46]:
f = path.open()   # no encoding given, so the Latin-2 bytes print garbled
for line in f:
    line = line.strip()
    print(line)
# In[47]:
ord('a')
# In[48]:
a = u'\u0061'
a
# In[49]:
print a
# In[18]:
ord(u'ń')
# In[20]:
nacute = u'\u0144'
nacute
# In[21]:
nacute.encode('utf8')
# In[22]:
print nacute.encode('utf8')
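# In[ ]:
# Encoding and decoding are inverses, so a UTF-8 round trip recovers the
# original unicode string:
print nacute.encode('utf8').decode('utf8') == nacute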
# In[25]:
import unicodedata
lines = path.open(encoding='latin2').readlines()
line = lines[2]
print(line.encode('unicode_escape'))
for c in line:
    if ord(c) > 127:
        print('%s U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c)))
# In[26]:
line.find(u'zosta\u0142y')   # the pattern must be a unicode literal in Python 2
line = line.lower()
line
# In[27]:
line.encode('unicode_escape')
# In[76]:
import re
m = re.search(u'\u015b\w*', line)
m.group()
# In[77]:
word_tokenize(line)
# In[28]:
# Regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# In[79]:
# Find words ending in ed
[w for w in wordlist if re.search('ed$', w)]
# In[29]:
# Word puzzle: 8 letters, 3rd letter is j, 6th letter is t
[w for w in wordlist if re.search('^..j..t..$', w)]
# In[81]:
# Phone-keypad (T9) input: words spelled by the key sequence 4-6-5-3 (e.g. 'gold', 'golf', 'hole')
[w for w in wordlist if re.search('^[ghi][mno][jkl][def]$', w)]
# In[30]:
# The + operator in regular expressions
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
[w for w in chat_words if re.search('^m+i+n+e+$', w)]
# In[31]:
[w for w in chat_words if re.search('^[ha]+$', w)]
# In[32]:
wsj = sorted(set(nltk.corpus.treebank.words()))
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)]
# In[84]:
[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]
# In[33]:
[w for w in wsj if re.search('^[0-9]{4}$', w)]
# In[34]:
[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]
# In[87]:
[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]
# In[35]:
[w for w in wsj if re.search('(ed|ing)$', w)]
# In[36]:
# Extract character blocks
word = 'supercalifragilisticexpialidocious'
print re.findall(r'[aeiou]', word)
print len(re.findall(r'[aeiou]', word))
# In[90]:
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common(12)
# In[91]:
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
# In[38]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()
# In[39]:
cv_word_pairs = [(cv, w) for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
print cv_index['su']
print cv_index['po']
# In[40]:
# Find word stems
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
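# In[ ]:
# A quick check of the suffix-stripping stem() defined above:
print [stem(w) for w in ['processing', 'happily', 'generously']]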
# In[95]:
re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
# In[96]:
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
# In[97]:
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
# In[98]:
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# In[99]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# In[100]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language')
# In[41]:
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
[stem(t) for t in tokens]
# In[102]:
# Search tokenized text
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")   # Text.findall prints its matches directly
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*> <.*> <bro>")
chat.findall(r"<l.*>{3,}")
# In[103]:
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
# In[42]:
### Normalizing text ###
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
# In[43]:
# Stemmers
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens]
# In[45]:
[lancaster.stem(t) for t in tokens]
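# In[ ]:
# The two stemmers can disagree: Porter maps 'lying' to 'lie', while
# Lancaster leaves it unchanged (and truncates more aggressively elsewhere).
print porter.stem('lying'), lancaster.stem('lying')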
# In[108]:
# Lemmatization
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens]
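# In[ ]:
# The lemmatizer assumes noun part-of-speech by default; passing pos='v'
# treats the token as a verb:
print wnl.lemmatize('lying'), wnl.lemmatize('lying', pos='v')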
# In[121]:
#### Segmentation ####
# In[123]:
len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())
# In[124]:
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])
# In[125]:
# Word segmentation
# In[126]:
def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i+1
    words.append(text[last:])
    return words
# In[127]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
segment(text, seg1)
# In[128]:
segment(text, seg2)
# In[134]:
def evaluate(text, segs):
    # objective: word count of the segmented text plus the size of the
    # lexicon (unique words joined by spaces); smaller scores are better
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size
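# In[ ]:
# Sanity check of the objective (a sketch): splitting 'doyousee' into
# 'doyou' + 'see' gives text_size 2 and a lexicon 'doyou see' of length 9,
# so the score should be 11.
print evaluate('doyousee', '0000100')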
# In[136]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
print evaluate(text, seg3)
print evaluate(text, seg2)
print evaluate(text, seg1)
# In[50]:
from random import randint

def flip(segs, pos):
    return segs[:pos] + str(1-int(segs[pos])) + segs[pos+1:]

def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0, len(segs)-1))
    return segs

def anneal(text, segs, iterations, cooling_rate):
    # non-deterministic search: flip random boundary bits, keep improvements,
    # and reduce the amount of flipping as the temperature cools
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature, 0)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print evaluate(text, segs), segment(text, segs)
    print
    return segs
# In[151]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)
# In[152]:
#### Lists and strings ####
# In[155]:
# From list to string
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
print ' '.join(silly)
print ';'.join(silly)
print ''.join(silly)
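# In[ ]:
# split() undoes ' '.join() here, so the round trip recovers the list:
print ' '.join(silly).split() == silly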
# In[51]:
# Strings and formatting
word = 'cat'
sentence = """hello
world"""
print(word)
print(sentence)
# In[52]:
word
# In[159]:
sentence
# In[53]:
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in sorted(fdist):
    print word, '->', fdist[word], ';',   # trailing comma keeps everything on one line
# In[54]:
for word in sorted(fdist):
    print '%s->%d;' % (word, fdist[word]),
# In[55]:
'%s->%d;' % ('cat', 3)
# In[56]:
'%s->%d;' % 'cat'   # fails: the format string expects two arguments (TypeError)
# In[57]:
'%s->' % 'cat'
# In[58]:
'%d' % 3
# In[59]:
'I want a %s right now' % 'coffee'
# In[60]:
print '%s wants a %s %s' % ('Lee', 'sandwich', 'for lunch')
# In[67]:
template = 'Lee wants a %s right now'
menu = ('sandwich', 'spam fritter', 'pancake')
for snack in menu:
    print template % snack
# In[69]:
# Lining things up
'%6s' % 'dog'
# In[179]:
'%-6s' % 'dog'
# In[70]:
width = 6
'%-*s' % (width, 'dog')
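# In[ ]:
# str.ljust() and str.rjust() are an alternative to %-style padding:
print "'" + 'dog'.ljust(width) + "'"
print "'" + 'dog'.rjust(width) + "'"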
# In[181]:
count, total = 3205, 9375
"accuracy for %d words: %2.4f%%" % (total, 100 * count / total)
# In[183]:
def tabulate(cfdist, words, categories):
    print '%-16s' % 'Category',
    for word in words:                              # column headings
        print '%6s' % word,
    print                                           # end the heading row
    for category in categories:
        print '%-16s' % category,                   # row heading
        for word in words:                          # for each word
            print '%6d' % cfdist[category][word],   # print table cell
        print                                       # end the row
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)
# In[71]:
output_file = open('output.txt', 'w')
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
for word in sorted(words):
    output_file.write(word + "\n")
# In[185]:
len(words)
# In[186]:
str(len(words))
# In[72]:
output_file.write(str(len(words)) + "\n")
output_file.close()
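# In[ ]:
# Read the file back as a quick check; the last line written is the
# vocabulary size (assumes output.txt from the cells above).
print open('output.txt').read().split('\n')[-2]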
# In[ ]: