Problems to solve?
Downloading a plain-text document from the web
Tokenization
Creating an nltk.Text object
Locating the start and end of the content
Downloading HTML
Parsing HTML
Filtering out irrelevant content
Reading local files
Regular expressions
Finding words ending in ed
Word puzzle: 8 letters, 3rd letter j, 6th letter t
Phone-keypad (T9) input matching
The + operator in regular expressions
Extracting character blocks
Finding word stems
Searching tokenized text
Normalizing text
Stemmers
Lemmatization
Segmentation
Lists and strings
Strings and formatting
Lining things up
--------------------------------------------------------------------------------------------------------------------------
# coding: utf-8
# In[1]:
from __future__ import division # Python 2 users only
import nltk, re, pprint
from nltk import word_tokenize
from nltk.data import PathPointer, ZipFilePathPointer, find
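# word_tokenize (used throughout) relies on the Punkt tokenizer models; if
# NLTK raises a LookupError, download them once first (network access assumed):
# nltk.download('punkt')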
# In[2]:
# Download a plain-text document from the web
from urllib import urlopen   # Python 3: from urllib.request import urlopen
url = "http://www.gutenberg.org/files/2554/2554.txt"
response = urlopen(url)
raw = response.read().decode('utf8')
print type(raw)
print len(raw)
print raw[:75]
# In[3]:
# Tokenization
tokens = word_tokenize(raw)
print type(tokens)
print len(tokens)
print tokens[:10]
# In[4]:
# Create an nltk.Text object
text = nltk.Text(tokens)
print type(text)
print text[1024:1062]
text.collocations()   # collocations() prints its result and returns None
# In[10]:
# Locate the start and end of the actual content
print raw.find("PART I")
print raw.rfind("End of Project Gutenberg's Crime")
raw = raw[5338:1157746]
# raw=raw[raw.find("PART I"):raw.rfind("End of Project Gutenberg's Crime")]
print raw.find("PART I")
# In[5]:
# Download HTML
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read().decode('utf8')
html[:60]
# In[13]:
print html
# In[7]:
# Parse HTML
from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'lxml').get_text()
tokens = word_tokenize(raw)
tokens
# In[35]:
bs = BeautifulSoup(html, 'lxml')
print bs.find("div",class_='bodytext').get_text()
# In[8]:
# Filter out irrelevant content
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')
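# In[ ]:
# concordance() also takes width and lines keyword arguments (part of the
# standard nltk.Text API), handy for narrower output:
text.concordance('gene', width=60, lines=5)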
# In[9]:
print text
# In[14]:
# Read a local file
f = open('document.txt')   # raises IOError if document.txt is not in the current directory
# In[15]:
f = open('d:/data/document.txt')
f.read()
# In[16]:
f = open('d:/data/document.txt','rU')
for line in f:
    print(line.strip())
# In[13]:
raw = open('d:/data/document.txt').read()
print type(raw)
tokens = word_tokenize(raw)
print type(tokens)
words = [w.lower() for w in tokens]
print type(words)
vocab = sorted(set(words))
print type(vocab)
# In[28]:
vocab.append('blog')
raw.append('blog')   # fails: strings have no append() method (AttributeError)
# In[29]:
query = 'Who knows?'
beatles = ['john', 'paul', 'george', 'ringo']
query + beatles   # fails: cannot concatenate a string and a list (TypeError)
# In[24]:
# Unicode characters
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
f = path.open(encoding='latin2')
for line in f:
    line = line.strip()
    print(line)
# In[46]:
f = path.open()   # no encoding given, so the Latin-2 bytes print garbled
for line in f:
    line = line.strip()
    print(line)
# In[47]:
ord('a')
# In[48]:
a = u'\u0061'
a
# In[49]:
print a
# In[18]:
ord(u'ń')
# In[20]:
nacute = u'\u0144'
nacute
# In[21]:
nacute.encode('utf8')
# In[22]:
print nacute.encode('utf8')
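# In[ ]:
# Encoding and decoding are inverses, so a UTF-8 round trip recovers the
# original unicode string:
print nacute.encode('utf8').decode('utf8') == nacute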
# In[25]:
import unicodedata
lines = path.open(encoding='latin2').readlines()
line = lines[2]
print(line.encode('unicode_escape'))
for c in line:
    if ord(c) > 127:
        print('%s U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c)))
# In[26]:
line.find(u'zosta\u0142y')   # the pattern must be a unicode literal in Python 2
line = line.lower()
line
# In[27]:
line.encode('unicode_escape')
# In[76]:
import re
m = re.search(u'\u015b\w*', line)
m.group()
# In[77]:
word_tokenize(line)
# In[28]:
# Regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# In[79]:
# Find words ending in ed
[w for w in wordlist if re.search('ed$', w)]
# In[29]:
# Word puzzle: 8 letters, 3rd letter is j, 6th letter is t
[w for w in wordlist if re.search('^..j..t..$', w)]
# In[81]:
# Phone-keypad (T9) input: words spelled by the key sequence 4-6-5-3 (e.g. 'gold', 'golf', 'hole')
[w for w in wordlist if re.search('^[ghi][mno][jkl][def]$', w)]
# In[30]:
# The + operator in regular expressions
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
[w for w in chat_words if re.search('^m+i+n+e+$', w)]
# In[31]:
[w for w in chat_words if re.search('^[ha]+$', w)]
# In[32]:
wsj = sorted(set(nltk.corpus.treebank.words()))
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)]
# In[84]:
[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]
# In[33]:
[w for w in wsj if re.search('^[0-9]{4}$', w)]
# In[34]:
[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]
# In[87]:
[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]
# In[35]:
[w for w in wsj if re.search('(ed|ing)$', w)]
# In[36]:
# Extract character blocks
word = 'supercalifragilisticexpialidocious'
print re.findall(r'[aeiou]', word)
print len(re.findall(r'[aeiou]', word))
# In[90]:
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common(12)
# In[91]:
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
# In[38]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()
# In[39]:
cv_word_pairs = [(cv, w) for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
print cv_index['su']
print cv_index['po']
# In[40]:
# Find word stems
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
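# In[ ]:
# A quick check of the suffix-stripping stem() defined above:
print [stem(w) for w in ['processing', 'happily', 'generously']]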
# In[95]:
re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
# In[96]:
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
# In[97]:
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
# In[98]:
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# In[99]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# In[100]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language')
# In[41]:
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
[stem(t) for t in tokens]
# In[102]:
# Search tokenized text
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")   # Text.findall prints its matches directly
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*> <.*> <bro>")
chat.findall(r"<l.*>{3,}")
# In[103]:
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
# In[42]:
### Normalizing text ###
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
# In[43]:
# Stemmers
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens]
# In[45]:
[lancaster.stem(t) for t in tokens]
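# In[ ]:
# The two stemmers can disagree: Porter maps 'lying' to 'lie', while
# Lancaster leaves it unchanged (and truncates more aggressively elsewhere).
print porter.stem('lying'), lancaster.stem('lying')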
# In[108]:
# Lemmatization
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens]
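# In[ ]:
# The lemmatizer assumes noun part-of-speech by default; passing pos='v'
# treats the token as a verb:
print wnl.lemmatize('lying'), wnl.lemmatize('lying', pos='v')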
# In[121]:
#### Segmentation ####
# In[123]:
len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())
# In[124]:
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])
# In[125]:
# Word segmentation
# In[126]:
def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i+1
    words.append(text[last:])
    return words
# In[127]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
segment(text, seg1)
# In[128]:
segment(text, seg2)
# In[134]:
def evaluate(text, segs):
    # objective: word count of the segmented text plus the size of the
    # lexicon (unique words joined by spaces); smaller scores are better
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size
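# In[ ]:
# Sanity check of the objective (a sketch): splitting 'doyousee' into
# 'doyou' + 'see' gives text_size 2 and a lexicon 'doyou see' of length 9,
# so the score should be 11.
print evaluate('doyousee', '0000100')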
# In[136]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
print evaluate(text, seg3)
print evaluate(text, seg2)
print evaluate(text, seg1)
# In[50]:
from random import randint

def flip(segs, pos):
    return segs[:pos] + str(1-int(segs[pos])) + segs[pos+1:]

def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0, len(segs)-1))
    return segs

def anneal(text, segs, iterations, cooling_rate):
    # non-deterministic search: flip random boundary bits, keep improvements,
    # and reduce the amount of flipping as the temperature cools
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature, 0)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print evaluate(text, segs), segment(text, segs)
    print
    return segs
# In[151]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)
# In[152]:
#### Lists and strings ####
# In[155]:
# From list to string
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
print ' '.join(silly)
print ';'.join(silly)
print ''.join(silly)
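# In[ ]:
# split() undoes ' '.join() here, so the round trip recovers the list:
print ' '.join(silly).split() == silly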
# In[51]:
# Strings and formatting
word = 'cat'
sentence = """hello
world"""
print(word)
print(sentence)
# In[52]:
word
# In[159]:
sentence
# In[53]:
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in sorted(fdist):
    print word, '->', fdist[word], ';',   # trailing comma keeps everything on one line
# In[54]:
for word in sorted(fdist):
    print '%s->%d;' % (word, fdist[word]),
# In[55]:
'%s->%d;' % ('cat', 3)
# In[56]:
'%s->%d;' % 'cat'   # fails: the format string expects two arguments (TypeError)
# In[57]:
'%s->' % 'cat'
# In[58]:
'%d' % 3
# In[59]:
'I want a %s right now' % 'coffee'
# In[60]:
print '%s wants a %s %s' % ('Lee', 'sandwich', 'for lunch')
# In[67]:
template = 'Lee wants a %s right now'
menu = ('sandwich', 'spam fritter', 'pancake')
for snack in menu:
    print template % snack
# In[69]:
# Lining things up
'%6s' % 'dog'
# In[179]:
'%-6s' % 'dog'
# In[70]:
width = 6
'%-*s' % (width, 'dog')
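# In[ ]:
# str.ljust() and str.rjust() are an alternative to %-style padding:
print "'" + 'dog'.ljust(width) + "'"
print "'" + 'dog'.rjust(width) + "'"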
# In[181]:
count, total = 3205, 9375
"accuracy for %d words: %2.4f%%" % (total, 100 * count / total)
# In[183]:
def tabulate(cfdist, words, categories):
    print '%-16s' % 'Category',
    for word in words:                              # column headings
        print '%6s' % word,
    print                                           # end the heading row
    for category in categories:
        print '%-16s' % category,                   # row heading
        for word in words:                          # for each word
            print '%6d' % cfdist[category][word],   # print table cell
        print                                       # end the row
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)
# In[71]:
output_file = open('output.txt', 'w')
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
for word in sorted(words):
    output_file.write(word + "\n")
# In[185]:
len(words)
# In[186]:
str(len(words))
# In[72]:
output_file.write(str(len(words)) + "\n")
output_file.close()
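# In[ ]:
# Read the file back as a quick check; the last line written is the
# vocabulary size (assumes output.txt from the cells above).
print open('output.txt').read().split('\n')[-2]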
# In[ ]: