Python爬取网站内容并进行文字预处理(英文)
- 注:输出部分用省略号代替...
- 爬取网站
'''
import urllib.request

# Fetch the page; the context manager closes the HTTP connection once the
# body has been read (the bare urlopen() call never closed it).
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()  # raw response body as bytes

print(html)
'''
- 输出:
'''
b'\n\n\n\n \n \n\n
PHP: Hypertext Preprocessor\n\n \n \n <link rel="alternate" type="application/atom+xml" href="http://php.net/releases/feed.php" ...
'''
- 转换为干净文本
'''
import urllib.request

from bs4 import BeautifulSoup

# Download the raw HTML; the context manager closes the connection.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# The "html5lib" parser requires the html5lib package to be installed.
soup = BeautifulSoup(html, "html5lib")

# text: the page content with all markup removed.
text = soup.get_text(strip=True)
print(text)
'''
- 输出:
'''
PHP: Hypertext PreprocessorDownloadsDocumentationGet InvolvedHelpGetting StartedIntroductionA simple tutorialLanguage ReferenceBasic ......
'''
- 转换为tokens
'''
import urllib.request

from bs4 import BeautifulSoup

# Download the raw HTML; the context manager closes the connection.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# The "html5lib" parser requires the html5lib package to be installed.
soup = BeautifulSoup(html, "html5lib")

# text: the page content with all markup removed.
# NOTE: no separator is passed to get_text(), so text from adjacent elements
# is joined directly and some tokens below run together.
text = soup.get_text(strip=True)

# Convert the text into tokens by splitting on whitespace.
tokens = text.split()
print(tokens)
'''
- 输出:
'''
['PHP:', 'Hypertext', 'PreprocessorDownloadsDocumentationGet', 'InvolvedHelpGetting', 'StartedIntroductionA', 'simple', 'tutorialLanguage', 'ReferenceBasic', ...
'''
- 完整版 python爬取文字加分词预处理(英文)
'''
import urllib.request

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# The stop-word list must be downloaded before stopwords.words() can be used.
# Request only that corpus; a bare nltk.download() opens the interactive
# downloader instead.
nltk.download('stopwords')

# Download the raw HTML; the context manager closes the connection.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# The "html5lib" parser requires the html5lib package to be installed.
soup = BeautifulSoup(html, "html5lib")

# text: the page content with all markup removed. A space separator keeps
# words from adjacent elements from being glued into a single token.
text = soup.get_text(separator=' ', strip=True)

# Convert the text into tokens by splitting on whitespace.
tokens = text.split()

# Count and print the frequency of every raw token.
freq = nltk.FreqDist(tokens)
for key, val in freq.items():
    print(f"{key}:{val}")

# Plot the 20 most frequent raw tokens.
freq.plot(20, cumulative=False)

# Remove stop words. The list is turned into a set for fast membership tests,
# and tokens are lower-cased for the comparison because the English stop-word
# list is lowercase (otherwise "The", "And", ... would slip through).
sr = set(stopwords.words('english'))
clean_tokens = [token for token in tokens if token.lower() not in sr]

# Count and print the frequency of the remaining tokens.
freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(f"{key}:{val}")

# Plot the 20 most frequent tokens after stop-word removal.
freq.plot(20, cumulative=False)
'''