Python爬取网站内容并进行文字预处理(英文)

  • 注:输出部分用省略号代替...
  • 爬取网站

'''
import urllib.request

# Download the PHP home page. The context manager closes the HTTP
# connection as soon as the body has been read.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()  # raw page body, as bytes

print(html)
'''

  • 输出:

'''
b'\n\n\n\n \n \n\n

PHP: Hypertext Preprocessor\n\n \n \n <link rel="alternate" type="application/atom+xml" href="http://php.net/releases/feed.php" ...

'''

  • 转换为干净文本

'''
import urllib.request
from bs4 import BeautifulSoup

# Download the page. The context manager closes the HTTP connection
# as soon as the body has been read.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# The "html5lib" parser requires the html5lib package to be installed.
soup = BeautifulSoup(html, "html5lib")

# -- text -- now holds the clean text of the page, without markup.
text = soup.get_text(strip=True)

print(text)
'''
输出为:
'''
PHP: Hypertext PreprocessorDownloadsDocumentationGet InvolvedHelpGetting StartedIntroductionA simple tutorialLanguage ReferenceBasic ......
'''

  • 转换为tokens
    '''
    import urllib.request
    from bs4 import BeautifulSoup

# Download the page. The context manager closes the HTTP connection
# as soon as the body has been read.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# The "html5lib" parser requires the html5lib package to be installed.
soup = BeautifulSoup(html, "html5lib")

# -- text -- now holds the clean text of the page, without markup.
# NOTE: strip=True with no separator joins adjacent text fragments
# directly, so some tokens below contain several words run together.
text = soup.get_text(strip=True)

# -- Convert the text into tokens by splitting on whitespace.
tokens = text.split()
print(tokens)
'''
输出为:
'''
['PHP:', 'Hypertext', 'PreprocessorDownloadsDocumentationGet', 'InvolvedHelpGetting', 'StartedIntroductionA', 'simple', 'tutorialLanguage', 'ReferenceBasic', ...
'''

  • 完整版 python爬取文字加分词预处理(英文)

'''
import urllib.request

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# stopwords.words() needs the "stopwords" corpus on disk. Request just
# that resource by name instead of opening the interactive downloader.
nltk.download('stopwords')

# Download the page. The context manager closes the HTTP connection
# as soon as the body has been read.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# The "html5lib" parser requires the html5lib package to be installed.
soup = BeautifulSoup(html, "html5lib")

# -- text -- now holds the clean text of the page, without markup.
text = soup.get_text(strip=True)

# -- Convert the text into tokens by splitting on whitespace.
tokens = text.split()

# -- Compute the frequency of every token.
freq = nltk.FreqDist(tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

# -- Plot the 20 most frequent tokens.
freq.plot(20, cumulative=False)

# -- Remove stop words.
# A set gives constant-time membership tests inside the filter below.
sr = set(stopwords.words('english'))
clean_tokens = [token for token in tokens if token not in sr]

# -- Compute the frequency again, now without stop words.
freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

# -- Plot the 20 most frequent remaining tokens.
freq.plot(20, cumulative=False)

'''