网络爬虫(第一集:爬取网页信息)
原创
©著作权归作者所有:来自51CTO博客作者一片白纸的原创作品,请联系作者获取转载授权,否则将追究法律责任
BeautifulSoup对网页进行解析
from bs4 import BeautifulSoup
# Parse a locally saved HTML page and print the titles/tags of well-rated articles.
path = './web/new_index.html'
data = []

with open(path, 'r', encoding='gbk') as f:
    Soup = BeautifulSoup(f.read(), 'lxml')

# CSS-select the parallel lists of article fields.
titles = Soup.select('body > div.main-content > ul > li > div.article-info > h3 > a')  # titles
pics = Soup.select('body > div.main-content > ul > li > img')  # images
descs = Soup.select('body > div.main-content > ul > li > div.article-info > p.description')  # descriptions
rates = Soup.select('body > div.main-content > ul > li > div.rate > span')  # scores
cates = Soup.select('body > div.main-content > ul > li > div.article-info > p.meta-info')  # tags

# Zip the parallel lists into one dict per article.
for title, pic, desc, rate, cate in zip(titles, pics, descs, rates, cates):
    info = {
        'title': title.get_text(),
        'pic': pic.get('src'),
        'descs': desc.get_text(),
        'rate': rate.get_text(),
        'cate': list(cate.stripped_strings),
    }
    data.append(info)

for item in data:
    # BUG FIX: the original tested len(item['rate']) >= 3 — the *string length*
    # of the score text (e.g. "9.2" has length 3) — not the numeric score the
    # comment promised. Compare the parsed float instead.
    if float(item['rate']) >= 3:  # articles scoring 3 or above
        print(item['title'], item['cate'])
真实世界中的网页解析