爬虫学习3——BeautifulSoup

原创

时间带 2018-06-04 10:01:29 博主文章分类：python ©著作权

文章标签 python 爬虫入门 BeautifulSoup 文章分类 Python 后端开发

©著作权归作者所有：来自51CTO博客作者时间带的原创作品，请联系作者获取转载授权，否则将追究法律责任

没啥废话，直接开始吧，新建一个Python文件，对着练习就行了。可以添加print查看程序执行情况。

from bs4 import BeautifulSoup
# Sample HTML source used by the examples that follow.
# NOTE: the closing </body> and </html> tags are absent from this sample.
html_doc = """    
<html>
		<head>
				<title>The Dormouse's story</title>
		</head>
		<body>
    <p class="story">Once upon a time there were many children and their names were
    <a rel="nofollow" href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a rel="nofollow" href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a rel="nofollow" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    <a rel="nofollow" href="http://example.com/Bob" class="sister" id="link4">Bob</a>;
    <a rel="nofollow" href="http://example.com/King" class="brother" id="link5">King</a>;
    <a rel="nofollow" href="http://example.com/Mary" class="sister" id="link6">Mary</a>;
    <a rel="nofollow" href="http://example.com/Target" class="sister" id="link7">Target said：“I am very strong”</a>;
    <a rel="nofollow" href="http://example.com/Jack" class="brother" id="link8">Jack</a>;
    and they lived at the bottom of a well.
    </p>


				<p class="story">...</p>
"""
	
# Parse the sample markup with the lxml parser (the third-party `lxml`
# package must be installed). The sample string is named `html_doc`.
bs = BeautifulSoup(html_doc, 'lxml')

# prettify is a method and has to be called; it returns the parsed document
# as an indented, human-readable string.
doc = bs.prettify()

doc = bs.title.string   # text content of the <title> tag
doc = bs.title.text    # also the text content of the <title> tag
doc = bs.a.text    # text content of the first <a> tag
doc = bs.a.string  # text content of the first <a> tag

doc = bs.title   # the <title> tag itself

doc = bs.head   # the <head> tag

doc = bs.body.a  # nested selection: the <a> tag under <body>; only the first match is returned

doc = bs.p['class']   # value of the class attribute of the first <p> tag

doc = bs.find_all('a')   # every <a> tag

doc = bs.find('a')   # search for <a>; only the first match is returned

doc = bs.a.parent   # direct parent of the first <a> tag

doc = bs.a.parents   # ancestors of the first <a> tag
#print(type(doc))   # the ancestors come back as a generator; print them with a for loop
#for item in doc:
#    print(item)

doc = bs.a.next_sibling  # the single node that directly follows the first <a> tag
#print(doc)
# To visit every following sibling, loop over the plural `next_siblings` instead:
#for item in bs.a.next_siblings:
#    print(item)

#find_next_siblings()  returns all following sibling tags
#find_previous_siblings()  returns all preceding sibling tags
#find_next_sibling()  returns the first following sibling tag
#find_previous_sibling()  returns the first preceding sibling tag


doc = bs.find_all('a')  # find every <a> tag

doc = bs.find_all(attrs={'id':'link1'})   # find all tags by attribute, using an attrs dictionary
doc = bs.find_all(attrs={'id':'link3'})   # find all tags by attribute, using an attrs dictionary

doc = bs.find_all(id='link3')   # look up by id directly as a keyword argument instead of a dictionary
doc = bs.find_all(class_='brother')   # note the trailing underscore: `class` is a reserved word in Python


doc = bs.find_all(text='Target') # search by text content; the text must match exactly, so this one finds nothing
doc = bs.find_all(text='Bob')  # this one does match

#find_all_next()  returns every matching node after the current node
#find_next()   returns the first matching node after the current node


doc = bs.select('#link3')  # select() is the CSS selector support built into bs; '#link3' selects by id
doc = bs.select('.brother')  # '.brother' selects by class (not by id)
doc = bs.select('p a')  # every <a> tag nested under a <p> tag


doc = bs.select('a')   # every <a> tag; loop over the result to print each href attribute
#for item in doc:
#    print(item['href'])

doc = bs.select('a')   # every <a> tag; loop over the result to print each tag's text
#for item in doc:
#    print(item.text)

实战：爬取豆瓣上指定电影的信息：

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
import re

def write_info(head, body):
    """Append one record to ``moveinfo.txt`` in the current directory.

    The record is ``head`` immediately followed by ``body`` and a blank
    line. The file is opened in append mode with UTF-8 encoding, so earlier
    records are preserved.

    Args:
        head: Heading text for the record.
        body: Detail text written directly after the heading.
    """
    # The with-statement closes the file on exit, so no explicit close()
    # call is needed.
    with open('moveinfo.txt', 'a', encoding='utf-8') as f:
        f.write(head + body + '\n\n')

def get_info(type, url, name):
    """Download a detail page and save the text of its ``#info`` element.

    For every element matching the CSS selector ``#info`` a progress message
    and the heading are printed, and the heading plus the element's text are
    passed to ``write_info``.

    Args:
        type: Category label for the entry; used in the heading. The name
            shadows the builtin ``type`` but is kept so existing callers
            keep working.
        url: Address of the detail page to download.
        name: Title of the entry; used in the heading and progress message.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Without a timeout, requests can wait forever on an unresponsive server.
    response = requests.get(url, timeout=10).text
    doc = BeautifulSoup(response, 'lxml')
    # The heading does not depend on the matched element, so build it once.
    head = type + ":" + name
    for section in doc.select('#info'):
        print('正在加载 ',type,":","《" +name+ "》",' 信息......')
        print(head)
        write_info(head, section.text)

def get_url(search):
    """Run a Douban search for *search* and collect details for each hit.

    The search page is requested with ``cat=1002`` and the URL-quoted query.
    All ``<h3>`` elements are serialised to a string, and a regular
    expression pulls three fields out of each one: the bracketed label, the
    link target and the link text. Each triple is passed to ``get_info``.

    Args:
        search: Search keywords entered by the user.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Percent-encode the query so non-ASCII keywords form a valid URL.
    url = 'https://www.douban.com/search?cat=1002&q=' + quote(search)
    # Without a timeout, requests can wait forever on an unresponsive server.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    headings = str(soup.find_all('h3'))
    # Raw string: "\[" and "\]" are regex escapes, not Python string escapes.
    # re.S lets "." span the newlines inside each heading.
    pattern = re.compile(r'<h3>.*?<span>\[(.*?)\]</spa.*?href="(.*?)".*?target.*?>(.*?)</a>', re.S)
    for category, link, title in pattern.findall(headings):
        print('====================================================')
        get_info(category, link, title)

if __name__ == '__main__':
    # Script entry point: prompt for the search keywords and pass them
    # straight to the search routine.
    get_url(input('请输入要收集的电影信息：'))