一.python爬虫简介
1.什么是爬虫:
网络爬虫,是一种按照一定规则,自动抓取互联网信息的程序或者脚本。由于互联网数据的多样性和资源的有限性,根据用户需求定向抓取相关网页并分析已成为如今主流的爬取策略。
2.爬虫的作用:
网络抓取图片,爬取想看的视频,只要通过浏览器访问的数据都可以通过爬虫获取
3.爬虫的本质:
模拟浏览器打开网页,获取网页中我们想要的那部分数据
二.爬取数据
1.urllib模块使用
import urllib.request
import urllib.parse
import urllib.error  # needed for the URLError handler below (was missing)

# GET request: fetch the Baidu homepage and decode the body as UTF-8
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8"))

# POST request: the form payload must be URL-encoded and converted to bytes
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))

# Timeout handling: a slow server raises URLError once the deadline passes
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("time out")

# Inspect the HTTP status code and the response headers
response = urllib.request.urlopen("http://www.baidu.com")
print(response.status)
print(response.getheaders())

# Crawl douban.com while presenting a real browser User-Agent
url = "http://www.douban.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
}
# data = bytes(urllib.parse.urlencode({"name":"eric"}),encoding="utf-8")
# NOTE(review): this sends a POST with no body; if the endpoint expects form
# fields, pass data=... as in the commented line above — TODO confirm intent.
req = urllib.request.Request(url=url, headers=headers, method="POST")
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
2.实例-数据获取
# Fetch the page content for one URL.
def askURl(url):
    """Request *url* with a browser User-Agent and return the UTF-8 body.

    Returns an empty string when the request fails (the original left
    ``html`` unbound on the error path and raised NameError at ``return``).
    """
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""  # default so the error path still returns a string
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; plain URLError does not
        if hasattr(e, "code"):
            print(e.code)
    return html
三.解析数据
1.BeautifulSoup模块
#!/usr/bin/python3
# @DESC: BeautifulSoup4 parses an HTML document into a tree of Python objects;
# every node is one of four types: Tag, NavigableString, BeautifulSoup, Comment.
import re
from bs4 import BeautifulSoup

# Read the sample page as bytes and decode it ourselves.
with open("./baidu.html", "rb") as file:  # with-block closes the handle (original leaked it)
    html = file.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")

# 1. Tag: attribute access returns the FIRST matching element
print(bs.title)  # first <title>
print(bs.a)      # first <a>
print(bs.head)   # first <head>

# 2. NavigableString: the text inside a tag
print(bs.title.string)  # text of <title>
print(bs.a.attrs)       # all attributes of the first <a>
print(bs.a.string)      # text of the first <a>

# 3. BeautifulSoup: represents the whole document
print(bs.name)
print(bs.attrs)
print(bs)

# 4. Comment: a special NavigableString whose output omits the comment markers

# 5. Traversing the document
print(bs.head.contents)
print(bs.head.contents[0])

# 6. Searching the document
# 6.1 find_all() with a string: matches tag names exactly
t_list = bs.find_all("a")  # every <a> tag
print(t_list)

# 6.2 Regular-expression search: any tag whose name matches the pattern
t_list = bs.find_all(re.compile("a"))
print(t_list)

# 6.3 Search with a predicate function
def name_is_exists(tag):
    return tag.has_attr("name")

t_list = bs.find_all(name_is_exists)
for item in t_list:
    print(item)

# 6.3 kwargs-style filters
t_list = bs.find_all(id="head", name=True, limit=3)
#t_list = bs.find_all(text="贴吧")
for item in t_list:
    print(item)

# 6.4 CSS selectors
t_list = bs.select('title')         # by tag name
t_list = bs.select('#u1')           # by id
t_list = bs.select(".mnav")         # by class
t_list = bs.select("a[class]")      # by attribute presence
t_list = bs.select(".mnav ~ .bri")  # sibling selector
for item in t_list:
    print(item)
2.re模块
import re

# --- search with a pre-compiled pattern object ---
pattern = re.compile("AA")           # "AA" is the regex used to validate other strings
match = pattern.search("CBA")        # the argument is the string being checked
match = pattern.search("ABCAA")
match = pattern.search("BAACABCAA")
print(match)                         # first match only; span is half-open [start, end)

# --- search without compiling first ---
match = re.search("asd", "Aasd")     # pattern first, subject string second
#print(match)

# --- findall: every non-overlapping match, returned as a list ---
print(re.findall("a", "ASDaDEFGAa"))
print(re.findall("[A-Z]", "ASDaDEFGAa"))   # each capital letter separately
print(re.findall("[A-Z]+", "ASDaDEFGAa"))  # greedy runs of capitals

# --- non-greedy group extraction with (.*?) ---
print(re.findall("AS(.*?)Aa", "ASDaDEFGAa"))

# --- sub: regex-based replacement ---
print(re.sub("a", "A", "abcdcasd"))        # replace every "a" with "A"
print(re.sub("\n", "", "ab\ndca\nsd"))     # strip newlines
# Tip: prefix regex literals with r (raw string) so escapes are not a worry.
3.实例-数据解析
# Pre-compiled regular expressions describing the fields to extract from each movie card.
findLink = re.compile(r'<a href="(.*?)">') #movie detail-page link
findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S) #poster image URL; re.S lets "." span newlines
findTitle = re.compile(r'<span class=".*">(.*?)</span>') #movie title(s)
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>') #average score
fingCommentNum = re.compile(r'<span>(\d*?)人评价</span>') #number of raters (name typo "fing" kept: referenced by getData)
findInq = re.compile(r'<span class="inq">(.*?)</span>') #one-line summary
findBD = re.compile(r'<p class="">(.*?)</p>',re.S) #related-info block
# Crawl the ten listing pages and parse every movie entry.
def getData(baseurl):
    """Fetch 10 pages (25 movies each) starting at *baseurl* and return a list
    of 9-element records: [link, img, name1, name2, name3, score, raters,
    summary, related-info].
    """
    datalist = []
    for page in range(10):  # 10 pages of 25 entries each
        url = baseurl + str(page * 25)
        html = askURl(url)
        # Each movie card lives in a <div class="item">
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []  # one movie's fields, in column order
            item = str(item)
            link = re.findall(findLink, item)[0]  # detail-page link
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]  # poster URL
            data.append(imgSrc)
            # A movie may carry 1-3 titles; pad with "" so every record has 3
            # (the original indexed titles[0..2] and crashed on short lists).
            titles = re.findall(findTitle, item)
            for idx in range(3):
                if idx < len(titles):
                    res = titles[idx].replace("/", "").replace(" ", "").replace("\xa0", "")
                    data.append(res)
                else:
                    data.append("")
            rating = re.findall(findRating, item)[0]  # score
            data.append(rating)
            commentNum = re.findall(fingCommentNum, item)[0]  # rater count
            data.append(commentNum)
            inq = re.findall(findInq, item)  # optional one-line summary
            if inq:
                data.append(inq[0].replace(".", "").replace(" ", "").replace("。", ""))
            else:
                data.append("")
            bd = re.findall(findBD, item)[0]  # related info
            # Strip <br/> tags; the original pattern had a stray trailing ">"
            # ('<br(\s+)?/>(\s+)?>') so it never matched a real <br/>.
            bd = re.sub(r'<br(\s+)?/?>(\s+)?', "", bd)
            bd = re.sub(r'/', "", bd)
            bd = re.sub('\xa0', "", bd)
            bd = re.sub(' ', "", bd)
            data.append(bd.strip())
            datalist.append(data)
    return datalist
四.保存数据
1.xlwt模块
import xlwt

# Minimal xlwt walk-through: build a workbook, write one cell, save it.
book = xlwt.Workbook(encoding="utf-8")  # workbook container
ws = book.add_sheet('sheet1')           # one worksheet inside it
ws.write(0, 0, 'hello')                 # write(row, column, value)
book.save('student.xls')                # flush the workbook to disk
2.sqlite3模块
import sqlite3

# 1. Open (or create) the database file
conn = sqlite3.connect("test.db")
# Tip: install the "Database Navigator" plugin and restart PyCharm to browse it.
print("Opened database successfully")
c = conn.cursor()  # cursor used for every statement below

# 2. Create the table.
# Fixed typo "promary" -> "primary": SQLite silently parsed the misspelling as
# part of the column type, so no primary-key constraint was actually created.
sql_creatTabel = '''
create table if not exists company
(id int primary key not null,
name text not null,
age int not null,
address char(50),
salary real);
'''
c.execute(sql_creatTabel)
conn.commit()
print("Creat table successfully")

# 3. Insert sample rows. Clear old rows first so re-running the script does
# not violate the (now enforced) primary key.
c.execute("delete from company")
sql_insertData1 = '''
insert into company(id,name,age,address,salary)
values(1,'张三',35,'南京',10000);
'''
sql_insertData2 = '''
insert into company(id,name,age,address,salary)
values(2,'李四',27,'北京',15000);
'''
c.execute(sql_insertData1)
c.execute(sql_insertData2)
conn.commit()
print("Insert Data successfully")

# 4. Query the rows back. Columns are (id, name, age, address, salary);
# the original printed age under the "address" label and salary under row[3].
sql_queryData = ' select * from company '
cursor = c.execute(sql_queryData)
for row in cursor:
    print("id=", row[0], end="")
    print("name=", row[1], end="")
    print("age=", row[2], end="")
    print("address=", row[3], end="")
    print("salary=", row[4], end="\n")
print("Query Data successfully")
conn.close()
3.实例-数据xls
# Save the scraped records into an .xls spreadsheet.
def saveData(datalist, savepath):
    """Write *datalist* (rows of 9 fields each) to *savepath* as Excel."""
    print("save......")
    book = xlwt.Workbook(encoding="utf8", style_compression=0)
    sheet = book.add_sheet("豆瓣电影Top250", cell_overwrite_ok=True)
    col = ('电影详情链接', "图片链接", "名片1", "名片2", "名片3", "评分", "评价数", "概括", "相关信息")
    for j, title in enumerate(col):
        sheet.write(0, j, title)  # header row
    # Iterate the actual data instead of a hard-coded 250 rows, so shorter
    # result sets no longer raise IndexError (and longer ones are not cut off).
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)  # flush to disk
4.实例-数据保存DB
# Create the movie250 table if the database does not have it yet.
def init_db(dbpath):
    """Initialise the SQLite database at *dbpath* with the movie250 table."""
    sql = '''
    create table if not exists movie250(
    id integer primary key autoincrement,
    info_link text,
    pic_link text,
    name1 varchar,
    name2 varchar,
    name3 varchar,
    score numeric,
    rated numeric,
    instroduction text,
    info text
    )
    '''  # NOTE: column name "instroduction" (sic) kept -- saveData2DB refers to it
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)  # Connection.execute opens/closes a cursor for us
        conn.commit()
    finally:
        conn.close()  # close even if the DDL fails (original leaked on error)
# Persist scraped records into SQLite.
def saveData2DB(datalist, dbpath):
    """Insert every record of *datalist* into the movie250 table at *dbpath*.

    Uses parameterised queries instead of string-built SQL, which avoids SQL
    injection and no longer breaks on values containing quotes; it also leaves
    *datalist* unmodified (the original wrapped each field in quotes in place).
    """
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    sql = '''
        insert into movie250(
        info_link,pic_link,name1,name2,name3,score,rated,instroduction,info)
        values(?,?,?,?,?,?,?,?,?)'''
    for data in datalist:
        cur.execute(sql, data)  # sqlite3 binds the 9 fields safely
    conn.commit()
    cur.close()
    conn.close()
五.完整源码
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Time:2021/8/21 11:43
# @author: Mrwhite
# @File:spiderdouban250.py
# @DESC:
from bs4 import BeautifulSoup #网页解析,获取数据
import re #正则表达式 进行文字匹配
import urllib.request,urllib.error #制定URL,获取网页数据
import xlwt #进行excel操作
import sqlite3 #进行数据库操作
def main():
    """Crawl the Douban Top-250 listing, then persist the results."""
    # Pages are addressed via ?start=N (N = 0, 25, 50, ...)
    baseurl = "https://movie.douban.com/top250?start="
    # 1-2. Crawl and parse
    datalist = getData(baseurl)
    savepath = "豆瓣电影Top250.xls"
    dbpath = "movie.db"
    # 3. Save the data (Excel writer kept for reference)
    #saveData(datalist,savepath)
    saveData2DB(datalist, dbpath)
# Pre-compiled regular expressions describing the fields to extract from each movie card.
findLink = re.compile(r'<a href="(.*?)">') #movie detail-page link
findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S) #poster image URL; re.S lets "." span newlines
findTitle = re.compile(r'<span class=".*">(.*?)</span>') #movie title(s)
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>') #average score
fingCommentNum = re.compile(r'<span>(\d*?)人评价</span>') #number of raters (name typo "fing" kept: referenced by getData)
findInq = re.compile(r'<span class="inq">(.*?)</span>') #one-line summary
findBD = re.compile(r'<p class="">(.*?)</p>',re.S) #related-info block
# Crawl the ten listing pages and parse every movie entry.
def getData(baseurl):
    """Fetch 10 pages (25 movies each) starting at *baseurl* and return a list
    of 9-element records: [link, img, name1, name2, name3, score, raters,
    summary, related-info].
    """
    datalist = []
    for page in range(10):  # 10 pages of 25 entries each
        url = baseurl + str(page * 25)
        html = askURl(url)
        # Each movie card lives in a <div class="item">
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []  # one movie's fields, in column order
            item = str(item)
            link = re.findall(findLink, item)[0]  # detail-page link
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]  # poster URL
            data.append(imgSrc)
            # A movie may carry 1-3 titles; pad with "" so every record has 3
            # (the original indexed titles[0..2] and crashed on short lists).
            titles = re.findall(findTitle, item)
            for idx in range(3):
                if idx < len(titles):
                    res = titles[idx].replace("/", "").replace(" ", "").replace("\xa0", "")
                    data.append(res)
                else:
                    data.append("")
            rating = re.findall(findRating, item)[0]  # score
            data.append(rating)
            commentNum = re.findall(fingCommentNum, item)[0]  # rater count
            data.append(commentNum)
            inq = re.findall(findInq, item)  # optional one-line summary
            if inq:
                data.append(inq[0].replace(".", "").replace(" ", "").replace("。", ""))
            else:
                data.append("")
            bd = re.findall(findBD, item)[0]  # related info
            # Strip <br/> tags; the original pattern had a stray trailing ">"
            # ('<br(\s+)?/>(\s+)?>') so it never matched a real <br/>.
            bd = re.sub(r'<br(\s+)?/?>(\s+)?', "", bd)
            bd = re.sub(r'/', "", bd)
            bd = re.sub('\xa0', "", bd)
            bd = re.sub(' ', "", bd)
            data.append(bd.strip())
            datalist.append(data)
    return datalist
# Fetch the page content for one URL.
def askURl(url):
    """Request *url* with a browser User-Agent and return the UTF-8 body.

    Returns an empty string when the request fails (the original left
    ``html`` unbound on the error path and raised NameError at ``return``).
    """
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""  # default so the error path still returns a string
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; plain URLError does not
        if hasattr(e, "code"):
            print(e.code)
    return html
# Save the scraped records into an .xls spreadsheet.
def saveData(datalist, savepath):
    """Write *datalist* (rows of 9 fields each) to *savepath* as Excel."""
    print("save......")
    book = xlwt.Workbook(encoding="utf8", style_compression=0)
    sheet = book.add_sheet("豆瓣电影Top250", cell_overwrite_ok=True)
    col = ('电影详情链接', "图片链接", "名片1", "名片2", "名片3", "评分", "评价数", "概括", "相关信息")
    for j, title in enumerate(col):
        sheet.write(0, j, title)  # header row
    # Iterate the actual data instead of a hard-coded 250 rows, so shorter
    # result sets no longer raise IndexError (and longer ones are not cut off).
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)  # flush to disk
# Create the movie250 table if the database does not have it yet.
def init_db(dbpath):
    """Initialise the SQLite database at *dbpath* with the movie250 table."""
    sql = '''
    create table if not exists movie250(
    id integer primary key autoincrement,
    info_link text,
    pic_link text,
    name1 varchar,
    name2 varchar,
    name3 varchar,
    score numeric,
    rated numeric,
    instroduction text,
    info text
    )
    '''  # NOTE: column name "instroduction" (sic) kept -- saveData2DB refers to it
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)  # Connection.execute opens/closes a cursor for us
        conn.commit()
    finally:
        conn.close()  # close even if the DDL fails (original leaked on error)
# Persist scraped records into SQLite.
def saveData2DB(datalist, dbpath):
    """Insert every record of *datalist* into the movie250 table at *dbpath*.

    Uses parameterised queries instead of string-built SQL, which avoids SQL
    injection and no longer breaks on values containing quotes; it also leaves
    *datalist* unmodified (the original wrapped each field in quotes in place).
    """
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    sql = '''
        insert into movie250(
        info_link,pic_link,name1,name2,name3,score,rated,instroduction,info)
        values(?,?,?,?,?,?,?,?,?)'''
    for data in datalist:
        cur.execute(sql, data)  # sqlite3 binds the 9 fields safely
    conn.commit()
    cur.close()
    conn.close()
if __name__ == "__main__":  # script entry point
    main()
    print("爬取完毕")