目标:爬取电影网站的电影详情并保存到mysql数据库中
一.设计好数据库和表字段,连接数据库
连接数据库
# Database access
def sql(moviename, years, ziyuan, daoyan, zhuyan, leixing, address, yuyan, pianchang, othername, pdianji, pingfen, jieshao, src):
    """Insert one movie record into the `pydb`.`movies` table.

    A new pymysql connection is opened for every call. The row is written
    with a single parameterized INSERT; the transaction is committed on
    success and rolled back on any error. Errors are reported with print()
    and are not re-raised, so a failed insert does not stop the caller.

    Args:
        moviename: movie title.
        years: release year.
        ziyuan: current resource/quality label.
        daoyan: director(s).
        zhuyan: leading actor(s).
        leixing: genre(s).
        address: region(s).
        yuyan: language(s).
        pianchang: running time.
        othername: alternative title(s).
        pdianji: click count shown on the site.
        pingfen: rating.
        jieshao: plot summary.
        src: poster image URL.
    """
    # Connection settings for the local database (placeholders to fill in).
    db = pymysql.connect(host='host地址', user='账号', password='密码', db='数据库')
    try:
        # The cursor is closed automatically when the with-block exits.
        with db.cursor() as cursor:
            # Parameterized statement; named insert_stmt so it does not
            # shadow this function's own name.
            insert_stmt = 'INSERT INTO `pydb`.`movies`(`moviename`, `years`, `ziyuan`, `daoyan`, `zhuyan`, `leixing`, `address`, `yuyan`, `pianchang`, `othername`, `pdianji`, `pingfen`, `jieshao`,`src`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            data = (moviename, years, ziyuan, daoyan, zhuyan, leixing, address, yuyan, pianchang, othername, pdianji, pingfen, jieshao, src)
            try:
                cursor.execute(insert_stmt, data)
                db.commit()
                print('正在插入数据:--'+str(moviename))
            except Exception as e:
                print('插入--'+str(moviename)+'--数据失败', e)
                # Undo the partial transaction after a failure.
                db.rollback()
    finally:
        # Always release the connection, even if rollback itself fails.
        db.close()
二.先尝试保存一页的数据到数据库中
# Movie detail page
def moviexiangqing(url='https://www.piaku.cc/k/tiejiagangquan/'):
    """Scrape one movie detail page and store its fields through sql().

    Every field is extracted with an XPath query, collapsed from the list
    that xpath() returns into a single string, and trimmed of surrounding
    whitespace before being handed to sql(). Passing the raw lists to the
    database driver only worked when a query matched exactly one text node.

    Relies on the module-level names `requests`, `etree`, `header` and `sql`.

    Args:
        url: address of the detail page; defaults to the sample page used
            while trying out a single-page crawl.
    """
    # A timeout keeps one unresponsive page from hanging the whole crawl.
    response = requests.get(url=url, headers=header, timeout=10)
    html = etree.HTML(response.text)
    meta = "//div[@class='main-ui-meta']"

    def joined(path, sep=''):
        # Collapse the list of matched text nodes into one trimmed string;
        # an empty match yields ''.
        return sep.join(html.xpath(path)).strip()

    # Title
    name = joined(meta + "/h1/text()")
    # Release year, shown on the page wrapped in parentheses
    year = joined(meta + "/h1/span/text()").replace('(', '').replace(')', '')
    # Current resource/quality label
    ziyuan = joined("//div[@class='otherbox']/em[1]/text()")
    # Director(s)
    daoyan = joined(meta + "/div[2]/a/text()", ',')
    # Leading actor(s)
    zhuyan = joined(meta + "/div[3]/a/text()", ',')
    # Genre(s)
    leixing = joined(meta + "/div[4]/a/text()", ',')
    # Region(s)
    address = joined(meta + "/div[5]/a/text()", ',')
    # Language(s)
    yuyan = joined(meta + "/div[6]/a/text()", ',')
    # Running time
    pianchang = joined(meta + "/div[8]/text()")
    # Alternative title(s)
    othername = joined(meta + "/div[9]/text()")
    # Click count shown on the site
    pdianji = joined(meta + "/div[10]/text()")
    # Rating; the raw text is padded with \t, \r and \n
    pingfen = joined(meta + "/div[11]/span[2]/text()")
    # Plot summary
    jieshao = joined("//div[@class='movie-introduce']/p[2]/text()")
    # Poster image URL
    src = joined("//div[@class='img']/img/@src")
    # Persist the record
    sql(name, year, ziyuan, daoyan, zhuyan, leixing, address, yuyan, pianchang, othername, pdianji, pingfen, jieshao, src)
遇到的问题:1. xpath 提取数据时结果为 "Element a at 0x5308a80"(路径末尾没有加 text(),返回的是元素对象而不是文本);2. 提取的数据含有 \t、\r、\n、空格等空白字符;3. 列表数据需要转换为字符串。
三.最后完整爬取整个页面的数据,再通过for循环选择要爬取具体多少页的数据
完整代码
import pymysql
import requests
from fake_useragent import UserAgent
from lxml import etree
#
# Shared request headers: send a Chrome User-Agent string supplied by fake_useragent.
ua = UserAgent()
header = {'user-agent': ua.chrome}
# Database access
def sql(moviename, years, ziyuan, daoyan, zhuyan, leixing, address, yuyan, pianchang, othername, pdianji, pingfen, jieshao, src):
    """Insert one movie record into the `pydb`.`movies` table.

    A new pymysql connection is opened for every call. The row is written
    with a single parameterized INSERT; the transaction is committed on
    success and rolled back on any error. Errors are reported with print()
    and are not re-raised, so a failed insert does not stop the caller.

    Args:
        moviename: movie title.
        years: release year.
        ziyuan: current resource/quality label.
        daoyan: director(s).
        zhuyan: leading actor(s).
        leixing: genre(s).
        address: region(s).
        yuyan: language(s).
        pianchang: running time.
        othername: alternative title(s).
        pdianji: click count shown on the site.
        pingfen: rating.
        jieshao: plot summary.
        src: poster image URL.
    """
    # Connection settings for the local database (placeholders to fill in).
    db = pymysql.connect(host='host地址', user='账号', password='密码', db='数据库')
    try:
        # The cursor is closed automatically when the with-block exits.
        with db.cursor() as cursor:
            # Parameterized statement; named insert_stmt so it does not
            # shadow this function's own name.
            insert_stmt = 'INSERT INTO `pydb`.`movies`(`moviename`, `years`, `ziyuan`, `daoyan`, `zhuyan`, `leixing`, `address`, `yuyan`, `pianchang`, `othername`, `pdianji`, `pingfen`, `jieshao`,`src`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            data = (moviename, years, ziyuan, daoyan, zhuyan, leixing, address, yuyan, pianchang, othername, pdianji, pingfen, jieshao, src)
            try:
                cursor.execute(insert_stmt, data)
                db.commit()
                print('正在插入数据:--'+str(moviename))
            except Exception as e:
                print('插入--'+str(moviename)+'--数据失败', e)
                # Undo the partial transaction after a failure.
                db.rollback()
    finally:
        # Always release the connection, even if rollback itself fails.
        db.close()
# Movie detail page
def moviexiangqing(url):
    """Scrape one movie detail page and store its fields through sql().

    Every field is extracted with an XPath query, collapsed from the list
    that xpath() returns into a single string, and trimmed of surrounding
    whitespace before being handed to sql(). Passing the raw lists to the
    database driver only worked when a query matched exactly one text node.

    Relies on the module-level names `requests`, `etree`, `header` and `sql`.

    Args:
        url: absolute address of the movie detail page.
    """
    # A timeout keeps one unresponsive page from hanging the whole crawl.
    response = requests.get(url=url, headers=header, timeout=10)
    html = etree.HTML(response.text)
    meta = "//div[@class='main-ui-meta']"

    def joined(path, sep=''):
        # Collapse the list of matched text nodes into one trimmed string;
        # an empty match yields ''.
        return sep.join(html.xpath(path)).strip()

    # Title
    name = joined(meta + "/h1/text()")
    # Release year, shown on the page wrapped in parentheses
    year = joined(meta + "/h1/span/text()").replace('(', '').replace(')', '')
    # Current resource/quality label
    ziyuan = joined("//div[@class='otherbox']/em[1]/text()")
    # Director(s)
    daoyan = joined(meta + "/div[2]/a/text()", ',')
    # Leading actor(s)
    zhuyan = joined(meta + "/div[3]/a/text()", ',')
    # Genre(s)
    leixing = joined(meta + "/div[4]/a/text()", ',')
    # Region(s)
    address = joined(meta + "/div[5]/a/text()", ',')
    # Language(s)
    yuyan = joined(meta + "/div[6]/a/text()", ',')
    # Running time
    pianchang = joined(meta + "/div[8]/text()")
    # Alternative title(s)
    othername = joined(meta + "/div[9]/text()")
    # Click count shown on the site
    pdianji = joined(meta + "/div[10]/text()")
    # Rating; the raw text is padded with \t, \r and \n
    pingfen = joined(meta + "/div[11]/span[2]/text()")
    # Plot summary
    jieshao = joined("//div[@class='movie-introduce']/p[2]/text()")
    # Poster image URL
    src = joined("//div[@class='img']/img/@src")
    # Persist the record
    sql(name, year, ziyuan, daoyan, zhuyan, leixing, address, yuyan, pianchang, othername, pdianji, pingfen, jieshao, src)
# One listing page: crawl every movie linked from it
def movie(num):
    """Fetch listing page `num` and scrape each movie it links to.

    Relies on the module-level names `requests`, `etree`, `header` and
    `moviexiangqing`.

    Args:
        num: listing page number, inserted into the
            'https://www.piaku.cc/p/DY-<num>/' URL.
    """
    from urllib.parse import urljoin

    # Request the listing page; the timeout avoids hanging on a dead connection.
    url = 'https://www.piaku.cc/p/DY-'+str(num)+'/'
    response = requests.get(url, headers=header, timeout=10)
    # Parse the page
    html = etree.HTML(response.text)
    # Each matched href points at one movie detail page.
    for href in html.xpath("//div[@class='li-img']/a/@href"):
        # urljoin resolves the href against the site root, so a root-relative
        # link such as '/k/x/' does not produce a doubled slash the way plain
        # string concatenation does.
        moviexiangqing(urljoin('https://www.piaku.cc/', href))
# Ask how many pages to crawl, then crawl pages 1 through that number.
page_count = int(input('输入你要爬取的页数:'))
# range() excludes its stop value, so add 1 to include the last requested page.
for i in range(1, page_count + 1):
    print('+++++++++++++++正在爬取第'+str(i)+'页数据+++++++++++++++')
    movie(i)
    print('+++++++++++++++第'+str(i)+'页数据爬取成功+++++++++++++++')
四.效果展示