#定位到2022必看热片
#提取子页面连接地址
#拿到想要的下载地址
import re
import requests
import csv


# Request headers: spoof a desktop Chrome UA so the site serves normal pages.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}

url = "https://dytt89.com/"

# Output CSV, opened in append mode; newline='' stops csv from doubling rows.
f = open("电影天堂.csv", mode="a", encoding="utf-8", newline='')
csvwriter = csv.writer(f)

# Fetch the front page. dytt pages are served as gb2312 — decoding them as
# utf-8 produces mojibake, so the encoding is set explicitly before .text.
requ = requests.get(url, headers=header)
requ.encoding = "gb2312"
print(requ.text)
# Pattern 1: isolate the <ul> list that follows the "综艺&动漫" section header.
obj1 = re.compile(r'<span style="float:left;">综艺&动漫.*?<ul>(?P<ul>.*?)</ul>', re.S)
# Pattern 2: inside that <ul>, pull each 2022 entry's relative link and title.
obj2 = re.compile(r"<li><a href='(?P<link>.*?)' title=.*?2022年(?P<name>.*?)</a><span>", re.S)
# Pattern 3: on a detail page, capture the poster image URL, the translated
# title ("译名") and the magnet download link. Used by the loop further below.
obj3 = re.compile(r'<img alt="" src="(?P<image>.*?)" style=.*?译  名 (?P<tit>.*?)<br />.*?<td style="WORD-WRAP:.*?<a href="(?P<link2>.*?)">magnet', re.S)

herf_list = []  # absolute URLs of the per-movie detail pages
for section in obj1.finditer(requ.text):
    ul = section.group("ul")
    # NOTE: original code reused loop variable `i` for both levels; the inner
    # loop gets its own name here to avoid shadowing.
    for item in obj2.finditer(ul):
        # Relative links look like "/12345.html"; strip the slash and join
        # with the base url to build the absolute detail-page address.
        href = url + item.group("link").strip("/")
        herf_list.append(href)
        print(item.group("name"))

# Visit every collected detail page and write (image, title, magnet) rows.
for j in herf_list:
    # timeout added so one stalled page cannot hang the whole run.
    requst = requests.get(j, headers=header, timeout=10)
    requst.encoding = "gb2312"  # detail pages use the same gb2312 encoding
    for s in obj3.finditer(requst.text):
        print(s.group("tit"))
        # groupdict() preserves pattern order, so each row is image, tit, link2.
        csvwriter.writerow(s.groupdict().values())

# Close the CSV explicitly — the original leaked the handle, which can lose
# buffered rows on interpreter exit.
f.close()
print("over")

使用 Python 先爬取页面列表中的子页面链接,再根据这些子页面链接逐一访问电影天堂的详情页,把电影的下载链接和海报图片提取出来并存储到 CSV 文件中。

python爬取b站所有动漫简介和电影天堂下载链接_动漫


 出现繁体字是因为这些子页面本身使用的就是繁体的表示方法。

根据这一思路,我爬取了b站所有动漫的子页面的简介,下面附上代码

#定位到动漫列表
#提取子页面连接地址
#拿到想要的下载地址

import requests
import re
import csv
# Spoof a desktop Chrome UA; the default python-requests agent tends to be
# rejected by the API.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}

# Pattern 1: pull each bangumi's detail-page link out of the API's JSON text.
obj1 = re.compile(r'"link":"(?P<link>.*?)","media_id"', re.S)
# Pattern 2: on the detail page, capture the og:title and the itemprop
# description (the synopsis / 简介).
obj2 = re.compile(r'<meta property="og:title" content="(?P<title>.*?)"><meta property.*?:image" content=".*?"><meta name=".*?itemprop="description" content="(?P<jianjie>.*?)"><meta it', re.S)

url = "https://api.bilibili.com/pgc/season/index/result/#"

# Open the output CSV ONCE. The original reopened it on every one of the 162
# loop iterations and never closed it — a file-handle leak that can also lose
# buffered rows on exit.
f = open("动漫简介.csv", mode="a", encoding="utf-8", newline='')
csvwriter = csv.writer(f)

# The anime index API has pages 1..162; order=4 sorts by rating.
for wq in range(1, 163):
    dat = {
        "season_version": "-1",
        "spoken_language_type": "-1",
        "area": "-1",
        "is_finish": "-1",
        "copyright": "-1",
        "season_status": "-1",
        "season_month": "-1",
        "year": "-1",
        "style_id": "-1",
        "order": "4",
        "st": "1",
        "sort": "0",
        "page": f"{wq}",  # only the page number changes between iterations
        "season_type": "1",
        "pagesize": "20",
        "type": "1"
    }
    requ = requests.get(url, headers=header, params=dat, timeout=10)
    print(requ.text)

    link_list = []
    for m in obj1.finditer(requ.text):
        dis = m.group("link")
        print(dis)
        link_list.append(dis)

    for j in link_list:
        print(j)
        # Detail pages do not need the API query params (the original
        # mistakenly forwarded params=dat here).
        requ1 = requests.get(j, headers=header, timeout=10)
        for k in obj2.finditer(requ1.text):
            print(k.group("title"))
            print(k.group("jianjie"))
            csvwriter.writerow(k.groupdict().values())

f.close()

python爬取b站所有动漫简介和电影天堂下载链接_动漫_02


 动画列表是按照评分从高到低的顺序排列的(对应请求参数 order=4)。