import requests
import time
from bs4 import BeautifulSoup
import json
# Required libraries
def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Raises requests.HTTPError (via raise_for_status) on a non-2xx status
    and requests.Timeout after 30 seconds.
    """
    # Browser-like headers to reduce the chance of the API rejecting us.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) appleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    r = requests.get(url, timeout=30, headers=headers)
    r.raise_for_status()
    # BUG FIX: the original wrote to the misspelled attribute
    # ``r.endcodding``, which has no effect; ``r.encoding`` is what
    # controls how ``r.text`` decodes the body.
    r.encoding = 'utf-8'
    return r.text
def get_content(url):
    """Download one page of the Bilibili reply API and extract comment info.

    Returns a list of dicts with keys 用户名 / uid号 / 评论内容 / 性别,
    or an empty list when the response body is not valid JSON.

    NOTE(review): when the API signals "no more pages" with
    ``data.replies = null``, iterating it raises TypeError, which the
    top-level loop's except clause uses as its stop condition.
    """
    comments = []
    html = get_html(url)
    try:
        s = json.loads(html)
    except json.JSONDecodeError:
        # BUG FIX: the original bare ``except`` printed the message but
        # fell through with ``s`` unbound, crashing with NameError on the
        # next line; bail out with an empty result instead.
        print("jsonload error")
        return comments
    # Each entry of data.replies is one comment on this page.
    for comment in s['data']['replies']:
        info = {
            '用户名': comment['member']['uname'],
            'uid号': comment['member']['mid'],
            '评论内容': comment['content']['message'],
            '性别': comment['member']['sex'],
        }
        comments.append(info)
    return comments
def Out2File(comments):
    """Append each comment record in *comments* to 评论区爬取.txt.

    *comments* is a list of dicts as produced by get_content(). Records
    missing an expected key are reported and skipped rather than
    aborting the whole page.
    """
    # NOTE: parameter renamed from ``dict`` — it shadowed the builtin;
    # the only call site passes it positionally, so this is safe.
    with open('评论区爬取.txt', 'a+', encoding='utf-8') as f:
        for user in comments:
            try:
                f.write('姓名:{}\t uid:{}\t 性别:{}\t \n 评论内容:{}\t \n'.format(
                    user['用户名'], user['uid号'], user['性别'], user['评论内容']))
            except KeyError:
                # BUG FIX: narrowed from a bare ``except`` — only a
                # malformed record (missing key) should be skipped here.
                print("out2File error")
    print('当前页面保存完成')
# Crawl comment pages one after another until a request/parse failure
# (e.g. the API stops returning replies) ends the loop.
e = 0
page = 1
while e == 0:
    url = ("https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next="
           + str(page) + "&type=1&oid=677870443&mode=3&plat=1&_=1641278727643")
    try:
        print()
        content = get_content(url)
        print("page:", page)
        Out2File(content)
        page = page + 1
        # To lower the risk of an IP ban, rest 5 seconds every 10 pages.
        if page % 10 == 0:
            time.sleep(5)
    except Exception as err:
        # BUG FIX: narrowed from a bare ``except``, which also swallowed
        # KeyboardInterrupt and hid why the crawl stopped.
        print('stopped at page', page, '-', err)
        e = 1
# BUG FIX: the reference link below was a bare (non-comment) line, which
# made the whole file a SyntaxError.
# 参考视频 (reference video):
# https://www.bilibili.com/video/BV1fu411d7Hy?from=search&seid=3483579157564497530&spm_id_from=333.337.0.0