import requests

import time

from bs4 import BeautifulSoup

import json

# 必要的库



def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Raises requests.HTTPError (via raise_for_status) when the server
    answers with a 4xx/5xx status code.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) appleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    # Browser-like headers so the API does not reject the request.
    r = requests.get(url, timeout=30, headers=headers)
    r.raise_for_status()
    # BUGFIX: the original wrote to the misspelled attribute
    # `r.endcodding`, which requests silently ignores; `r.encoding`
    # is the attribute that controls how `r.text` is decoded.
    r.encoding = 'utf-8'
    return r.text



def get_content(url):
    """Download one page of comments from *url* and return a list of
    dicts with keys 用户名 / uid号 / 评论内容 / 性别.

    Raises ValueError when the response is not valid JSON or contains
    no replies — callers rely on an exception from this function to
    stop paginating.
    """
    html = get_html(url)
    try:
        payload = json.loads(html)
    except json.JSONDecodeError as exc:
        # BUGFIX: the original bare `except` only printed a message and
        # then crashed with NameError on the undefined result variable.
        # Raise a meaningful error instead; the pagination loop catches
        # it and stops.
        raise ValueError("jsonload error: response is not valid JSON") from exc

    replies = payload['data']['replies']
    if not replies:
        # The API returns null (or an empty list) past the last page.
        # Raising here preserves the original stop-the-loop behavior,
        # which previously came from `len(None)` raising TypeError.
        raise ValueError("no replies on this page")

    comments = []
    for comment in replies:  # one entry per comment bar on the page
        comments.append({
            '用户名': comment['member']['uname'],
            'uid号': comment['member']['mid'],
            '评论内容': comment['content']['message'],
            '性别': comment['member']['sex'],
        })
    return comments

def Out2File(dict):
    # NOTE: the parameter name shadows the builtin `dict`; it is kept
    # unchanged for backward compatibility with existing callers.
    """Append every comment record in *dict* (a list of comment dicts)
    to 评论区爬取.txt, one formatted entry per record.

    A record missing an expected key is reported and skipped, so one
    bad record does not abort the whole page.
    """
    with open('评论区爬取.txt', 'a+', encoding='utf-8') as f:
        for user in dict:
            try:
                f.write('姓名:{}\t uid:{}\t 性别:{}\t \n 评论内容:{}\t \n'.format(
                    user['用户名'], user['uid号'], user['性别'], user['评论内容']))
            except KeyError as exc:
                # Narrowed from a bare `except`: only a missing key is a
                # per-record problem worth skipping.
                print("out2File error", exc)
    print('当前页面保存完成')





# --- Script entry: crawl comment pages until the API stops returning data ---
page = 1
while True:
    url = "https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next=" + str(page) + "&type=1&oid=677870443&mode=3&plat=1&_=1641278727643"
    try:
        print()
        content = get_content(url)
        print("page:", page)
        Out2File(content)
        page = page + 1
        # 为了降低被封ip的风险,每爬10页便歇5秒。
        # (Sleep 5 s every 10 pages to lower the risk of an IP ban.)
        if page % 10 == 0:
            time.sleep(5)
    except Exception as exc:
        # The original used a bare `except` plus a flag variable `e`;
        # report why we stopped (e.g. "no more pages") and break.
        print("crawl stopped:", exc)
        break


# b站评论区爬取_html — Bilibili comment-section scraper (JSON API version)
#
# 参考视频 (reference video):
# https://www.bilibili.com/video/BV1fu411d7Hy?from=search&seid=3483579157564497530&spm_id_from=333.337.0.0