以下内容主要实现爬取疫情专题热门文章的评论过程
1、需求分析
热门文章对应的评论字段:评论用户id,评论用户名,评论用户地址,评论用户性别,用户评论,评论时间,文章id
其中用户性别以及用户地址需要在用户详情界面才能获取,而其他的字段则是在文章详情界面获取。爬取热门文章下的前100条热门评论,之后用作情感分析使用。
2、具体实现过程
注:在实现的过程中出现了挺多的报错而中断了爬虫过程,在不断的改进下,报错率下降了不少。
2.1、模块的导入
import csv
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
import json
import re
from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
2.2、一些参数的设置
使用prox进行代理操作,貌似解决了requests.exceptions.SSLError: HTTPSConnectionPool(host='xxxx', port=xxx): Max retries exceeded with url: xxxx (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:847)'),))的报错问题
# Request headers: a desktop-Chrome User-Agent plus the logged-in Weibo session
# cookie (replace the SUB value with your own cookie before running).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
    "cookie": 'SUB=自己的cookie',
}
# Proxy address applied to every request; an empty string means direct connection.
prox = ''
2.3、获取评论的主函数
获取评论的url为https://weibo.com/aj/v6/comment/big,当进入文章详情界面的时候,该url携带的参数和之后携带的参数不一样,同时获取第二,三页采用的是ajax的方式获取,而之后的页数是通过点击加载更多的形式获取
# Fetch a hot post's detail page and page through its first-level comments.
def get_weibo_detail(start, end):
    """Collect up to ~100 first-level comments for each post id in contents.csv[start:end].

    start, end: slice bounds into the first column of contents.csv
        (row 0 is the header, so callers typically pass start >= 1).
    Side effects: get_comments() appends every scraped comment to comments3.csv.
    """
    with open('./contents.csv', mode='r', newline='', encoding='utf-8-sig') as f:
        # The first CSV column holds the post ids (the header row is included too).
        contents_id = [row[0] for row in csv.reader(f)]
    for content_id in contents_id[start:end]:
        # Parameters for the first request of a post's comment stream.
        params = {
            "ajwvr": "6",
            "id": content_id,
            "from": "singleWeiBo",
            "__rnd": int(time.time() * 1000),
        }
        current_count = 0  # number of comments collected so far for this post
        # verify=False skips SSL verification (works around SSLEOFError seen with the proxy).
        res = requests.get(url='https://weibo.com/aj/v6/comment/big', params=params,
                           proxies={'http': prox, 'https': prox}, headers=headers, verify=False)
        soup = BeautifulSoup(json.loads(res.text)["data"]["html"], "html.parser")
        current_count = get_comments(soup, content_id, current_count)
        # Only ~100 first-level comments per post are needed for the sentiment analysis.
        while current_count <= 100:
            print("当前count的数量为:", current_count)
            more_soup = get_more_comments_link(soup)  # 0 means "no further page available"
            if more_soup == 0:
                break
            comments_count = current_count
            # Retry once on failure: a transient request/parse error inside get_comments
            # should not abort the whole crawl.
            for attempt in range(2):
                try:
                    comments_count = get_comments(more_soup, content_id, current_count)
                    break
                except Exception:
                    if attempt == 1:
                        print("获取失败")
            if comments_count == current_count:
                # No new comments came back (e.g. the post has fewer than 100) —
                # stop instead of looping forever re-fetching the same empty page.
                break
            current_count = comments_count
            soup = more_soup
            time.sleep(2)  # throttle paging requests
        time.sleep(2)  # pause between posts
2.4、获取评论详情
建议在看代码的时候可以自己登录微博,打开控制台中的elements,可审核每个标签元素
# Matches any HTML tag; used to strip residual markup from the comment text.
_TAG_RE = re.compile(r'<[^>]+>', re.S)


def _fetch_user_profile(user_id):
    """Return (gender, address) scraped from the user's profile page.

    Gender and address live inside <script>FM.view({...})</script> payloads:
    each script tag is stripped of its FM.view wrapper, parsed as JSON, and the
    "html" field searched. Unknown values come back as "其他".
    """
    user_gender = "其他"
    user_address = "其他"
    res = requests.get(url="https://weibo.com/u/%s" % user_id, headers=headers,
                       proxies={'http': prox, 'https': prox}, verify=False)
    for script in BeautifulSoup(res.text, "html.parser").find_all("script"):
        payload = re.sub(r'<script>FM\.view\(', '', str(script))
        payload = re.sub(r'\)</script>', '', payload)
        # Only wrappers starting with {"ns" carry the JSON object we want.
        if payload.find('{"ns"') != 0:
            continue
        payload = json.loads(payload)
        if "html" not in payload:
            continue
        profile_soup = BeautifulSoup(payload["html"], "html.parser")
        # Gender is encoded as a male/female icon, not as text.
        if profile_soup.find("i", class_="W_icon icon_pf_male") is not None:
            user_gender = "男"
        if profile_soup.find("i", class_="W_icon icon_pf_female") is not None:
            user_gender = "女"
        for item in profile_soup.find_all("li", class_="item S_line2 clearfix"):
            icon = item.find("em", class_="W_ficon ficon_cd_place S_ficon")
            # '2' is the glyph of the "place" icon, so this <li> holds the address.
            if icon is not None and icon.get_text() == '2':
                user_address = item.find("span", class_="item_text W_fl").get_text().strip()
                break
    return user_gender, user_address


# Extract the details of every first-level comment on one page.
def get_comments(soup, content_id, current_count):
    """Parse all first-level comments in *soup* and append them to comments3.csv.

    soup: parsed HTML of one comment page.
    content_id: id of the post the comments belong to.
    current_count: running total of comments already collected; the updated
        total is returned so the caller can detect empty pages.
    """
    comment_soups = soup.find_all("div", attrs={"node-type": "root_comment"})
    comments_data = []  # rows: [user_id, name, address, gender, comment, time, post id]
    for comment_soup in comment_soups:
        info = comment_soup.find("div", class_="WB_text")
        user_id = info.find("a")["usercard"][3:]  # strip the leading "id="
        user_gender, user_address = _fetch_user_profile(user_id)
        # User name and comment text are separated by a full-width colon. Split only
        # on the FIRST one so colons inside the comment text are preserved, and
        # tolerate comments without any colon instead of raising IndexError.
        parts = _TAG_RE.sub('', info.text).strip().split(":", 1)
        user_name = parts[0]
        comment = parts[1] if len(parts) > 1 else ""
        # The page shows only month/day, so the year is prepended explicitly.
        comment_time = "2020年" + comment_soup.find("div", class_="WB_from S_txt2").get_text()
        data = [user_id, user_name, user_address, user_gender, comment, comment_time, content_id]
        print(data)
        current_count += 1
        comments_data.append(data)
        time.sleep(1)  # throttle the per-user profile requests
    print("总共的条数:", len(comments_data))
    # Header row is intentionally not written: the file accumulates across runs.
    with open("comments3.csv", 'a', newline='', encoding='utf-8-sig') as t:
        csv.writer(t).writerows(comments_data)
    return current_count
2.5、获取异步刷新或者点击后的评论
因为我选取的评论数为500以上的文章,因此异步刷新操作获取的评论有两次,其余的是点击后获取
# Fetch the next page of comments (async-refresh pages 2-3, click-to-load after).
def get_more_comments_link(soup):
    """Request the next comment page advertised in *soup*; return its BeautifulSoup, or 0.

    The current page links to the next one either via an async-loading <div>
    (node-type="comment_loading") or a "click for more" <a>; both carry the
    request parameters in their action-data attribute. Returns 0 when no next
    page exists or the request/parse fails — the caller treats 0 as "stop".
    """
    loading = soup.find("div", attrs={"node-type": "comment_loading"})
    if loading is not None:
        more_comment_url = loading["action-data"]  # async-refresh style link
    else:
        click_more = soup.find("a", attrs={"action-type": "click_more_comment"})
        if click_more is None:
            return 0
        more_comment_url = click_more["action-data"]  # click-to-load style link
    # action-data is a query string ("k=v&k=v..."); split each pair on the FIRST
    # '=' only, because values (e.g. base64-encoded ids) may themselves contain '='.
    params_infos = dict(item.split('=', 1) for item in more_comment_url.split('&'))
    params = {
        "ajwvr": "6",
        "id": params_infos["id"],
        "root_comment_max_id": params_infos["root_comment_max_id"],
        "root_comment_max_id_type": "0",
        "root_comment_ext_param": "",
        "page": params_infos["page"],
        "filter": "hot",
        "sum_comment_number": params_infos["sum_comment_number"],
        "filter_tips_before": "1",
        "from": "singleWeiBo",
        "__rnd": int(time.time() * 1000),
    }
    more_res = requests.get(url='https://weibo.com/aj/v6/comment/big', params=params,
                            proxies={'http': prox, 'https': prox}, headers=headers, verify=False)
    if more_res:  # Response truthiness: status code < 400
        try:
            html = json.loads(more_res.text)["data"]["html"]
            return BeautifulSoup(html, "html.parser")
        except (KeyError, ValueError):  # missing "data"/"html" or malformed JSON
            print("获取more_res失败")
    return 0
3、完整代码实现过程
import csv
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
import json
import re
from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
# Request headers: a desktop-Chrome User-Agent plus the logged-in Weibo session
# cookie (replace the SUB value with your own cookie before running).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
    "cookie": 'SUB=自己的cookie',
}
# Proxy address applied to every request; an empty string means direct connection.
prox = ''
# Fetch a hot post's detail page and page through its first-level comments.
def get_weibo_detail(start, end):
    """Collect up to ~100 first-level comments for each post id in contents.csv[start:end].

    start, end: slice bounds into the first column of contents.csv
        (row 0 is the header, so callers typically pass start >= 1).
    Side effects: get_comments() appends every scraped comment to comments3.csv.
    """
    with open('./contents.csv', mode='r', newline='', encoding='utf-8-sig') as f:
        # The first CSV column holds the post ids (the header row is included too).
        contents_id = [row[0] for row in csv.reader(f)]
    for content_id in contents_id[start:end]:
        # Parameters for the first request of a post's comment stream.
        params = {
            "ajwvr": "6",
            "id": content_id,
            "from": "singleWeiBo",
            "__rnd": int(time.time() * 1000),
        }
        current_count = 0  # number of comments collected so far for this post
        # verify=False skips SSL verification (works around SSLEOFError seen with the proxy).
        res = requests.get(url='https://weibo.com/aj/v6/comment/big', params=params,
                           proxies={'http': prox, 'https': prox}, headers=headers, verify=False)
        soup = BeautifulSoup(json.loads(res.text)["data"]["html"], "html.parser")
        current_count = get_comments(soup, content_id, current_count)
        # Only ~100 first-level comments per post are needed for the sentiment analysis.
        while current_count <= 100:
            print("当前count的数量为:", current_count)
            more_soup = get_more_comments_link(soup)  # 0 means "no further page available"
            if more_soup == 0:
                break
            comments_count = current_count
            # Retry once on failure: a transient request/parse error inside get_comments
            # should not abort the whole crawl.
            for attempt in range(2):
                try:
                    comments_count = get_comments(more_soup, content_id, current_count)
                    break
                except Exception:
                    if attempt == 1:
                        print("获取失败")
            if comments_count == current_count:
                # No new comments came back (e.g. the post has fewer than 100) —
                # stop instead of looping forever re-fetching the same empty page.
                break
            current_count = comments_count
            soup = more_soup
            time.sleep(2)  # throttle paging requests
        time.sleep(2)  # pause between posts
# Matches any HTML tag; used to strip residual markup from the comment text.
_TAG_RE = re.compile(r'<[^>]+>', re.S)


def _fetch_user_profile(user_id):
    """Return (gender, address) scraped from the user's profile page.

    Gender and address live inside <script>FM.view({...})</script> payloads:
    each script tag is stripped of its FM.view wrapper, parsed as JSON, and the
    "html" field searched. Unknown values come back as "其他".
    """
    user_gender = "其他"
    user_address = "其他"
    res = requests.get(url="https://weibo.com/u/%s" % user_id, headers=headers,
                       proxies={'http': prox, 'https': prox}, verify=False)
    for script in BeautifulSoup(res.text, "html.parser").find_all("script"):
        payload = re.sub(r'<script>FM\.view\(', '', str(script))
        payload = re.sub(r'\)</script>', '', payload)
        # Only wrappers starting with {"ns" carry the JSON object we want.
        if payload.find('{"ns"') != 0:
            continue
        payload = json.loads(payload)
        if "html" not in payload:
            continue
        profile_soup = BeautifulSoup(payload["html"], "html.parser")
        # Gender is encoded as a male/female icon, not as text.
        if profile_soup.find("i", class_="W_icon icon_pf_male") is not None:
            user_gender = "男"
        if profile_soup.find("i", class_="W_icon icon_pf_female") is not None:
            user_gender = "女"
        for item in profile_soup.find_all("li", class_="item S_line2 clearfix"):
            icon = item.find("em", class_="W_ficon ficon_cd_place S_ficon")
            # '2' is the glyph of the "place" icon, so this <li> holds the address.
            if icon is not None and icon.get_text() == '2':
                user_address = item.find("span", class_="item_text W_fl").get_text().strip()
                break
    return user_gender, user_address


# Extract the details of every first-level comment on one page.
def get_comments(soup, content_id, current_count):
    """Parse all first-level comments in *soup* and append them to comments3.csv.

    soup: parsed HTML of one comment page.
    content_id: id of the post the comments belong to.
    current_count: running total of comments already collected; the updated
        total is returned so the caller can detect empty pages.
    """
    comment_soups = soup.find_all("div", attrs={"node-type": "root_comment"})
    comments_data = []  # rows: [user_id, name, address, gender, comment, time, post id]
    for comment_soup in comment_soups:
        info = comment_soup.find("div", class_="WB_text")
        user_id = info.find("a")["usercard"][3:]  # strip the leading "id="
        user_gender, user_address = _fetch_user_profile(user_id)
        # User name and comment text are separated by a full-width colon. Split only
        # on the FIRST one so colons inside the comment text are preserved, and
        # tolerate comments without any colon instead of raising IndexError.
        parts = _TAG_RE.sub('', info.text).strip().split(":", 1)
        user_name = parts[0]
        comment = parts[1] if len(parts) > 1 else ""
        # The page shows only month/day, so the year is prepended explicitly.
        comment_time = "2020年" + comment_soup.find("div", class_="WB_from S_txt2").get_text()
        data = [user_id, user_name, user_address, user_gender, comment, comment_time, content_id]
        print(data)
        current_count += 1
        comments_data.append(data)
        time.sleep(1)  # throttle the per-user profile requests
    print("总共的条数:", len(comments_data))
    # Header row is intentionally not written: the file accumulates across runs.
    with open("comments3.csv", 'a', newline='', encoding='utf-8-sig') as t:
        csv.writer(t).writerows(comments_data)
    return current_count
# Fetch the next page of comments (async-refresh pages 2-3, click-to-load after).
def get_more_comments_link(soup):
    """Request the next comment page advertised in *soup*; return its BeautifulSoup, or 0.

    The current page links to the next one either via an async-loading <div>
    (node-type="comment_loading") or a "click for more" <a>; both carry the
    request parameters in their action-data attribute. Returns 0 when no next
    page exists or the request/parse fails — the caller treats 0 as "stop".
    """
    loading = soup.find("div", attrs={"node-type": "comment_loading"})
    if loading is not None:
        more_comment_url = loading["action-data"]  # async-refresh style link
    else:
        click_more = soup.find("a", attrs={"action-type": "click_more_comment"})
        if click_more is None:
            return 0
        more_comment_url = click_more["action-data"]  # click-to-load style link
    # action-data is a query string ("k=v&k=v..."); split each pair on the FIRST
    # '=' only, because values (e.g. base64-encoded ids) may themselves contain '='.
    params_infos = dict(item.split('=', 1) for item in more_comment_url.split('&'))
    params = {
        "ajwvr": "6",
        "id": params_infos["id"],
        "root_comment_max_id": params_infos["root_comment_max_id"],
        "root_comment_max_id_type": "0",
        "root_comment_ext_param": "",
        "page": params_infos["page"],
        "filter": "hot",
        "sum_comment_number": params_infos["sum_comment_number"],
        "filter_tips_before": "1",
        "from": "singleWeiBo",
        "__rnd": int(time.time() * 1000),
    }
    more_res = requests.get(url='https://weibo.com/aj/v6/comment/big', params=params,
                            proxies={'http': prox, 'https': prox}, headers=headers, verify=False)
    if more_res:  # Response truthiness: status code < 400
        try:
            html = json.loads(more_res.text)["data"]["html"]
            return BeautifulSoup(html, "html.parser")
        except (KeyError, ValueError):  # missing "data"/"html" or malformed JSON
            print("获取more_res失败")
    return 0
# Entry point: guard the crawl so importing this module does not trigger it.
if __name__ == "__main__":
    start = 1   # skip row 0, the CSV header
    end = 100   # crawl posts 1..99 from contents.csv
    get_weibo_detail(start, end)