介绍 浏览知乎时,要不断刷新才能看完一个问题下的全部回答,感觉很麻烦,于是写了一个批量爬取回答的脚本。
注意
输入的链接形式必须是直达问题,而不是某一回答。
同时由于获取下一页回答的链接在上一页中,因此使用单线程递归获取。
在保存数据时,对docx设置的部分格式未能生效,原因未知。
代码块
import html
import json
import logging
import os
import re
import time
from functools import wraps

import docx
import requests
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Inches
# Raise the maximum recursion depth.
import sys

sys.setrecursionlimit(3000)
def retry(exception=Exception, tries=3, delay=1, logger=logging):
    """Build a decorator that calls a function again when it raises.

    :param exception: exception class (or tuple of classes) that triggers
        another attempt
    :param tries: total number of attempts, counting the final unguarded one
    :param delay: seconds to sleep after each failed attempt
    :param logger: object whose ``error`` method receives every caught
        exception; a falsy value turns logging off
    :return: decorator applying this retry policy to the function it wraps
    """
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            # Every attempt except the last is guarded; the last one runs
            # bare so that its exception reaches the caller.
            guarded_left = tries - 1
            while guarded_left > 0:
                try:
                    return f(*args, **kwargs)
                except exception as err:
                    guarded_left -= 1
                    if logger:
                        logger.error(err)
                    time.sleep(delay)
            return f(*args, **kwargs)
        return wrapper
    return decorator
class ZhiHu:
    """Collect the answers of one Zhihu question, page by page.

    The first page is parsed out of the question's HTML; every later page is
    a JSON document whose URL is handed over by the page before it, so pages
    are fetched strictly one after another.

    Instance state:
      * ``url`` - the question URL given by the caller.
      * ``headers`` - HTTP headers sent with every request.
      * ``questions`` - question metadata, filled in by ``get_first``.
      * ``answers`` - parsed answer dicts, appended in fetch order.
      * ``num`` - page number at which to stop; 0 means "until the last page".
    """

    # Seconds to wait on a request before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    # Non-greedy body so text between two figures is not swallowed.
    _FIGURE_RE = re.compile(r'<figure.*?>.+?</figure.*?>')
    _PARAGRAPH_RE = re.compile(r'<p.*?>(.*?)</p.*?>')
    _IMG_SRC_RE = re.compile(r'src="(http.+?)"')

    def __init__(self, url):
        """Remember the question URL and start with empty results.

        :param url: link to the question page itself (not to a single answer)
        """
        self.headers = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0'
        }
        self.url = url
        self.answers = []
        self.questions = {}
        # Defined up front so get_later() also works when called directly.
        self.num = 0

    @staticmethod
    def _parse_answer(author, content):
        """Convert one raw answer into the dict stored in ``answers``.

        :param author: mapping with the author's ``name`` and ``url``
        :param content: answer body as an HTML string
        :return: dict with ``author``, ``url``, ``content`` and ``imgs`` keys
        """
        # Image links are collected from the untouched HTML: searching after
        # the <figure> blocks are removed would miss any image inside them.
        # dict.fromkeys drops duplicates while keeping document order.
        imgs = list(dict.fromkeys(ZhiHu._IMG_SRC_RE.findall(content)))
        without_figures = ZhiHu._FIGURE_RE.sub('', content)
        paragraphs = ZhiHu._PARAGRAPH_RE.findall(without_figures)
        text = '\n'.join(paragraphs).replace('<br/>', '\n')
        return {
            'author': author['name'],
            'url': author['url'].replace('/api/v4', ''),
            # Decode HTML entities (&quot;, &amp;, ...) left in the text.
            'content': html.unescape(text),
            'imgs': imgs,
        }

    def get_first(self):
        """Fetch the question page, storing its metadata and first answers.

        :return: URL of the next page of answers
        :raises IndexError: if the page lacks the paging link or the embedded
            initial-state JSON
        """
        res = requests.get(self.url, headers=self.headers,
                           timeout=self.REQUEST_TIMEOUT)
        # The paging link is embedded with \uXXXX escapes; decode them.
        next_url = re.findall(r'"next":"(http.+?)",', res.text)[0]
        next_url = next_url.encode('utf-8').decode("unicode_escape")
        # The page ships its initial state as JSON inside a <script> tag.
        raw = re.findall(
            r'<script id="js-initialData" type="text/json">(.+?)</script>',
            res.text)[0]
        data = json.loads(raw)['initialState']['entities']
        # Question metadata: the first (only expected) entry is used.
        question = list(data['questions'].values())[0]
        self.questions = {
            'title': question['title'],
            'url': question['url'].replace('/api/v4/questions/', '/question/'),
            'answerCount': question['answerCount'],
            'author': question['author']['name'],
            'authorUrl': question['author']['url'].replace('/api/v4', ''),
        }
        # Answers already present on the first page.
        for item in data['answers'].values():
            self.answers.append(
                self._parse_answer(item['author'], item['content']))
        return next_url

    def _fetch_page(self, url, headers):
        """Download and parse one JSON page of answers.

        Nothing on ``self`` is modified, so a failed attempt can be repeated
        without leaving duplicate answers behind.

        :param url: paging URL of the page to fetch
        :param headers: HTTP headers to send
        :return: ``(answers, paging)`` - the parsed answer dicts and a dict
            with ``is_end``, ``page`` and ``next`` (``None`` when absent)
        """
        res = requests.get(url, headers=headers,
                           timeout=self.REQUEST_TIMEOUT).json()
        paging = res['paging']
        page_info = {
            'is_end': paging['is_end'],
            'page': paging['page'],
            'next': paging.get('next'),
        }
        answers = [
            self._parse_answer(item['target']['author'],
                               item['target']['content'])
            for item in res['data']
        ]
        return answers, page_info

    def get_later(self, url):
        """Fetch pages starting at ``url`` until the end or page ``num``.

        Pages are walked in a loop and each page is retried on its own.
        Recursing under a retry wrapper would make every enclosing call
        refetch its page (and append its answers again) whenever a deeper
        page failed.

        :param url: paging URL of the first page to fetch
        """
        headers = self.headers.copy()
        headers['Referer'] = self.url
        fetch_page = retry()(self._fetch_page)
        while True:
            answers, paging = fetch_page(url, headers)
            self.answers.extend(answers)
            page = paging['page']
            print(f'\r已获取到{page}页。', end='')
            if paging['is_end'] or page == self.num or not paging['next']:
                return
            # Short pause between consecutive requests.
            time.sleep(0.2)
            url = paging['next']

    def main(self):
        """Run the interactive scrape.

        :return: dict with the question metadata under ``question`` and the
            list of parsed answers under ``answer``
        """
        next_url = self.get_first()
        print('\n已获取到第1页。')
        print(f'\n共有{self.questions["answerCount"]}条回答。')
        self.num = int(input('\n想要获取多少页(每页5条回答,输入0获取全部):'))
        # Page 1 is already loaded, so only continue if more was requested.
        if self.num != 1:
            self.get_later(next_url)
        return {'question': self.questions, 'answer': self.answers}
# Add a hyperlink to docx text.
def add_hyperlink(paragraph, url, text, color, underline):
    """Append a clickable hyperlink run to a paragraph.

    :param paragraph: the paragraph object the hyperlink is added to
    :param url: string containing the link target
    :param text: text displayed for the link
    :param color: value written to the run's ``w:color`` element, or ``None``
        to add no color element
    :param underline: when falsy, a ``w:u`` element with value ``none`` is
        added to the run properties
    :return: the created ``w:hyperlink`` element
    """
    # Register the URL as an external relationship on the paragraph's part;
    # the returned id is what the hyperlink element points at.
    rel_id = paragraph.part.relate_to(
        url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    make_element = docx.oxml.shared.OxmlElement
    qualified = docx.oxml.shared.qn

    # Run properties: optional color first, then the underline override.
    run_props = make_element('w:rPr')
    overrides = []
    if color is not None:
        overrides.append(('w:color', color))
    if not underline:
        overrides.append(('w:u', 'none'))
    for tag, value in overrides:
        prop = make_element(tag)
        prop.set(qualified('w:val'), value)
        run_props.append(prop)

    # The run carries the properties and the visible text.
    run = make_element('w:r')
    run.append(run_props)
    run.text = text

    # Wrap the run in the hyperlink element and attach it to the paragraph.
    hyperlink = make_element('w:hyperlink')
    hyperlink.set(qualified('r:id'), rel_id)
    hyperlink.append(run)
    paragraph._p.append(hyperlink)
    return hyperlink
@retry()
def save_docx(data, path='zhihu.docx'):
    """Write the scraped question and its answers to a Word document.

    :param data: dict with a ``question`` mapping (``title``, ``url``,
        ``author``, ``authorUrl``) and an ``answer`` list whose items have
        ``author``, ``url``, ``content`` and ``imgs``
    :param path: where the document is saved; defaults to ``zhihu.docx``
    """
    document = Document()
    normal = document.styles['Normal']
    normal.font.name = u'宋体'
    # The East Asian font is a separate attribute and must be set explicitly.
    normal._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Headings: question title linking to the question, then the asker.
    question = data['question']
    title_1 = document.add_heading()
    title_1.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_2 = document.add_heading(level=2)
    title_2.alignment = WD_ALIGN_PARAGRAPH.CENTER
    add_hyperlink(title_1, question['url'], question['title'], 'eb1515', False)
    add_hyperlink(title_2, question['authorUrl'], question['author'],
                  '856e14', False)

    # One section per answer: author heading, body text, image links.
    for answer in data['answer']:
        heading = document.add_heading(level=2)
        add_hyperlink(heading, answer['url'], answer['author'],
                      '1b8755', False)
        p = document.add_paragraph(answer['content'])
        # Indentation lives on paragraph_format; assigning it on the
        # paragraph object itself only creates an unused attribute and never
        # reaches the document. A negative value is a hanging indent.
        p.paragraph_format.first_line_indent = Inches(-0.25)
        # NOTE(review): this run holds no text, so the bold flag has no
        # visible effect - confirm which text was meant to be bold.
        p_ = p.add_run()
        p_.bold = True
        for num, img in enumerate(answer['imgs'], start=1):
            pic = document.add_paragraph()
            add_hyperlink(pic, img, f'图片{num}', '3a44cf', False)
    document.save(path)
def main():
    """Ask for a question link, scrape it and export the result to docx.

    The scraped data is first dumped to ``zhihu.json``, then read back and
    passed to ``save_docx``. The JSON file is deleted only after the document
    was saved; on any failure the error is printed and the function returns.
    """
    json_path = 'zhihu.json'

    scraper = ZhiHu(input('输入知乎问题链接:'))
    res = scraper.main()
    print()
    total = len(res['answer'])
    print(f"共获取到{total}条回答。")

    with open(json_path, 'w') as dump_file:
        json.dump(res, dump_file)
    print('\n文件已保存为zhihu.json')
    print('\n正在将数据写入zhihu.docx')

    try:
        with open(json_path, 'r') as dump_file:
            data = json.load(dump_file)
        save_docx(data)
        print('保存成功。')
        print('\n正在删除zhihu.json文件')
        os.remove(json_path)
        print('已删除zhihu.json')
    except Exception as err:
        print('保存失败。')
        print(err)
    else:
        # Wait for a key press before exiting after a successful run.
        input()
# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    main()