介绍

浏览知乎时,觉得要不断刷新才能看完一个问题下的回答很麻烦,于是写了一个批量爬取回答的脚本。

注意

输入的链接必须是问题页面本身的链接,而不是某一条回答的链接。

同时由于获取下一页回答的链接在上一页中,因此使用单线程递归获取。

保存数据时,为docx设置的部分格式未能生效,原因未知。



代码块

import html
import json
import logging
import os
import re
import time
from functools import wraps

import docx
import requests
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Inches

# 更改最大递归次数

import sys

# Module-level side effect: raise the interpreter's recursion limit to 3000
# frames for the whole process.
sys.setrecursionlimit(3000)

def retry(exception=Exception, tries=3, delay=1, logger=logging):
    """Build a decorator that re-runs a function when it raises.

    :param exception: Exception type (or tuple of types) that triggers a retry;
        anything else propagates immediately.
    :param tries: Total number of attempts, including the first one.
    :param delay: Seconds to sleep after each failed attempt.
    :param logger: Object with an ``error`` method used to report each failure;
        pass a falsy value to disable logging.
    :return: A decorator wrapping the target function with the retry logic.
    """
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            # The first ``tries - 1`` attempts swallow matching exceptions.
            for _ in range(tries - 1):
                try:
                    return f(*args, **kwargs)
                except exception as e:
                    if logger:
                        logger.error(e)
                    time.sleep(delay)
            # Last attempt runs unguarded so the caller sees the real error.
            return f(*args, **kwargs)
        return wrapper
    return decorator

class ZhiHu:
    """Collect the question metadata and answers of one Zhihu question page.

    The first page of answers is embedded in the question's HTML; every later
    page is a JSON document whose address is given by the previous page.
    """

    # Seconds to wait for a single HTTP response before giving up, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Store the question URL and prepare empty result containers.

        :param url: Address of the question page (not of a single answer).
        """
        self.headers = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0'
        }
        self.url = url
        self.answers = []
        self.questions = {}
        # Page number at which get_later stops; 0 never equals a page number,
        # so it means "fetch everything". main() overwrites it.
        self.num = 0

    @staticmethod
    def _parse_content(content):
        """Split an answer's HTML into plain text and de-duplicated image URLs.

        :param content: Raw HTML of one answer.
        :return: Tuple ``(text, imgs)``.
        """
        # Image links are collected from the untouched HTML: searching after
        # the figures are stripped would miss every image inside a <figure>.
        # dict.fromkeys removes duplicates while keeping document order.
        found = re.findall(r'src="(http.+?)"', content)
        imgs = list(dict.fromkeys(html.unescape(link) for link in found))

        # Non-greedy so each figure is removed on its own; a greedy match
        # would also delete all text between the first and the last figure.
        stripped = re.sub(r'<figure.*?>.+?</figure.*?>', '', content, flags=re.S)

        paragraphs = re.findall(r'<p.*?>(.*?)</p.*?>', stripped)
        text = '\n'.join(paragraphs).replace('<br/>', '\n')
        # Turn HTML entities such as &quot; back into plain characters.
        return html.unescape(text), imgs

    def get_first(self):
        """Fetch the question page and record its metadata and first answers.

        Fills ``self.questions`` and appends the answers embedded in the page
        to ``self.answers``.

        :return: URL of the next page of answers.
        :raises ValueError: If the page lacks the embedded data or next link.
        """
        res = requests.get(self.url, headers=self.headers,
                           timeout=self.REQUEST_TIMEOUT)

        next_links = re.findall(r'"next":"(http.+?)",', res.text)
        payloads = re.findall(
            r'<script id="js-initialData" type="text/json">(.+?)</script>',
            res.text)
        if not next_links or not payloads:
            raise ValueError(
                'question data not found in the page; the URL must point to '
                'the question itself, not to a single answer')

        # The link is embedded in JSON, so \uXXXX escapes must be decoded.
        next_url = next_links[0].encode('utf-8').decode("unicode_escape")
        data = json.loads(payloads[0])['initialState']['entities']

        # Question metadata; API-style paths are mapped to browser URLs.
        question = next(iter(data['questions'].values()))
        self.questions = {
            'title': question['title'],
            'url': question['url'].replace('/api/v4/questions/', '/question/'),
            'answerCount': question['answerCount'],
            'author': question['author']['name'],
            'authorUrl': question['author']['url'].replace('/api/v4', ''),
        }

        # Answers embedded in the first page.
        for ans in data['answers'].values():
            text, imgs = self._parse_content(ans['content'])
            self.answers.append({
                'author': ans['author']['name'],
                'url': ans['author']['url'].replace('/api/v4', ''),
                'content': text,
                'imgs': imgs,
            })

        return next_url

    @retry()
    def _fetch_page(self, url):
        """Download and parse one JSON page of answers.

        Nothing is stored on ``self`` here, so a retried attempt can never
        leave duplicated answers behind.

        :param url: Address of the page to fetch.
        :return: Tuple ``(answers, is_end, next_url, page)``.
        """
        headers = self.headers.copy()
        headers['Referer'] = self.url
        res = requests.get(url, headers=headers,
                           timeout=self.REQUEST_TIMEOUT).json()

        paging = res['paging']
        answers = []
        for item in res['data']:
            target = item['target']
            text, imgs = self._parse_content(target['content'])
            answers.append({
                'author': target['author']['name'],
                'url': target['author']['url'].replace('/api/v4', ''),
                'content': text,
                'imgs': imgs,
            })
        return answers, paging['is_end'], paging['next'], paging['page']

    def get_later(self, url):
        """Fetch the pages after the first one, starting at ``url``.

        Pages are followed one after another until the API reports the end or
        the page number equals ``self.num``. A loop is used instead of
        recursion so the crawl depth is not bounded by the recursion limit and
        a failing page is retried without re-fetching earlier pages.

        :param url: Address of the first page to fetch.
        """
        while True:
            answers, is_end, next_url, page = self._fetch_page(url)
            self.answers.extend(answers)
            print(f'\r已获取到{page}页。', end='')
            if is_end or page == self.num:
                return

            # Short pause between requests.
            time.sleep(0.2)
            url = next_url

    def main(self):
        """Run the interactive crawl and return everything collected.

        :return: Dict with the question metadata under ``'question'`` and the
            list of answers under ``'answer'``.
        """
        next_url = self.get_first()
        print('\n已获取到第1页。')
        print(f'\n共有{self.questions["answerCount"]}条回答。')
        self.num = int(input('\n想要获取多少页(每页5条回答,输入0获取全部):'))
        # Page 1 is already stored; anything else needs further requests.
        if self.num != 1:
            self.get_later(next_url)

        return {'question': self.questions, 'answer': self.answers}

# docx文本加超链接

def add_hyperlink(paragraph, url, text, color, underline):
    """Append an external hyperlink to a python-docx paragraph.

    :param paragraph: The paragraph we are adding the hyperlink to.
    :param url: A string containing the required url.
    :param text: The text displayed for the url.
    :param color: Value written to the run's ``w:color`` element (callers in
        this file pass hex strings such as ``'eb1515'``); ``None`` writes no
        colour element.
    :param underline: When falsy, a ``w:u`` element with value ``none`` is
        written; when truthy no underline element is written at all.
    :return: The created ``w:hyperlink`` element.
    """
    # Register the target as an external relationship of the document part
    # and get the relationship id the hyperlink element must reference.
    part = paragraph.part
    r_id = part.relate_to(
        url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    # Create the w:hyperlink tag and point it at the relationship.
    hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
    hyperlink.set(docx.oxml.shared.qn('r:id'), r_id)

    # Create the run (w:r) and its properties element (w:rPr).
    new_run = docx.oxml.shared.OxmlElement('w:r')
    rPr = docx.oxml.shared.OxmlElement('w:rPr')

    # Add color if it is given.
    if color is not None:
        c = docx.oxml.shared.OxmlElement('w:color')
        c.set(docx.oxml.shared.qn('w:val'), color)
        rPr.append(c)

    # Remove underlining if it is requested.
    if not underline:
        u = docx.oxml.shared.OxmlElement('w:u')
        u.set(docx.oxml.shared.qn('w:val'), 'none')
        rPr.append(u)

    # Join the xml elements together and add the text to the w:r element.
    new_run.append(rPr)
    new_run.text = text
    hyperlink.append(new_run)

    paragraph._p.append(hyperlink)

    return hyperlink

@retry()
def save_docx(data, path='zhihu.docx'):
    """Write the crawled question and its answers into a Word document.

    :param data: Dict with a ``'question'`` entry (keys ``title``, ``author``,
        ``url``, ``authorUrl``) and an ``'answer'`` list whose items have the
        keys ``author``, ``url``, ``content`` and ``imgs``.
    :param path: Output file name; defaults to ``'zhihu.docx'``.
    """
    document = Document()
    # The East Asian font has to be set separately on the style's rFonts
    # element in addition to the regular font name.
    document.styles['Normal'].font.name = u'宋体'
    document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Question info: a centred title linking to the question, and a centred
    # sub-heading linking to the asker.
    questions = data['question']

    title_1 = document.add_heading()
    title_1.alignment = WD_ALIGN_PARAGRAPH.CENTER

    title_2 = document.add_heading(level=2)
    title_2.alignment = WD_ALIGN_PARAGRAPH.CENTER

    add_hyperlink(title_1, questions['url'], questions['title'], 'eb1515', False)
    add_hyperlink(title_2, questions['authorUrl'], questions['author'], '856e14', False)

    # One level-2 heading per answer, followed by its text and image links.
    for answer in data['answer']:
        title_ = document.add_heading(level=2)
        add_hyperlink(title_, answer['url'], answer['author'], '1b8755', False)

        p = document.add_paragraph(answer['content'])
        # Indentation lives on paragraph_format; assigning it directly on the
        # paragraph only creates an unused Python attribute and has no effect.
        p.paragraph_format.first_line_indent = Inches(-0.25)
        # NOTE(review): this run is empty, so its bold flag changes nothing
        # visible - confirm whether the answer text was meant to be bold.
        p.add_run().bold = True

        for num, img in enumerate(answer['imgs']):
            pic = document.add_paragraph()
            add_hyperlink(pic, img, f'图片{num + 1}', '3a44cf', False)

    document.save(path)

def main():
    """Ask for a question URL, crawl its answers and save them as a docx.

    The result is first dumped to ``zhihu.json``; that file is removed only
    after the Word document has been written, so it is left behind when the
    conversion fails.
    """
    url = input('输入知乎问题链接:')
    zhihu = ZhiHu(url)

    res = zhihu.main()
    print()
    print(f"共获取到{len(res['answer'])}条回答。")

    # Explicit UTF-8 keeps the file independent of the platform's default
    # encoding; ensure_ascii=False leaves the Chinese text readable.
    with open('zhihu.json', 'w', encoding='utf-8') as f:
        json.dump(res, f, ensure_ascii=False)
    print('\n文件已保存为zhihu.json')

    print('\n正在将数据写入zhihu.docx')
    try:
        with open('zhihu.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        save_docx(data)
        print('保存成功。')
        print('\n正在删除zhihu.json文件')
        os.remove('zhihu.json')
        print('已删除zhihu.json')

    except Exception as e:
        # Top-level boundary: report the failure and stop.
        print('保存失败。')
        print(e)
        return

    # Wait for Enter before the program exits.
    input()

# Run the interactive crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()