知乎问答下载为docx

原创

mb6458c7c141f47 2023-05-08 18:17:32 ©著作权

文章标签 perl json xml 文章分类 办公效率

©著作权归作者所有：来自51CTO博客作者mb6458c7c141f47的原创作品，请联系作者获取转载授权，否则将追究法律责任

介绍

浏览知乎时，感觉一个问题的回答需要不断刷新才能看完，很麻烦，于是写了一个批量爬取回答的脚本。

注意

输入的链接必须是问题页面本身的链接，而不是某一条回答的链接。

同时，由于下一页回答的链接包含在上一页的返回数据中，因此使用单线程递归逐页获取。

在保存数据的时候，docx 中设置的一些格式未能生效，原因未知。

代码块

import html
import json
import logging
import os
import re
import time
from functools import wraps

import docx
import requests
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Inches

# Raise the interpreter's maximum recursion depth to at least 3000.
import sys

# max() keeps a limit that is already higher than 3000 instead of lowering it.
sys.setrecursionlimit(max(sys.getrecursionlimit(), 3000))

def retry(exception=Exception, tries=3, delay=1, logger=logging):
    """Build a decorator that re-runs a function when it raises.

    The wrapped function is attempted at most ``tries`` times. Every failed
    attempt except the last one is reported through ``logger.error`` (when a
    logger is given) and followed by a pause of ``delay`` seconds. The
    exception raised by the last attempt propagates to the caller.

    :param exception: exception class (or tuple of classes) that triggers a retry
    :param tries: total number of attempts; values below 2 mean a single attempt
    :param delay: seconds to sleep after a failed attempt
    :param logger: object with an ``error`` method, or a falsy value to disable logging
    :return: a decorator applying this retry policy
    """
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            # The first tries-1 attempts swallow the exception and pause.
            for _ in range(tries - 1):
                try:
                    return f(*args, **kwargs)
                except exception as e:
                    if logger:
                        logger.error(e)
                    time.sleep(delay)
            # Last attempt: any exception is left to propagate.
            return f(*args, **kwargs)
        return wrapper
    return decorator

class ZhiHu:
    """Crawler that collects the answers of one Zhihu question.

    The first page is parsed out of the question's HTML page; later pages are
    requested from the URL that each page names as its successor. Parsed
    answers accumulate in ``self.answers`` and the question's metadata is
    stored in ``self.questions``.

    Relies on the module-level ``requests`` import and ``retry`` decorator.

    :param url: link to the question page itself (not to a single answer)
    """

    def __init__(self, url):
        self.headers = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0',
        }
        self.url = url
        self.answers = []
        self.questions = {}
        # Page limit entered in main(); 0 means "no limit", as its prompt says.
        self.num = 0

    @staticmethod
    def _parse_content(content):
        """Split an answer's HTML into plain text and a list of image URLs.

        :param content: HTML string of one answer
        :return: ``(text, imgs)``; ``text`` joins the contents of the ``<p>``
            elements with newlines, ``imgs`` holds the distinct ``src`` URLs
            in order of first appearance
        """
        # Collect image URLs first: stripping the <figure> blocks below would
        # otherwise discard the images they contain.
        imgs = list(dict.fromkeys(re.findall(r'src="(http.+?)"', content)))

        # Non-greedy, so each figure is removed on its own and the text
        # between two figures is kept.
        stripped = re.sub(r'<figure.*?>.+?</figure.*?>', '', content)

        paragraphs = re.findall(r'<p.*?>(.*?)</p.*?>', stripped)
        text = '\n'.join(paragraphs).replace('<br/>', '\n')
        # Decode HTML entities such as &quot; into the characters they stand for.
        return html.unescape(text), imgs

    def _add_answer(self, answer):
        """Parse one answer record and append it to ``self.answers``.

        :param answer: dict with an ``author`` dict (``name``, ``url``) and
            an HTML ``content`` string
        """
        author = answer['author']
        text, imgs = self._parse_content(answer['content'])
        self.answers.append({
            'author': author['name'],
            'url': author['url'].replace('/api/v4', ''),
            'content': text,
            'imgs': imgs,
        })

    def get_first(self):
        """Fetch the question page and parse the question and first answers.

        Fills ``self.questions`` and appends the first page's answers to
        ``self.answers``.

        :return: URL of the next page of answers, or ``None`` when the page
            contains no such link
        :raises IndexError: if the embedded ``js-initialData`` script is missing
        """
        res = requests.get(self.url, headers=self.headers)

        # Link to the next page. It is embedded in JSON text, so backslash
        # escapes have to be decoded.
        next_links = re.findall(r'"next":"(http.+?)",', res.text)
        next_url = (
            next_links[0].encode('utf-8').decode('unicode_escape')
            if next_links else None
        )

        # Initial state that the page ships inside a <script> element.
        raw = re.findall(
            r'<script id="js-initialData" type="text/json">(.+?)</script>',
            res.text,
        )[0]
        entities = json.loads(raw)['initialState']['entities']

        # Question metadata; API-style URLs are rewritten to page URLs.
        question = next(iter(entities['questions'].values()))
        self.questions = {
            'title': question['title'],
            'url': question['url'].replace('/api/v4/questions/', '/question/'),
            'answerCount': question['answerCount'],
            'author': question['author']['name'],
            'authorUrl': question['author']['url'].replace('/api/v4', ''),
        }

        # Answers of the first page.
        for answer in entities['answers'].values():
            self._add_answer(answer)

        return next_url

    def get_later(self, url):
        """Fetch the following pages of answers, starting at ``url``.

        Pages are followed one after another until the response reports the
        end or the page number equals ``self.num``. Each page's answers are
        appended to ``self.answers``.

        :param url: URL of the first page to fetch; a falsy value fetches nothing
        """
        headers = self.headers.copy()
        headers['Referer'] = self.url

        # Only the HTTP request is retried. Retrying the whole traversal would
        # re-append the answers of pages that were already processed.
        @retry()
        def fetch(page_url):
            return requests.get(page_url, headers=headers).json()

        while url:
            res = fetch(url)
            paging = res['paging']
            page = paging['page']

            for item in res['data']:
                self._add_answer(item['target'])

            print(f'\r已获取到{page}页。', end='')
            if paging['is_end'] or page == self.num:
                return

            # Short pause between consecutive requests.
            time.sleep(0.2)
            url = paging['next']

    def main(self):
        """Run the crawl interactively.

        Fetches the first page, asks on stdin how many pages to fetch and
        follows the remaining pages unless exactly one page was requested.

        :return: ``{'question': <question metadata>, 'answer': <list of answers>}``
        :raises ValueError: if the entered page count is not an integer
        """
        next_url = self.get_first()
        print('\n已获取到第1页。')
        print(f'\n共有{self.questions["answerCount"]}条回答。')
        self.num = int(input('\n想要获取多少页(每页5条回答，输入0获取全部)：'))
        if self.num != 1:
            self.get_later(next_url)

        return {'question': self.questions, 'answer': self.answers}

# Add a hyperlink to text in a docx paragraph.
def add_hyperlink(paragraph, url, text, color, underline):
    """
    A function that places a hyperlink within a paragraph object.

    :param paragraph: The paragraph we are adding the hyperlink to.
    :param url: A string containing the required url
    :param text: The text displayed for the url
    :param color: Value written to the run's ``w:color`` element
        (e.g. ``'eb1515'``), or None to add no colour element
    :param underline: When falsy, a ``w:u`` element with value ``none`` is
        added to the run; when truthy, no underline element is added
    :return: The hyperlink object
    """

    # This gets access to the document.xml.rels file and gets a new relation id value
    part = paragraph.part
    r_id = part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    # Create the w:hyperlink tag and add needed values
    hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
    hyperlink.set(docx.oxml.shared.qn('r:id'), r_id)

    # Create a w:r element
    new_run = docx.oxml.shared.OxmlElement('w:r')

    # Create a new w:rPr element
    rPr = docx.oxml.shared.OxmlElement('w:rPr')

    # Add color if it is given
    if color is not None:
        c = docx.oxml.shared.OxmlElement('w:color')
        c.set(docx.oxml.shared.qn('w:val'), color)
        rPr.append(c)

    # Remove underlining if it is requested
    if not underline:
        u = docx.oxml.shared.OxmlElement('w:u')
        u.set(docx.oxml.shared.qn('w:val'), 'none')
        rPr.append(u)

    # Join all the xml elements together and add the required text to the w:r element
    new_run.append(rPr)
    new_run.text = text
    hyperlink.append(new_run)

    paragraph._p.append(hyperlink)

    return hyperlink

@retry()
def save_docx(data):
    """Write the crawled question and its answers to ``zhihu.docx``.

    The document starts with two centred headings that link to the question
    and to its author. Each answer follows as a level-2 heading linking to
    the answer's author, a paragraph with the answer text, and one link
    paragraph per image URL.

    :param data: dict with a ``question`` dict (``title``, ``url``,
        ``author``, ``authorUrl``) and an ``answer`` list of dicts
        (``author``, ``url``, ``content``, ``imgs``)
    """
    document = Document()
    # The East Asian font is set separately on the style's rFonts element,
    # in addition to font.name.
    normal_style = document.styles['Normal']
    normal_style.font.name = u'宋体'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Question info: title and author become centred, linked headings.
    question = data['question']

    title_1 = document.add_heading()
    title_1.alignment = WD_ALIGN_PARAGRAPH.CENTER

    title_2 = document.add_heading(level=2)
    title_2.alignment = WD_ALIGN_PARAGRAPH.CENTER

    add_hyperlink(title_1, question['url'], question['title'], 'eb1515', False)
    add_hyperlink(title_2, question['authorUrl'], question['author'], '856e14', False)

    # Write the answers.
    for answer in data['answer']:
        heading = document.add_heading(level=2)
        add_hyperlink(heading, answer['url'], answer['author'], '1b8755', False)

        body = document.add_paragraph(answer['content'])
        # Indentation is a property of paragraph_format. Assigning it on the
        # Paragraph object only creates an unused attribute, so the setting
        # never reached the document.
        body.paragraph_format.first_line_indent = Inches(-0.25)
        # NOTE(review): this bolds a new, empty run, so no visible text is
        # affected; confirm what was meant to be bold.
        body.add_run().bold = True

        for num, img in enumerate(answer['imgs'], start=1):
            pic = document.add_paragraph()
            add_hyperlink(pic, img, f'图片{num}', '3a44cf', False)
    document.save('zhihu.docx')

def main():
    """Interactive entry point: crawl a question and export it to docx.

    Reads the question link from stdin, runs the crawler, dumps the result
    to ``zhihu.json``, then reloads that file and writes ``zhihu.docx``.
    The JSON file is deleted only after the export succeeded; on any failure
    the error is printed and the function returns without waiting for input.
    """
    json_path = 'zhihu.json'

    crawler = ZhiHu(input('输入知乎问题链接：'))
    result = crawler.main()
    print()

    # Debug preview, kept disabled:
    # import pprint
    # pprint.pp(result['question'])
    # pprint.pp(result['answer'][:3])
    # print('\n以上是前三条回答。')
    print(f"共获取到{len(result['answer'])}条回答。")

    with open(json_path, 'w') as dump_file:
        json.dump(result, dump_file)
    print('\n文件已保存为zhihu.json')

    print('\n正在将数据写入zhihu.docx')
    try:
        with open(json_path, 'r') as dump_file:
            loaded = json.load(dump_file)
        save_docx(loaded)
        print('保存成功。')
        print('\n正在删除zhihu.json文件')
        os.remove(json_path)
        print('已删除zhihu.json')
    except Exception as error:
        print('保存失败。')
        print(error)
        return

    # Wait for a line of input before returning.
    input()

# Run the interactive crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()