微信公众号爬虫 python

转载

mob64ca14116c53 2024-09-14 22:13:20

文章标签 微信公众号爬虫 python python html 公众号微信公众号 文章分类 Python 后端开发

"""
    通过搜狗搜索中的微信搜索入口爬取微信公众号文章(selenium)
"""
import re
import os
import json
import time
import requests
from pyquery import PyQuery
from urllib.parse import quote
from selenium import webdriver


class WeChatSpider:
    """Crawl a WeChat official account's articles through Sogou's WeChat search.

    The pipeline (see ``run``) is: query Sogou for the account, open the
    account home page with Selenium, parse the article list with PyQuery,
    save every article's HTML and finally dump a JSON summary. All output
    goes into a directory named after the search keywords.
    """

    # Characters that are illegal or unsafe inside a single file name.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]+')

    def __init__(self, keywords):
        """Store the keywords and prepare the HTTP session.

        :param keywords: name of the official account to search for; it is
            also used as the output directory name.
        """
        self.keywords = keywords
        # Sogou WeChat search URL for the (URL-quoted) keywords.
        self.so_gou_search_url = 'http://weixin.sogou.com/weixin?type=1&query=%s&ie=utf8&s_from=input&_sug_=n&_sug_type_=' % quote(self.keywords)
        # Browser-like User-Agent sent with the search request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'}
        # Timeout (seconds) for the HTTP request.
        self.timeout = 5
        # A requests.Session is used for the HTTP request.
        self.s = requests.Session()

    def get_search_result_by_keywords(self):
        """Fetch the Sogou search result page and return its raw body (bytes)."""
        self.log('搜索地址为：%s' % self.so_gou_search_url)
        return self.s.get(self.so_gou_search_url, headers=self.headers, timeout=self.timeout).content

    def get_wx_url_by_sougou_search_html(self, sougou_search_html):
        """Extract the account home-page link from the Sogou result page.

        :return: the ``href`` of the title link inside the result box;
            presumably ``None`` when nothing matches (PyQuery ``attr``).
        """
        doc = PyQuery(sougou_search_html)
        return doc('div[class=txt-box]')('p[class=tit]')('a').attr('href')

    def get_selenium_js_html(self, url):
        """Open ``url`` in Chrome, wait for scripts to run, return the rendered HTML.

        The browser is always closed afterwards; this method is called once
        per article, so a leaked window per call would pile up quickly.
        """
        browser = webdriver.Chrome()
        try:
            browser.get(url)
            # Fixed pause to give the page's JavaScript time to render.
            time.sleep(3)
            return browser.execute_script("return document.documentElement.outerHTML")
        finally:
            browser.quit()

    def parse_wx_articles_by_html(self, selenium_html):
        """Return the PyQuery selection of article boxes found in the rendered page."""
        doc = PyQuery(selenium_html)
        self.log('开始查找内容msg')
        return doc('div[class="weui_media_box appmsg"]')

    def switch_arctiles_to_list(self, articles):
        """Convert the article selection into a list of dicts.

        :param articles: PyQuery selection (anything falsy yields ``[]``).
        :return: list with one ``parse_one_article`` result per article.
        """
        articles_list = []
        if not articles:
            return articles_list
        total = len(articles)
        for index, article in enumerate(articles.items(), start=1):
            self.log('开始整合(%d/%d)' % (index, total))
            articles_list.append(self.parse_one_article(article))
        return articles_list

    def parse_one_article(self, article):
        """Parse one article box, save the article HTML and return its data.

        :param article: PyQuery selection for a single article box.
        :return: dict with ``title``, ``url``, ``summary``, ``date``, ``pic``
            and ``content`` keys. ``url`` and ``content`` are empty strings
            when the link or the article body cannot be found.
        """
        title_node = article('h4[class="weui_media_title"]')
        # Title
        title = title_node.text()
        self.log('标题是： %s' % title)
        # Article link: the relative path is read from the ``hrefs`` attribute.
        href = title_node.attr('hrefs')
        url = ('http://mp.weixin.qq.com' + href) if href else ''
        self.log('地址为： %s' % url)
        # Summary
        summary = article('.weui_media_desc').text()
        self.log('文章简述： %s' % summary)
        # Publication date
        date = article('.weui_media_extra_info').text()
        self.log('发表时间为： %s' % date)
        # Cover picture
        pic = self.parse_cover_pic(article)
        # Article body; ``.html()`` may yield None when nothing matched, and
        # without a link there is nothing to fetch at all.
        content = ''
        if url:
            content = self.parse_content_by_url(url).html() or ''
        # Save the article locally; the title/date are sanitised so that
        # characters such as '/' or ':' cannot break the path.
        file_name = self._safe_filename('%s_%s.html' % (title, date))
        self.save_content_file(os.path.join(self.keywords, file_name), content)
        return {
            'title': title,
            'url': url,
            'summary': summary,
            'date': date,
            'pic': pic,
            'content': content
        }

    @classmethod
    def _safe_filename(cls, name):
        """Replace runs of characters that are unsafe in file names with '_'."""
        return cls._UNSAFE_FILENAME_CHARS.sub('_', name)

    def parse_cover_pic(self, article):
        """Return the cover image URL from the box's inline style, or ''.

        The URL is taken from the ``background-image:url(...)`` declaration
        of the ``.weui_media_hd`` element's ``style`` attribute.
        """
        style = article('.weui_media_hd').attr('style') or ''
        match = re.search(r'background-image:url\((.*?)\)', style)
        cover = match.group(1) if match else ''
        self.log('封面图片是：%s ' % cover)
        return cover

    def parse_content_by_url(self, url):
        """Render the article page and return the ``#js_content`` selection."""
        page_html = self.get_selenium_js_html(url)
        return PyQuery(page_html)('#js_content')

    def save_content_file(self, title, content):
        """Write ``content`` to the file at path ``title`` (UTF-8, overwriting)."""
        with open(title, 'w', encoding="utf-8") as f:
            f.write(content)

    def save_file(self, content):
        """Write ``content`` to ``<keywords>/<keywords>.txt`` (UTF-8, overwriting)."""
        with open(os.path.join(self.keywords, self.keywords + '.txt'), 'w', encoding='utf-8') as f:
            f.write(content)

    def log(self, msg):
        """Print ``msg`` prefixed with the current local time."""
        print('%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))

    def need_verify(self, selenium_html):
        """Return True when the page contains a non-empty ``#verify_change`` element.

        The original author uses this as the signal that the site redirected
        the crawler to a verification page (IP temporarily blocked).
        """
        return PyQuery(selenium_html)('#verify_change').text() != ''

    def create_dir(self):
        """Create the output directory named after the keywords if it is missing."""
        os.makedirs(self.keywords, exist_ok=True)

    def run(self):
        """Entry point: search, render, parse, and save everything to disk."""
        # Step 0: create the output directory.
        self.create_dir()

        # Step 1: query the Sogou WeChat search with the account name.
        self.log('开始获取，微信公众号英文名为：%s' % self.keywords)
        self.log('开始调用sougou搜索引擎')
        sougou_search_html = self.get_search_result_by_keywords()

        # Step 2: extract the account home-page link from the result page.
        self.log('获取sougou_search_html成功，开始抓取公众号对应的主页wx_url')
        wx_url = self.get_wx_url_by_sougou_search_html(sougou_search_html)
        if not wx_url:
            # No matching account (or an unexpected page); there is nothing
            # to hand to the browser.
            self.log('未找到公众号主页链接，程序结束')
            return
        self.log('获取wx_url成功，%s' % wx_url)

        # Step 3: render the account page with Selenium.
        self.log('开始调用selenium渲染html')
        selenium_html = self.get_selenium_js_html(wx_url)

        # Step 4: stop if the site answered with its verification page.
        if self.need_verify(selenium_html):
            self.log('爬虫被目标网站封锁，请稍后再试')
            return

        # Step 5: parse the article list out of the rendered HTML.
        self.log('调用selenium渲染html完成，开始解析公众号文章')
        articles = self.parse_wx_articles_by_html(selenium_html)
        self.log('抓取到微信文章%d篇' % len(articles))

        # Step 6: turn the articles into a list of dicts.
        self.log('开始整合微信文章数据为字典')
        articles_list = self.switch_arctiles_to_list(articles)

        # Step 7: serialise to JSON; the file is UTF-8, so keep non-ASCII
        # text readable instead of \u-escaping it.
        self.log('整合完成，开始转换为json')
        data_json = json.dumps(articles_list, ensure_ascii=False)

        # Step 8: write the JSON summary file.
        self.log('转换为json完成，开始保存json数据到文件')
        self.save_file(data_json)

        self.log('保存完成，程序结束')


if __name__ == '__main__':
    # Welcome banner shown once at start-up.
    banner = """
    ******************************************
    **    Welcome to Spider of 公众号       ** 
    **      Created on 2018-11-15           ** 
    **       @author: Feng mujin            ** 
    ******************************************
    """
    print(banner)

    # An empty answer falls back to the default account.
    account_name = input(u'输入要爬取的公众号: ') or 'python6359'
    spider = WeChatSpider(account_name)
    spider.run()

import re
import time
import json
import random
import requests
from selenium import webdriver

# Login name of your own WeChat official account (typed into the login form).
user = "你的公众号账号"
# Password of that official account.
password = "你的公众号密码"
# Names of the official accounts to crawl; each one is passed to get_content().
gzlist = ['要爬取的公众号名字']


# Log in to the WeChat official-account platform, collect the session
# cookies and save them to a local text file.
def wechat_login():
    """Log in through a Chrome window and store the cookies in ``cookie.txt``.

    The account name and password are typed in automatically; the operator
    must tick "remember me" and scan the QR code by hand during the pauses.
    The cookies are written as one JSON object mapping cookie name to value.
    The browser is always closed, even when a step fails.
    """
    print("启动浏览器，打开微信公众号登录界面")
    driver = webdriver.Chrome()
    try:
        # Open the login page and give it time to load.
        driver.get('https://mp.weixin.qq.com/')
        time.sleep(5)
        print("正在输入微信公众号登录账号和密码......")
        # ``find_element("xpath", ...)`` is used instead of the
        # ``find_element_by_xpath`` helper, which newer Selenium releases
        # no longer provide.
        account_input = driver.find_element("xpath", "./*//input[@id='account']")
        account_input.clear()
        account_input.send_keys(user)
        password_input = driver.find_element("xpath", "./*//input[@id='pwd']")
        password_input.clear()
        password_input.send_keys(password)

        # Manual step: the operator ticks "remember me" during this pause.
        print("请在登录界面点击:记住账号")
        time.sleep(10)
        # Click the login button.
        driver.find_element("xpath", "./*//a[@id='loginBt']").click()
        # Manual step: the operator scans the QR code with a phone.
        print("请拿手机扫码二维码登录公众号")
        time.sleep(20)
        print("登录成功")
        # Reload the landing page (now the logged-in back office) and read
        # the cookies from that session.
        driver.get('https://mp.weixin.qq.com/')
        cookie_items = driver.get_cookies()
    finally:
        driver.quit()

    # get_cookies() returns a list of cookie dicts; flatten it to
    # {name: value} and persist it as JSON.
    post = {cookie_item['name']: cookie_item['value'] for cookie_item in cookie_items}
    with open('cookie.txt', 'w', encoding='utf-8') as f:
        f.write(json.dumps(post))
    print("cookies信息已保存到本地")


# Crawl the article titles and links of an official account and append
# them to a local text file.
def get_content(query):
    """Append every article title and link of account ``query`` to ``<query>.txt``.

    Uses the cookies saved in ``cookie.txt`` to call the official-account
    back-office search (``searchbiz``) and article-list (``appmsg``)
    endpoints, paging through the list five articles at a time.

    :param query: name of the official account to crawl; the first search
        hit is used.
    :raises RuntimeError: when no login token can be found (cookies missing
        or expired), the search returns no account, or the article count is
        absent from the response.
    """
    # Back-office home page.
    url = 'https://mp.weixin.qq.com'
    header = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"
    }
    page_size = 5
    # Seconds before a stalled request is abandoned instead of hanging forever.
    request_timeout = 10

    # Load the cookies saved by the login step.
    with open('cookie.txt', 'r', encoding='utf-8') as f:
        cookies = json.loads(f.read())

    # When logged in, the home page redirects to a URL such as
    # .../cgi-bin/home?t=home/index&lang=zh_CN&token=1849751598;
    # the token is read from that final URL.
    response = requests.get(url=url, cookies=cookies, timeout=request_timeout)
    token_match = re.search(r'token=(\d+)', str(response.url))
    if token_match is None:
        raise RuntimeError('未能获取token，cookies可能已失效，请重新登录')
    token = token_match.group(1)

    # Account search endpoint; parameters carry the token, a random number
    # and the account name to look for.
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    query_id = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': query,
        'begin': '0',
        'count': '5'
    }
    search_response = requests.get(search_url, cookies=cookies, headers=header,
                                   params=query_id, timeout=request_timeout)
    # Use the first account in the search result.
    accounts = search_response.json().get('list') or []
    if not accounts:
        raise RuntimeError('未搜索到公众号：%s' % query)
    # The fakeid identifies the account in the article-list request.
    fakeid = accounts[0].get('fakeid')

    # Article-list endpoint.
    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'

    def fetch_page(begin):
        """Return the decoded JSON of the article-list page starting at ``begin``."""
        params = {
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'action': 'list_ex',
            'begin': str(begin),  # advances by page_size from page to page
            'count': str(page_size),
            'query': '',
            'fakeid': fakeid,
            'type': '9'
        }
        return requests.get(appmsg_url, cookies=cookies, headers=header,
                            params=params, timeout=request_timeout).json()

    # The first page also carries the total article count.
    first_page = fetch_page(0)
    max_num = first_page.get('app_msg_cnt')
    if max_num is None:
        raise RuntimeError('获取文章总数失败：%s' % first_page)

    file_name = query + '.txt'
    # range() yields exactly the needed offsets: no trailing empty page when
    # the total is a multiple of the page size.
    for begin in range(0, int(max_num), page_size):
        print('正在翻页：--------------', begin)
        # Reuse the page already fetched for the count instead of
        # requesting offset 0 a second time.
        page = first_page if begin == 0 else fetch_page(begin)
        items = page.get('app_msg_list')
        if not items:
            # Missing or empty list (possibly rate limiting): stop paging
            # rather than failing on a None list.
            print('未获取到文章列表，停止翻页')
            break
        # Append this page's titles and links in one go.
        with open(file_name, 'a', encoding='utf-8') as fh:
            fh.writelines(
                '%s:\n%s\n' % (item.get('title', ''), item.get('link', ''))
                for item in items
            )
        # Pause between pages to keep the request rate low.
        time.sleep(2)


if __name__ == '__main__':
    try:
        # Step 1: log in and store the session cookies locally.
        wechat_login()
        # Step 2: crawl each configured account through the back-office
        # article endpoints, writing the results to local text files.
        for account_name in gzlist:
            print("开始爬取公众号：" + account_name)
            get_content(account_name)
            print("爬取完成")
    except Exception as err:
        # Top-level boundary: report the failure message and exit normally.
        print(err)

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。