python 爬取微信公众号

转载

mob64ca140a59b0 2024-07-16 15:30:35

文章标签 python 爬取微信公众号 python 爬虫正则表达式扩展名 文章分类 Python 后端开发

关键字：正则表达式 python 爬虫

调试不易，老程序员恳请：转载请注明出处。

近期由于搜集素材需求下载了一些图片。附代码如下，话先说好，如果读者需要使用，请节制好下载频率，被微信封号我不管哦。:)

待下载的文章链接首先从文件website.txt中读取，文件的每一行是一篇文章的链接URL。

用正则表达式分析文章，简单地写了几种图片匹配方案。本程序前期主要用于下载公众号文章中的图片，所以一开始比较多地使用data-src来判断，微信以后可能也会改。没有使用selenium就已经完成下载了，以后再贴一个用selenium的。

代码的一些解释：

首先把html文档按照标签分解，没有按照soup的find直接搜索，因为我不确定微信的公众号网页是以什么方式嵌入的图片。有时候，编辑会使用<img class=... data-type=... src="...">来嵌入一个图片，有时候是src在前：<img class=... src="..." data-type=...>。

data-type这里是文件类型，可以由此判断文件的扩展名。

向文件logfile.log输出log，下载成功的链接在前面加succ字样，失败的链接在前面加fail字样。

# -*- coding: utf-8 -*-
"""
Created on Mon Dec 24 14:04:34 2018

@author: Thor
"""

#selenium爬取图片
#import sys
#from selenium import webdriver
import os
import re
from datetime import datetime
from pprint import pprint
from re import findall
from urllib.parse import urljoin
from urllib.request import urlopen

def strIsNum(string):
    """Return True if *string* consists solely of ASCII digits ``0``-``9``.

    The empty string, signs, whitespace and any other character make the
    result False.  ``re.fullmatch`` is used rather than ``re.match`` with a
    trailing ``$`` because ``$`` also matches just before a final newline,
    which would let ``"123\\n"`` pass as "all digits".
    """
    return re.fullmatch(r"[0-9]+", string) is not None


# Folder that receives the downloaded images, relative to the directory the
# script is launched from.  os.path.join produces the same ".\wechatimglu"
# as before on Windows and a usable path on other platforms.
image_path = os.path.join(".", "wechatimglu")

# Create the image folder on the first run.
if not os.path.exists(image_path):
    os.mkdir(image_path)
    print("true")

# Work inside the image folder from here on: the relative paths used later
# (numbered image files, logfile.log) resolve against it.
try:
    os.chdir(image_path)
except OSError as e:
    # Best effort, as before: report the problem and carry on in the
    # current directory.
    print(e)
    print("Failed to change directory")

# Names of the files directly inside the working directory (sub-folders are
# not descended into) with the last ".ext" removed; a name that contains no
# dot becomes "".  The fallback tuple keeps the list defined even if the
# directory cannot be listed.
files = next(os.walk("."), (".", [], []))[2]
filelistWithoutExtName = [".".join(p.split(".")[:-1]) for p in files]

# Highest purely numeric file name already present.  default=0 covers the
# case where there is none yet (e.g. right after the folder was created),
# in which max() would otherwise raise ValueError on an empty sequence.
maxFileName = max(
    (int(x) for x in filelistWithoutExtName if strIsNum(x)),
    default=0,
)
print(maxFileName)
# Ask for the number of the first image file to write and store it in
# jpgindex.  The prompt repeats until the answer is an integer whose file
# name stem is not already present in filelistWithoutExtName.
while True:
    print("please input your start index:")
    strindex = input().strip()
    try:
        # A bare Enter continues right after the highest existing number,
        # which is what the hint printed below promises.
        jpgindex = int(strindex) if strindex else maxFileName + 1
    except ValueError:
        # Only a malformed number is retried; EOF on stdin or Ctrl-C still
        # end the script instead of looping forever.
        print("'%s' is not a valid integer index." % strindex)
        continue

    # Compare the canonical decimal form, because that is what the generated
    # file names use ("007" and " 7" both end up as "7.<ext>").
    if str(jpgindex) in filelistWithoutExtName:
        print(("The index has already existed, please change.\n"
               "                if you just enter, the index will start with %s")
              % (maxFileName + 1))
        continue

    print("the start index is:%d,press any key to run,or ctrl-c to exit."%jpgindex)
    input()  # wait for confirmation; the typed text itself is ignored
    break
#jpgindex = 161
# Article URLs to process, one per line, read from website.txt in the parent
# of the current working directory.  os.path.join replaces the old literal
# "..\website.txt", whose "\w" is an invalid escape sequence; on Windows the
# resulting path is identical.
with open(os.path.join(os.pardir, "website.txt"), "r") as filesource:
    # Surrounding whitespace is stripped and blank lines are skipped, so an
    # empty line cannot become an empty URL that fails in urlopen later.
    urls = [entry for entry in map(str.strip, filesource) if entry]

print("Retrieving...")


# Regular expressions that pick image URLs out of a single tag.  They are
# compiled once here instead of being re-parsed for every tag of every page.
_TAG_RE = re.compile(r"<(.+?)>")
_IMAGE_DATA_TYPE_RE = re.compile(r'data-type="(gif|png|jpeg|jpg)')
_DATA_SRC_RE = re.compile(r'data-src="(.+?)"')
_IMG_SRC_RE = re.compile(r'img src="(.+?)"')
_IMG_LATER_SRC_RE = re.compile(r'img.+? src="(.+?)"')
_IMG_SRC_STYLE_RE = re.compile(r'img src="(.+?)" style')
_ANY_SRC_RE = re.compile(r'src="(.+?)"')
# The format token that follows "fmt=" in an image URL (e.g. "...fmt=png").
_FMT_RE = re.compile(r"fmt=([A-Za-z0-9]+)")


def _extract_image_urls(content, page_url):
    """Return the image URLs found in *content*, absolute and de-duplicated.

    *content* is the page's HTML text and *page_url* the address it was
    fetched from.  The text is split into tags and each tag is examined
    separately, because the image attributes can appear in any order
    inside a tag.
    """
    found = []
    for tag in _TAG_RE.findall(content):
        has_data_src = "data-src" in tag

        # Tags that declare an image data-type carry the URL in data-src.
        if _IMAGE_DATA_TYPE_RE.search(tag):
            found += _DATA_SRC_RE.findall(tag)

        # Plain <img> tags: src either directly after "img" or after other
        # attributes.
        if not has_data_src:
            direct = _IMG_SRC_RE.findall(tag)
            found += direct if direct else _IMG_LATER_SRC_RE.findall(tag)

        found += _IMG_SRC_STYLE_RE.findall(tag)

        # Fallback for data-src tags without a recognised data-type.  The
        # check is on everything collected so far for the page, so it only
        # applies until the first URL has been found.  The pattern also
        # matches the "src=" part of "data-src=".
        if not found and has_data_src:
            found += _ANY_SRC_RE.findall(tag)

    # Several of the patterns above can match the same attribute.  Resolve
    # every hit against the page address (this handles "//host/..." and
    # relative paths) and keep only the first occurrence of each URL so no
    # image is downloaded twice.
    unique = []
    seen = set()
    for src in found:
        absolute = urljoin(page_url, src)
        if absolute not in seen:
            seen.add(absolute)
            unique.append(absolute)
    return unique


def _extension_for(image_url):
    """Return the file extension for *image_url*.

    The extension is the alphanumeric token after the first "fmt=" (for
    example "png" in "...fmt=png&tp=webp"), so trailing query parameters do
    not end up in the file name.  URLs without such a token get "jpg".
    """
    match = _FMT_RE.search(image_url)
    return match.group(1) if match else "jpg"


def _write_log(status, page_url):
    """Append a "status|timestamp|url" line to logfile.log."""
    with open("logfile.log", "a+") as filelog:
        filelog.write("%s|%s|%s\n" % (status, str(datetime.now()), page_url))


for url in urls:
    # Fetch the article page.  A page that cannot be fetched is reported and
    # logged as failed; the remaining URLs are still processed.
    try:
        with urlopen(url) as fp:
            raw = fp.read()
            charset = fp.headers.get_content_charset()
    except Exception as e:
        print(e)
        print(url)
        _write_log("fail", url)
        continue

    # Decode with the charset declared by the server when there is one and
    # fall back to the previous hard-coded gbk otherwise.
    try:
        content = raw.decode(charset or "gbk", errors="ignore")
    except LookupError:
        content = raw.decode("gbk", errors="ignore")

    # Keep a copy of the most recent page next to website.txt for debugging.
    # An explicit encoding avoids encode errors under locale defaults that
    # cannot represent the page text.
    with open(os.path.join(os.pardir, "filehtm.txt"), "w", encoding="utf-8") as f:
        f.write(content)

    result = _extract_image_urls(content, url)
    pprint(result)

    # Download every image and store it as "<jpgindex>.<ext>"; jpgindex only
    # advances when an image was actually saved.
    saved = 0
    for index, item in enumerate(result, 1):
        try:
            with urlopen(item) as response:
                data = response.read()
        except Exception as e:
            print(e)
            print(item)
            continue
        print('开始下载第%s 张图片： %s'%(index, item))
        filename = "%d.%s" % (jpgindex, _extension_for(item))
        with open(filename, "wb") as f:
            f.write(data)
        jpgindex += 1
        saved += 1

    # "succ" means at least one image of this page was stored.
    _write_log("succ" if saved else "fail", url)

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。