python爬虫爬取谷歌搜索的结果，正则表达式查找目标内容

原创

wx646c1f410ed7d 2023-05-29 11:08:19 博主文章分类：编程：编程语言&框架 ©著作权

文章标签 python 爬虫正则表达式 html User 文章分类 Html/CSS 前端开发

©著作权归作者所有：来自51CTO博客作者wx646c1f410ed7d的原创作品，请联系作者获取转载授权，否则将追究法律责任

方式一，使用requests库

import urllib
import requests
from bs4 import BeautifulSoup  # 第三方包，处理html对象
import re


def search_google(query, timeout=10):
    """Search Google for *query* and pull structured data out of result titles.

    The result page is fetched with ``requests``, parsed with BeautifulSoup,
    and every result heading is matched against a pattern of the form
    ``<name> - <work> - SES Satellites | LinkedIn`` (case-insensitive).

    Args:
        query: The raw search string; it is percent-encoded before being
            placed in the URL.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A list of ``(name, work, company)`` tuples, one per matching title.
        The list is empty when the expected result container is not present
        in the page (for example a consent/captcha page or changed markup).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # A bare ``import urllib`` does not guarantee that the ``urllib.parse``
    # submodule has been loaded, so import it explicitly where it is used.
    import urllib.parse

    # Fetch the HTML source of the result page.
    url = 'https://google.com/search?q=' + urllib.parse.quote(query)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    # Parse the HTML and locate the container that holds the results.
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', attrs={'class': 'v7W49e'})
    if container is None:
        # find() returns None when nothing matches; there is nothing to scan.
        return []

    # Compile once instead of re-building the pattern for every heading.
    pattern = re.compile(
        r'(?P<name>.+?) - (?P<work>.+?) - (?P<company>SES Satellites) \| LinkedIn',
        flags=re.I | re.S,
    )

    results = []
    for item in container.find_all('h3', attrs={'class': 'LC20lb MBeuO DKV0Md'}):
        # get_text() also covers headings with nested tags, where ``.string``
        # is None and str() would turn it into the literal text 'None'.
        results.extend(pattern.findall(item.get_text()))
    return results


if __name__ == '__main__':
    # Demo run: search LinkedIn profiles mentioning SES Satellites and print
    # whatever tuples the title pattern extracted.
    demo_query = 'site:linkedin.com -inurl:dir "at ses Satellites" "Current"'
    print(search_google(query=demo_query))

方式二，只使用urllib库

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re


def search_google(query, timeout=10):
    """Search Google for *query* and pull structured data out of result titles.

    The result page is fetched with ``urllib.request``, parsed with
    BeautifulSoup, and every result heading is matched against a pattern of
    the form ``<name> - <work> - SES Satellites | LinkedIn``
    (case-insensitive).

    Args:
        query: The raw search string; it is URL-encoded as the ``q``
            parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of ``(name, work, company)`` tuples, one per matching title.
        The list is empty when the expected result container is not present
        in the page (for example a consent/captcha page or changed markup).

    Raises:
        urllib.error.HTTPError: If the server answers with an error status.
        urllib.error.URLError: On connection problems.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}

    # Fetch the HTML source of the result page.
    url = 'https://www.google.com.hk/search?' + urllib.parse.urlencode({'q': query})
    # Supplying ``data`` would make this a POST; without it the request is a GET.
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if reading/decoding fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        # Honour the charset announced by the server; fall back to UTF-8.
        charset = response.headers.get_content_charset() or 'utf-8'
        html = response.read().decode(charset, errors='replace')

    # Parse the HTML and locate the container that holds the results.
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find('div', attrs={'class': 'v7W49e'})
    if container is None:
        # find() returns None when nothing matches; there is nothing to scan.
        return []

    # Compile once instead of re-building the pattern for every heading.
    pattern = re.compile(
        r'(?P<name>.+?) - (?P<work>.+?) - (?P<company>SES Satellites) \| LinkedIn',
        flags=re.I | re.S,
    )

    results = []
    for item in container.find_all('h3', attrs={'class': 'LC20lb MBeuO DKV0Md'}):
        # get_text() also covers headings with nested tags, where ``.string``
        # is None and str() would turn it into the literal text 'None'.
        results.extend(pattern.findall(item.get_text()))
    return results


if __name__ == '__main__':
    # Demo run: search LinkedIn profiles mentioning SES Satellites and print
    # whatever tuples the title pattern extracted.
    demo_query = 'site:linkedin.com -inurl:dir "at ses Satellites" "Current"'
    print(search_google(query=demo_query))

上一篇：母函数详解（定义，模板代码，用法）

下一篇：JetBrains Clion CmakeList.txt编写：运行多个cpp文件、解决定义大数组无法编译

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯