方式一,使用requests库
import re
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup  # third-party package for parsing HTML
def search_google(query, timeout=10):
    """Search Google and extract "name - work - company" LinkedIn titles.

    The result page for ``query`` is downloaded with ``requests``, every
    result title (``<h3>``) inside the result container is read, and each
    title is matched against the pattern
    ``<name> - <work> - SES Satellites | LinkedIn`` (case-insensitive).

    Args:
        query: Raw search string; it is percent-encoded before being put
            into the URL.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A list of ``(name, work, company)`` tuples, one per matching title.
        The list is empty when the page has no result container or when no
        title matches the pattern.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    # Fetch the HTML source of the result page.
    url = 'https://google.com/search?q=' + urllib.parse.quote(query)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were results.
    response.raise_for_status()

    # Parse the HTML with BeautifulSoup, then pick out the target text with
    # a regular expression.
    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): the CSS class names below are hard-coded; presumably they
    # have to be updated whenever the result page markup changes.
    container = soup.find('div', attrs={'class': 'v7W49e'})
    if container is None:
        # No result container on the page: nothing to extract.
        return []

    # Compiled once, outside the loop.
    title_pattern = re.compile(
        r'(?P<name>.+?) - (?P<work>.+?) - (?P<company>SES Satellites) \| LinkedIn',
        flags=re.I | re.S,
    )

    results = []
    for item in container.find_all('h3', attrs={'class': 'LC20lb MBeuO DKV0Md'}):
        # get_text() also covers titles with nested markup, where
        # ``item.string`` would be None.
        results.extend(title_pattern.findall(item.get_text()))
    return results
if __name__ == '__main__':
    # Example run: restrict the search to linkedin.com, exclude URLs that
    # contain "dir", and require both quoted phrases.
    demo_query = 'site:linkedin.com -inurl:dir "at ses Satellites" "Current"'
    results = search_google(query=demo_query)
    print(results)
方式二,只使用urllib库
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
def search_google(query, timeout=10):
    """Search Google (urllib only) and extract "name - work - company" titles.

    The result page for ``query`` is downloaded with ``urllib.request``,
    every result title (``<h3>``) inside the result container is read, and
    each title is matched against the pattern
    ``<name> - <work> - SES Satellites | LinkedIn`` (case-insensitive).

    Args:
        query: Raw search string; it is URL-encoded as the ``q`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(name, work, company)`` tuples, one per matching title.
        The list is empty when the page has no result container or when no
        title matches the pattern.

    Raises:
        urllib.error.URLError: On connection problems or, as its subclass
            ``HTTPError``, on a 4xx/5xx response.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}

    # Fetch the HTML source of the result page.
    url = 'https://www.google.com.hk/search?' + urllib.parse.urlencode({'q': query})
    # With no ``data`` argument the request is sent as GET; supplying
    # ``data`` would turn it into a POST.
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        # Use the charset declared by the server, falling back to UTF-8.
        charset = response.headers.get_content_charset() or 'utf-8'
        html = response.read().decode(charset, errors='replace')

    # Parse the HTML with BeautifulSoup, then pick out the target text with
    # a regular expression.
    soup = BeautifulSoup(html, 'html.parser')
    # NOTE(review): the CSS class names below are hard-coded; presumably they
    # have to be updated whenever the result page markup changes.
    container = soup.find('div', attrs={'class': 'v7W49e'})
    if container is None:
        # No result container on the page: nothing to extract.
        return []

    # Compiled once, outside the loop.
    title_pattern = re.compile(
        r'(?P<name>.+?) - (?P<work>.+?) - (?P<company>SES Satellites) \| LinkedIn',
        flags=re.I | re.S,
    )

    results = []
    for item in container.find_all('h3', attrs={'class': 'LC20lb MBeuO DKV0Md'}):
        # get_text() also covers titles with nested markup, where
        # ``item.string`` would be None.
        results.extend(title_pattern.findall(item.get_text()))
    return results
if __name__ == '__main__':
    # Example run: restrict the search to linkedin.com, exclude URLs that
    # contain "dir", and require both quoted phrases.
    demo_query = 'site:linkedin.com -inurl:dir "at ses Satellites" "Current"'
    results = search_google(query=demo_query)
    print(results)