两个简单的版本,关于百度搜索结果的采集抓取:可以获取到竞争对手的网站,加以分析和研究。只需输入关键词和搜索页码,即可完成对竞争对手网站信息的获取。下面给出两个版本,希望可以起到参考和帮助的作用!
版本一
特点
cookies读取,随机选取一个访问网页
导出结果排除了百度自家产品
excel导出数据
简单多线程案例可参考
#百度搜索结果抓取
#author/微信:huguo00289
# -*- coding: utf-8 -*-
import requests,time,random
from fake_useragent import UserAgent
from lxml import etree
import threading
import xlsxwriter
class Baidu_search():
    """Scrape Baidu organic search results (title + real landing URL) and export them to xlsx."""

    def __init__(self):
        self.url = "https://www.baidu.com/s?wd="  # base search URL; keyword and pn are appended later
        self.ua = UserAgent()                     # random User-Agent generator
        self.search_datas = []                    # collected (title, url) tuples

    # Read cookie.txt and return one randomly chosen cookie line (stripped).
    def get_cookies(self):
        with open("cookie.txt", "r", encoding="utf-8") as f:
            cookies = f.readlines()
        cookie = random.choice(cookies)
        return cookie.strip()

    # Fetch one results page and append (title, real url) pairs to self.search_datas.
    def get_search_objects(self, search_url):
        headers = {
            "User-Agent": self.ua.random,
            'Cookie': self.get_cookies(),
        }
        html = requests.get(search_url, headers=headers, timeout=8).content.decode("utf-8")
        time.sleep(2)  # throttle between requests
        req = etree.HTML(html)
        # This container class matches organic results only, which excludes
        # Baidu's own products (zhidao, baike, ...) from the export.
        h3s = req.xpath('//div[@class="result c-container new-pmd"]/h3[@class="t"]/a')
        hrefs = req.xpath('//div[@class="result c-container new-pmd"]/h3[@class="t"]/a/@href')
        for h3, href in zip(h3s, hrefs):
            title = ''.join(h3.xpath('.//text()'))
            data = title, self.get_website_url(href)
            self.search_datas.append(data)
            print(data)

    # Resolve a Baidu redirect link to the real website URL.
    def get_website_url(self, baidu_url):
        # BUGFIX: the original crashed with KeyError when the response had no
        # Location header, and could hang forever without a timeout.
        try:
            r = requests.head(baidu_url, stream=True, timeout=8)
            website_url = r.headers['Location']
        except (KeyError, requests.RequestException):
            website_url = baidu_url  # fall back to the redirect link itself
        return website_url

    # Dump the collected results into a dated Excel workbook.
    def write_to_xlsx(self, file_name):
        workbook = xlsxwriter.Workbook(f'{file_name}_{time.strftime("%Y-%m-%d ", time.localtime())}.xlsx')
        worksheet = workbook.add_worksheet(file_name)
        title = ['标题', '网址']  # header row
        worksheet.write_row('A1', title)
        for index, data in enumerate(self.search_datas):
            row = 'A' + str(index + 2)  # row 1 is the header, so data starts at row 2
            worksheet.write_row(row, data)
        workbook.close()
        print("搜索结果数据插入excel表格成功!")

    # Sequentially query `num` result pages for `keyword`, then export once.
    def main(self, keyword, num):
        for i in range(0, num):
            print(f'正在查询第{i+1}页百度搜索结果数据..')
            ym = i * 10  # Baidu paginates with pn=0,10,20,...
            search_url = f"{self.url}{keyword}&ie=UTF-8&pn={ym}"
            self.get_search_objects(search_url)
        # Export once after all pages are collected (not once per page).
        self.write_to_xlsx(keyword)

    # Multithreaded variant: one thread per results page.
    # NOTE(review): threads append to the shared self.search_datas list, so
    # result order is nondeterministic.
    def Thread_main(self, keyword, num):
        threadings = []
        for i in range(0, num):
            print(f'正在查询第{i+1}页百度搜索结果数据..')
            ym = i * 10
            search_url = f"{self.url}{keyword}&ie=UTF-8&pn={ym}"
            t = threading.Thread(target=self.get_search_objects, args=(search_url,))
            threadings.append(t)
            t.start()
        for x in threadings:
            x.join()
        print("多线程查询百度搜索结果完成")
        print(self.search_datas)
if __name__ == '__main__':
    # Demo run: fetch 10 result pages for the given keyword sequentially.
    keyword, num = "工业设计", 10
    spider = Baidu_search()
    spider.main(keyword, num)
    # spider.Thread_main(keyword, num)
版本二
特点
cookies 固定,不可变
数据几乎全部导出,排名也已经写入
#关键词百度搜索结果查询
#20191121 by 微信:huguo00289
# -*- coding: UTF-8 -*-
import requests,time
import urllib.parse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import json
def ua():
    """Return a random User-Agent string from fake_useragent."""
    generator = UserAgent()
    return generator.random
# BUGFIX: the original referenced an undefined name `Cookie` and raised
# NameError on import. Paste your own Baidu cookie string below before running.
Cookie = ''

# Fixed browser-like request headers used by get_response().
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': Cookie,
    'Host': 'www.baidu.com',
    'Referer': 'https://www.baidu.com/?tn=48021271_6_hao_pg',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ua()  # random UA chosen once at import time
}
# Resolve a Baidu redirect link to the real destination URL.
def get_trueurl(url):
    """Follow the Baidu jump link via a HEAD request and return the Location target.

    Falls back to `url` itself when there is no Location header or the
    request fails.
    """
    # BUGFIX: narrowed the original bare `except:` (which also swallowed
    # KeyboardInterrupt) and added a timeout so the HEAD request cannot hang.
    try:
        r = requests.head(url, stream=True, timeout=10)
        zsurl = r.headers['Location']
    except (KeyError, requests.RequestException):
        zsurl = url
    return zsurl
# Fetch a page and return its text.
def get_response(url):
    """GET `url` with the module-level `headers` and return the page body as utf-8 text."""
    response = requests.get(url, headers=headers, timeout=10)
    print(f'状态码:{response.status_code}')
    time.sleep(2)  # throttle between requests
    response.encoding = 'utf-8'  # force utf-8 decoding of the body
    return response.text
# Query Baidu search results page by page.
def get_bdpm(keyword, num):
    """Print (rank, title, real url) for `num` pages of Baidu results for `keyword`.

    `num` may be a string (it comes from input()); it is converted with int().
    """
    for i in range(0, int(num)):
        print(f'正在查询{i + 1}页搜索结果...')
        ym = i * 10  # Baidu paginates with pn=0,10,20,...
        url = f"https://www.baidu.com/s?wd={keyword}&ie=UTF-8&pn={ym}"
        req = get_response(url)
        soup = BeautifulSoup(req, 'lxml')
        content_left = soup.find('div', id="content_left")
        if content_left is None:
            # BUGFIX: anti-bot / empty pages lack #content_left; the original
            # crashed with AttributeError here. Skip the page instead.
            continue
        divs = content_left.find_all('div')
        for div in divs:
            # Only organic result containers carry class="result...".
            if 'class="result' in str(div):
                try:
                    pm = div['id']  # the div id is the result's rank on the page
                except KeyError:
                    pm = ''
                link = div.find('a')
                if link is None:
                    continue  # malformed result block without a link
                title = link.get_text().strip()
                href = link['href']
                zsurl = get_trueurl(href)
                print(pm, title, zsurl)
        time.sleep(5)  # longer pause between pages to avoid the anti-bot wall
if __name__ == '__main__':
    # Interactive loop: keep prompting for a keyword and a page count.
    while True:
        keyword = input('请输入要查询的关键词:')
        num = input('请输入要查询的页码数:')
        try:
            get_bdpm(keyword, num)
        except ValueError:
            # BUGFIX: non-numeric page input made int(num) raise an uncaught
            # ValueError; the original only handled IndexError.
            print("查询结果失败!")
        except IndexError as e:
            print(e)
            print("查询结果失败!")
微信公众号:二爷记
不定时分享python源码及工具