Python百度关键词相关搜索采集,链轮查询采集相关关键词工具exe
1.随机生成协议头
2.关键词相关筛选
3.关键词去重
4.链轮采集
#百度关键词相关搜索采集
#20191118
# -*- coding: UTF-8 -*-
import requests,re,time
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
#随机生成协议头
def ua():
    """Build request headers carrying a randomly chosen User-Agent string."""
    # UserAgent().random yields a different browser identification on each call.
    return {"User-Agent": UserAgent().random}
# Initial filter terms: "|"-separated regex alternatives. get_keywords wraps
# this string in a capture group and keeps only keywords matching one of them.
req="工业|产品|外观|结构|造型|手机|犀牛|rhino|proe"
#筛选关键词
def search(req, con, n):
    """Return capture group *n* of the first match of *req* in *con*.

    Args:
        req: Regular-expression pattern to look for.
        con: Text to scan.
        n: Index of the group to return from the match.

    Returns:
        The matched group, or the string 'no' when the pattern does not match.
    """
    found = re.search(req, con)
    return 'no' if found is None else found.group(n)
#获取相关关键词源码
def get_a(key):
    """Fetch the Baidu result page for *key* and return its related-search links.

    Args:
        key: Keyword to search for.

    Returns:
        The list of ``<a>`` tags found inside ``<div id="rs">``, or an empty
        list when the page contains no such div.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    response = requests.get(
        "https://www.baidu.com/s",
        # Passing the query via ``params`` lets requests percent-encode the
        # keyword, so characters such as "&", "#" or "+" cannot corrupt the URL.
        params={"ie": "utf-8", "tn": "baidu", "wd": key},
        headers=ua(),
        timeout=5,
    )
    # Pause after every request to throttle the crawl.
    time.sleep(2)
    soup = BeautifulSoup(response.text, 'lxml')
    related = soup.find('div', id='rs')
    if related is None:
        # No related-search section on this page: report "nothing found"
        # instead of failing with AttributeError on None.
        return []
    return related.find_all('a')
#相关关键词链轮查询采集
def get_keywords(keywords):
    """Crawl related-search keywords starting from *keywords*.

    Each keyword is queried with ``get_a``; every related keyword that matches
    the module-level ``req`` filter and has not been seen before is collected
    and queried in turn, until no new keywords turn up.

    Args:
        keywords: Iterable of seed keywords. It is not modified.

    Returns:
        List of newly discovered keywords (seeds excluded), in discovery order.
    """
    # Seeds count as already seen so they are never re-collected.
    seen = set(keywords)
    # De-duplicate the seeds while preserving their order.
    pending = list(dict.fromkeys(keywords))
    collected = []
    # Iterate round by round instead of recursing: the loop ends as soon as a
    # round discovers nothing new, so there is no unbounded recursion.
    while pending:
        next_round = []
        for key in pending:
            print(">>>开始查询 %s 相关关键词!" % key)
            try:
                links = get_a(key)
            except Exception as e:
                print(f'错误代码:{e}')
                print(f'正在重新获取网页内容...')
                time.sleep(5)
                try:
                    links = get_a(key)
                except Exception as retry_error:
                    # A second failure skips this keyword rather than
                    # aborting the whole crawl.
                    print(f'重试失败,跳过关键词 {key}: {retry_error}')
                    continue
            for a in links:
                keyword = a.get_text()
                print(keyword)
                # Filter: keep only keywords matching one of the req terms.
                if search(r'(%s)' % req, keyword, 1) == 'no':
                    print(f'-剔除关键词 {keyword}')
                    continue
                print(f'>>获取关键词 {keyword}')
                # De-duplicate across the entire crawl, not just this round.
                if keyword not in seen:
                    seen.add(keyword)
                    collected.append(keyword)
                    next_round.append(keyword)
        pending = next_round
    return collected
if __name__ == '__main__':
    # Start the crawl from a single seed keyword.
    seed_keywords = ["工业设计培训"]
    get_keywords(seed_keywords)
代码参考来源:流量贩子 《seo应用编程》
版本二
百度相关搜索关键词抓取
1.读取txt文档关键词
2.导出txt关键词
3.多线程采集关键词
#百度相关搜索关键词抓取,读取txt关键词,导出txt关键词
# -*- coding=utf-8 -*-
import re
import time
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import quote

import requests
#百度相关关键词查询
def xgss(url):
    """Download *url* and extract the related-search keywords it lists.

    The keywords are taken from the links inside the ``<div id="rs">`` table,
    printed one per line, and appended to ``gjcsj.txt`` in the current
    working directory.

    Args:
        url: Baidu search-result URL to fetch.

    Returns:
        The extracted keywords as one string, each followed by a newline
        (an empty string when nothing matched).

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    }
    # A timeout keeps one stalled request from blocking its worker forever.
    html = requests.get(url, headers=headers, timeout=10).text
    block_pattern = r'<div id="rs"><div class="tt">相关搜索</div><table cellpadding="0">(.+?)</table></div>'
    link_pattern = r'<th><a href="(.+?)">(.+?)</a></th>'
    keywords = []
    # Search each matched table directly; running the link regex over
    # str(list) would scan repr()-escaped text instead of the real HTML.
    for block in re.findall(block_pattern, html, re.S):
        keywords.extend(text for _href, text in re.findall(link_pattern, block, re.S))
    for keyword in keywords:
        print(keyword)
    gjc = ''.join(keyword + '\n' for keyword in keywords)
    # Append the keywords to the export file. A plain relative name avoids the
    # invalid "\g" escape and works on non-Windows systems too.
    with open("gjcsj.txt", 'a', encoding='utf-8') as f:
        f.write(gjc)
    print("-----------------------------------")
    return gjc
print("程序运行,正在导入关键词列表!!!")
print("-----------------------------------")
# Load the keywords to search for from gjc.txt, one per line.
urls = []
data = []
with open('gjc.txt', "r", encoding='utf-8') as keyword_file:
    for line in keyword_file:
        # Drop the trailing newline and skip blank lines so they never end up
        # inside a search URL.
        keyword = line.strip()
        if keyword:
            data.append(keyword)
print("导入关键词列表成功!")
print("-----------------------------------")
# Turn every keyword into a search URL, percent-encoding it for the query string.
for keyword in data:
    url = 'https://www.baidu.com/s?wd=' + quote(keyword)
    urls.append(url)
print("采集百度相关搜索关键词开启!")
print("...................")
# Fetch the related keywords with a thread pool.
try:
    # With no argument the pool size defaults to the number of CPU cores.
    pool = ThreadPool()
    try:
        results = pool.map(xgss, urls)
    finally:
        # Always release the worker threads, even when a request failed.
        pool.close()
        pool.join()
    print("采集百度相关搜索关键词完成,已保存于gjcsj.txt!")
except Exception as e:
    # Report the underlying error instead of silently hiding it.
    print("Error: unable to start thread: %s" % e)
print("8s后程序自动关闭!!!")
time.sleep(8)
exe下载地址:
链接: https://pan.baidu.com/s/1RhmZ99dYCSIJsEe-SnlhXQ
提取码: 9sjs