Purpose: crawl data-analyst job postings from Lagou (拉勾网) in order to analyze the current demand for data-analyst positions.
What the crawler does: given a city name and a job-title keyword, it collects the matching job data from Lagou.
Main modules:
main.py: main entry point and data storage
https.py: page download (HTTP requests)
parse.py: response parsing
setting.py: request headers, cookies, and the IP/User-Agent pools
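At its core the crawler POSTs a small form to Lagou's Ajax search endpoint and reads the job listings back as JSON. The sketch below is a minimal, stand-alone version of that request; it assumes the endpoint still accepts these form fields and that the headers and cookies defined in setting.py are valid (without them Lagou typically returns an anti-crawler page instead of JSON).

import requests
import setting  # headers and cookies, defined in setting.py below

url = 'https://www.lagou.com/jobs/positionAjax.json'
para = {'first': 'true', 'pn': '1', 'kd': '数据分析', 'city': '上海'}

# One page of results: 'pn' is the page number, 'kd' the keyword, 'city' the city.
response = requests.post(url, data=para,
                         headers=setting.headers, cookies=setting.cookies,
                         timeout=5)
data = response.json()
# The listings sit under content.positionResult.result; totalCount is the
# total number of matching positions across all pages.
print(data['content']['positionResult']['totalCount'])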
# main.py
'''
Lagou is sure to ban any IP that sends a large volume of requests, so a proxy pool is needed.
To keep the crawler running unattended, handle the exceptions that can occur and support resuming from a breakpoint, so the program does not die.
To improve throughput, add multithreading.
Persist the data, cleaning it before it is written.
'''
import https,parse,setting
import time
import logging
import codecs
logging.basicConfig(level=logging.ERROR,
format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
filename='diary.log',
filemode='a')
def process(value):
    # Handle fields that would break the CSV layout because they contain double quotes or commas.
    if ("\"" in value):
        # Replace double quotes with single quotes.
        value = value.replace("\"", "\'")
    if ("," in value or "," in value):
        # If the field contains a comma (full-width or ASCII), wrap the whole field in double quotes.
        value = "\"" + value + "\""
    return value
def getInfo(url, para):
"""
获取信息
"""
generalHttp = https.Http()
htmlCode = generalHttp.post(url, para=para, headers=setting.headers, cookies=setting.cookies)
generalParse = parse.Parse(htmlCode)
pageCount = generalParse.parsePage()
info = []
for i in range(1, pageCount + 1):
        print('Page %s' % i)
para['pn'] = str(i)
htmlCode = generalHttp.post(url, para=para, headers=setting.headers, cookies=setting.cookies)
generalParse = parse.Parse(htmlCode)
info = info + getInfoDetail(generalParse)
time.sleep(2)
return info
def getInfoDetail(generalParse):
"""
信息解析
"""
info = generalParse.parseInfo()
return info
def processInfo(info, para):
"""
信息存储
"""
logging.error('Process start')
try:
title = 'companyName,positionType,positionName,companyStage,companySize,Education,WorkYear,Salary,' \
'district,latitude,longitude,companyType,positionLables,positionAdvantage,companyLabel\n'
# "gbk"下csv文件不能对\t换单元格,只能换为“,”
file = codecs.open('%s%s职位.csv' %(para['city'],para['kd']), 'w', 'gbk')
# encoding = 'utf-8'时出现乱码。故改为"gbk",或使用‘utf-8’将文件在notepad++打开转回格式‘utf-8’后再打开。
file.write(title)
for p in info:
line = str(p['companyName']) + ',' + str(p['positionType']) + ',' + str(p['positionName']) + ',' + \
str(p['companyStage']) + ',' + str(p['companySize']) + ',' +str(p['positionEducation']) + ',' + \
str(p['positionWorkYear']) + ',' + str(p['positionSalary']) + ',' +str(p['district']) + ',' +\
str(p['latitude'])+ ',' +str(p['longitude'])+ ',' + str(p['companyType']) + ',' + \
process(str(p['positionLables']))+ ',' + process(str(p['positionAdvantage']))+ ',' + \
process(str(p['companyLabel'])) +'\n'
# "gbk"下csv文件不能对\t换单元格,只能换为“,”
file.write(line)
file.close()
return True
except Exception as e:
print(e)
return None
def main(url, para):
"""
主函数逻辑
"""
logging.error('Main start') # 日志生成
if url:
info = getInfo(url, para) # 获取信息
flag = processInfo(info, para) # 信息储存
return flag
else:
return None
if __name__ == '__main__':
    kdList = [u'数据分析']  # search keywords (the 'kd' parameter)
    cityList = [u'上海']  # cities to crawl
    url = 'https://www.lagou.com/jobs/positionAjax.json'  # the Ajax endpoint behind Lagou's job-search page
    for keyword in kdList:
        for city in cityList:
            print('Crawling %s' % city)
            para = {'first': 'true', 'pn': '1', 'kd': keyword, 'city': city}
            flag = main(url, para)
            if flag:
                print('%s crawled successfully' % city)
            else:
                print('Failed to crawl %s' % city)
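An alternative to the manual quoting in process() and the hand-built lines in processInfo() is the standard csv module, which handles commas, quotes, and the encoding in one place. A minimal sketch of that approach, using the field names produced by Parse.parseInfo() and 'utf-8-sig' so that Excel detects the encoding without the gbk workaround:

import csv

def write_csv(info, para):
    # Field names match the keys returned by Parse.parseInfo() in parse.py.
    fields = ['companyName', 'positionType', 'positionName', 'companyStage',
              'companySize', 'positionEducation', 'positionWorkYear',
              'positionSalary', 'district', 'latitude', 'longitude',
              'companyType', 'positionLables', 'positionAdvantage', 'companyLabel']
    # 'utf-8-sig' writes a BOM, so Excel opens the file correctly without gbk.
    with open('%s%s职位.csv' % (para['city'], para['kd']), 'w',
              encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for row in info:
            writer.writerow(row)  # csv handles embedded commas and quotes itself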
# https.py
import setting
import requests, random
import logging
logging.basicConfig(level=logging.ERROR,
format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
filename='diary.log',
filemode='a')
class Http:
    '''
    Helpers for issuing HTTP requests.
    '''
def __init__(self):
pass
def get(self, url, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Fetch a page's source with a GET request.
        url: page URL
        headers: request headers
        cookies: cookies
        proxy: proxy to use
        timeOut: request timeout in seconds
        timeOutRetry: number of retries on failure
        return: page source ('None' on failure)
        '''
if not url:
            logging.error('GetError url does not exist')
return 'None'
logging.error('Get %s' % url)
try:
            if not headers:
                headers = {'User-Agent': random.choice(setting.UA)}
# if not proxy: proxy = {'http':"http://"+IP[random.randint(0, len(IP)-1)]}
response = requests.get(url, headers=headers, cookies=cookies, proxies=proxy, timeout=timeOut)
if response.status_code == 200 or response.status_code == 302:
htmlCode = response.text
else:
htmlCode = 'None'
logging.error('Get %s %s' % (str(response.status_code), url))
except Exception as e:
logging.error('GetExcept %s' % str(e))
            if timeOutRetry > 0:
                # Retry with the same headers/cookies/proxy instead of dropping them.
                htmlCode = self.get(url=url, headers=headers, cookies=cookies, proxy=proxy, timeOutRetry=(timeOutRetry - 1))
else:
logging.error('GetTimeOut %s' % url)
htmlCode = 'None'
return htmlCode
def post(self, url, para, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Send a POST request and return the response body.
        url: target URL
        para: form parameters
        headers: request headers
        cookies: cookies
        proxy: proxy to use
        timeOut: request timeout in seconds
        timeOutRetry: number of retries on failure
        return: response body (None on failure)
        '''
if not url or not para:
            logging.error('PostError url or para does not exist')
return None
logging.error('Post %s' % url)
try:
if not headers:
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3'}
response = requests.post(url, data=para, headers=headers, cookies=cookies, proxies=proxy, timeout=timeOut)
if response.status_code == 200 or response.status_code == 302:
htmlCode = response.text
else:
htmlCode = None
logging.error('Post %s %s' % (str(response.status_code), url))
except Exception as e:
logging.error('PostExcept %s' % str(e))
            if timeOutRetry > 0:
                # Retry with the same headers/cookies/proxy instead of dropping them.
                htmlCode = self.post(url=url, para=para, headers=headers, cookies=cookies, proxy=proxy, timeOutRetry=(timeOutRetry - 1))
else:
logging.error('PostTimeOut %s' % url)
htmlCode = None
return htmlCode
def confirm(self, htmlCode, url, headers, cookies, proxy, catch_retry=5):
        '''
        Anti-crawler check: verify that the response is a real results page (currently a stub).
        htmlCode: page source
        return: page source
        '''
        # Check the page title to decide whether the IP has been banned (not implemented).
return htmlCode
def urlprocess(self, items):
        # Percent-encode characters that have a special meaning in URLs:
        #   '+'  means a space in a query string         -> %2B
        #   ' '  space (encoded as '+' or %20)            -> %20
        #   '/'  separates directories                    -> %2F
        #   '?'  separates the URL from its parameters    -> %3F
        #   '%'  the escape character itself              -> %25
        #   '#'  fragment (bookmark) marker               -> %23
        #   '&'  separates parameters                     -> %26
        #   '='  separates a parameter from its value     -> %3D
        content = items.replace('/', '%2F').replace('=', '%3D').replace('+', '%2B') \
                       .replace(' ', '%20').replace('?', '%3F')
        return content
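The urlprocess() helper percent-encodes a handful of characters by hand; the standard library's urllib.parse.quote does the same job and also covers the remaining characters listed in the comment. A minimal sketch:

from urllib.parse import quote

# safe='' makes quote() encode '/' as well, matching what urlprocess() does.
print(quote('data analysis/BI+', safe=''))  # -> 'data%20analysis%2FBI%2B'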
# parse.py
import re
import demjson
class Parse:
    '''
    Parse the JSON returned by the Ajax endpoint.
    '''
    def __init__(self, htmlCode):
        self.htmlCode = htmlCode
        self.json = demjson.decode(htmlCode)
def parseTool(self, content):
        '''
        Strip HTML tags and entities from a string.
        '''
if type(content) != str: return content
sublist = ['<p.*?>', '</p.*?>', '<b.*?>', '</b.*?>', '<div.*?>', '</div.*?>',
'</br>', '<br />', '<ul>', '</ul>', '<li>', '</li>', '<strong>',
'</strong>', '<table.*?>', '<tr.*?>', '</tr>', '<td.*?>', '</td>',
'\r', '\n', '&.*?;', '&', '#.*?;', '<em>', '</em>']
try:
for substring in [re.compile(string, re.S) for string in sublist]:
content = re.sub(substring, "", content).strip()
except:
raise Exception('Error ' + str(substring.pattern))
return content
def parsePage(self):
        '''
        Compute the number of result pages.
        return: page count
        '''
        totalCount = self.json['content']['positionResult']['totalCount']  # total number of matching positions
        resultSize = self.json['content']['positionResult']['resultSize']  # positions shown per page
        # Ceiling division, so an exact multiple of resultSize does not add an empty extra page.
        pageCount = (int(totalCount) + int(resultSize) - 1) // int(resultSize)
return pageCount
def parseInfo(self):
        '''
        Extract the fields of interest from each position in the result list.
        '''
info = []
for position in self.json['content']['positionResult']['result']:
i = {}
i['companyName'] = position['companyFullName']
i['positionType'] = position['firstType']
i['positionName'] = position['positionName']
i['companyStage'] = position['financeStage']
i['companySize'] = position['companySize']
i['positionEducation'] = position['education']
i['positionWorkYear'] = position['workYear']
i['positionSalary'] = position['salary']
i['district'] = position['district']
i['latitude'] = position['latitude']
i['longitude'] = position['longitude']
i['companyType'] = position['industryField']
i['positionLables'] = position['positionLables']
i['companyLabel'] = position['companyLabelList']
i['positionAdvantage'] = position['positionAdvantage']
info.append(i)
return info
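demjson can decode lenient, non-strict JSON, but the Ajax response here is ordinary JSON, so the standard json module is enough if demjson is not installed. A minimal sketch of the same decoding step:

import json

def decode_response(htmlCode):
    # Equivalent of demjson.decode() for this response, which is strict JSON.
    data = json.loads(htmlCode)
    return data['content']['positionResult']['result']  # list of position dicts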
# setting.py
headers = {
'Host': 'www.lagou.com',
'Connection': 'keep-alive',
'Content-Length': '23',
'Origin': 'https://www.lagou.com',
'X-Anit-Forge-Code': '0',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'X-Anit-Forge-Token': 'None',
'Referer': 'https://www.lagou.com/jobs/list_java?city=%E5%B9%BF%E5%B7%9E&cl=false&fromSearch=true&labelWords=&suginput=',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
}
# cookies
cookies = {
'user_trace_token': '20171011085044-36afc724-ae1e-11e7-947d-5254005c3644',
'LGUID': '20171011085044-36afc9e6-ae1e-11e7-947d-5254005c3644',
'_ga': 'GA1.2.1411877279.1507683044',
'index_location_city': '%E5%B9%BF%E5%B7%9E',
'JSESSIONID': 'ABAAABAAADEAAFI2466B2149D4B3E406932CAEA37FDF471',
'_gid': 'GA1.2.1604143331.1517585155',
'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1515000882,1515252738,1516984463,1517585156',
'LGSID': '20180202232556-5ce93c91-082d-11e8-abfa-5254005c3644', 'PRE_UTM': '',
'PRE_HOST': '',
'PRE_SITE': '',
'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
'TG-TRACK-CODE': 'index_navigation',
'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1517585322',
'LGRID': '20180202232842-c0095589-082d-11e8-abfa-5254005c3644',
'SEARCH_ID': '0a887843a48a49c7bb6dae915dabdcc1'
}
# IP proxy pool
# proxy source flag: 0 = paid proxies, 1 = free proxies, 2 = no proxy
TAGIP = 0
# proxy addresses in 'host:port' form (Http prepends 'http://'); an empty list means no proxy is used
IP = []
# User-Agent pool, one entry is picked at random per request
UA = ['Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0;\
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1))',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; \
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; 360SE)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; 360SE)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \
SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 \
(KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) \
Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) \
Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; \
.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; \
.NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) \
Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0',
'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) \
Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)']
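For reference, this is how the values in setting.py are meant to be consumed: Http.get() draws a random User-Agent from UA, and entries added to IP are wrapped into the proxies dict that requests expects. A short sketch; the proxy address is a placeholder, not a working proxy:

import random
import requests
import setting

headers = {'User-Agent': random.choice(setting.UA)}  # same effect as Http.get()

# If the IP pool has been filled (TAGIP 0 or 1), pick one entry at random.
setting.IP = ['127.0.0.1:8888']  # placeholder 'host:port' entry
proxy = {'http': 'http://' + random.choice(setting.IP)} if setting.IP else None

resp = requests.get('https://www.lagou.com', headers=headers,
                    proxies=proxy, timeout=5)
print(resp.status_code)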