爬虫目的:爬取拉勾网上数据分析岗的数据,以便分析当前数据分析岗的需求现状。

爬虫实现的功能:根据城市名称、岗位名称关键字,即可爬取拉勾网上所需的数据信息。

爬虫的主要模块:

  主函数与信息存储模块main.py

  网页下载模块https.py

  网页解析模块parse.py

  配置模块setting.py(请求头headers、cookies、IP代理池与User-Agent列表)

# main.py
'''
拉勾网对于同一ip的大量请求行为肯定会进行封禁,所以需要准备代理池。
为了实现高自动化,需要对一系列可能出现的异常情况进行处理,断点处理,确保程序不挂。
为了提高效率,加入多线程。
数据持久化,在持久化之前需要先进行清洗。
'''
import https,parse,setting
# import https.Http , parse.Parse , setting.headers ,setting.cookies

import time
import logging
import codecs

# Append ERROR-and-above records to diary.log; every record carries a
# timestamp plus the process id and thread id.
logging.basicConfig(
    filename='diary.log',
    filemode='a',
    level=logging.ERROR,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
)

def process(value):
    """
    Escape one field so it can be embedded in a comma-separated line.

    Double quotes are replaced with single quotes, and a value containing a
    comma or a line break is wrapped in double quotes so it stays in one cell.

    value: field text (str).
    return: the escaped text.
    """
    # Swapping " for ' (instead of doubling it) keeps the wrapping below
    # unambiguous; this is a no-op when the value has no double quote.
    value = value.replace('"', "'")

    # A raw comma would split the cell; a raw CR/LF would split the row.
    if any(ch in value for ch in ',\r\n'):
        value = '"' + value + '"'

    return value

def getInfo(url, para):
    """
    Fetch every result page of one search and collect the parsed positions.

    url: Ajax endpoint to POST to.
    para: form parameters of the search; it is copied, so the caller's dict
          is left untouched while the page number 'pn' is advanced.
    return: list of position dicts; empty if the first request got no
            usable response. Pages that get no response are skipped.
    """
    generalHttp = https.Http()
    query = dict(para)  # work on a copy: 'pn' is rewritten for every page

    # The first request is only used to learn how many pages exist.
    htmlCode = generalHttp.post(url, para=query, headers=setting.headers, cookies=setting.cookies)
    if htmlCode is None:
        # Http.post returns None after a bad status or exhausted retries;
        # handing that to the parser would raise.
        logging.error('GetInfo no response for page count %s' % url)
        return []
    pageCount = parse.Parse(htmlCode).parsePage()

    info = []
    for i in range(1, pageCount + 1):
        print('第%s页' % i)
        query['pn'] = str(i)
        htmlCode = generalHttp.post(url, para=query, headers=setting.headers, cookies=setting.cookies)
        if htmlCode is None:
            logging.error('GetInfo no response for page %s %s' % (i, url))
        else:
            info.extend(getInfoDetail(parse.Parse(htmlCode)))
        time.sleep(2)  # pause between requests, also after a failed page
    return info


def getInfoDetail(generalParse):
    """
    Return the position records of one already-decoded result page.

    generalParse: object exposing parseInfo(); its result is passed through
                  unchanged.
    """
    return generalParse.parseInfo()


def processInfo(info, para):
    """
    Save the collected positions to '<city><kd>职位.csv', encoded as GBK.

    info: list of position dicts; each must contain every key listed in
          `fields` below.
    para: request parameters; 'city' and 'kd' make up the file name.
    return: True on success, None on any failure (the error is printed and
            logged; a partially written file may remain).
    """
    import csv  # local import: only this function needs it

    logging.error('Process start')
    header = ['companyName', 'positionType', 'positionName', 'companyStage', 'companySize',
              'Education', 'WorkYear', 'Salary', 'district', 'latitude', 'longitude',
              'companyType', 'positionLables', 'positionAdvantage', 'companyLabel']
    # Keys read from each position dict, in the same order as the header columns.
    fields = ['companyName', 'positionType', 'positionName', 'companyStage', 'companySize',
              'positionEducation', 'positionWorkYear', 'positionSalary', 'district',
              'latitude', 'longitude', 'companyType', 'positionLables', 'positionAdvantage',
              'companyLabel']
    try:
        # GBK was chosen over UTF-8 because the UTF-8 file showed up garbled
        # when opened directly. errors='replace' keeps a single character
        # outside GBK from aborting the whole export. newline='' plus
        # lineterminator='\n' keeps the original '\n' row ending.
        with open('%s%s职位.csv' % (para['city'], para['kd']), 'w',
                  encoding='gbk', errors='replace', newline='') as out:
            # The csv writer quotes EVERY field that needs it, so a comma in
            # any column (not just the label columns) stays inside its cell.
            writer = csv.writer(out, lineterminator='\n')
            writer.writerow(header)
            for p in info:
                writer.writerow([str(p[key]) for key in fields])
        return True
    except Exception as e:
        # Boundary handler: callers only look at the truthiness of the result.
        print(e)
        logging.error('ProcessExcept %s' % str(e))
        return None


def main(url, para):
    """
    Crawl one search and store its results.

    url: endpoint to query; a falsy value short-circuits to None.
    para: request parameters handed to both the fetch and the store step.
    return: whatever processInfo() returns, or None when url is falsy.
    """
    logging.error('Main start')  # mark the start of a run in the log
    if not url:
        return None
    collected = getInfo(url, para)  # fetch
    return processInfo(collected, para)  # store


if __name__ == '__main__':
    # Ajax endpoint the search form posts to.
    # NOTE(review): how this endpoint was determined is not recorded here.
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    kdList = [u'数据分析']  # search keywords
    cityList = [u'上海']  # cities to crawl
    for keyword in kdList:
        for city in cityList:
            print('爬取%s' % city)
            para = dict(first='true', pn='1', kd=keyword, city=city)
            flag = main(url, para)
            print(('%s爬取成功' if flag else '%s爬取失败') % city)
# https.py
import setting
import requests, random
import logging

# Append ERROR-and-above records to diary.log; every record carries a
# timestamp plus the process id and thread id.
logging.basicConfig(
    filename='diary.log',
    filemode='a',
    level=logging.ERROR,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
)

class Http:
    '''
    HTTP request helpers built on requests: GET/POST with logging and retry.
    '''

    # Character -> percent-escape table applied in one pass by urlprocess().
    _URL_ESCAPES = str.maketrans({'/': '%2F', '=': '%3D', '+': '%2B', ' ': '%20', '?': '%3F'})

    def __init__(self):
        pass

    def get(self, url, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Fetch a page with GET.

        url: page URL
        headers: request headers; a random User-Agent from setting.UA is used when empty
        cookies: cookies to send
        proxy: proxies mapping handed to requests
        timeOut: request timeout in seconds
        timeOutRetry: how many more attempts are made after an exception
        return: response text for status 200/302; otherwise the STRING 'None'
                (not the None object), which is what existing callers of get() receive
        '''
        if not url:
            logging.error('GetError url not exit')
            return 'None'
        logging.error('Get %s' % url)
        try:
            # A fresh random User-Agent is drawn on each attempt when the
            # caller supplied no headers.
            reqHeaders = headers if headers else {'User-Agent': random.choice(setting.UA)}
            # NOTE: no proxy is picked from setting.IP here; pass `proxy` explicitly.
            response = requests.get(url, headers=reqHeaders, cookies=cookies, proxies=proxy, timeout=timeOut)
            if response.status_code in (200, 302):
                htmlCode = response.text
            else:
                htmlCode = 'None'
            logging.error('Get %s %s' % (str(response.status_code), url))
        except Exception as e:
            logging.error('GetExcept %s' % str(e))
            if timeOutRetry > 0:
                # Retry with the caller's own headers/cookies/proxy/timeout;
                # dropping them would send a different request than asked for.
                htmlCode = self.get(url=url, headers=headers, cookies=cookies, proxy=proxy,
                                    timeOut=timeOut, timeOutRetry=(timeOutRetry - 1))
            else:
                logging.error('GetTimeOut %s' % url)
                htmlCode = 'None'
        return htmlCode

    def post(self, url, para, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Send a POST and return the response body.

        url: target URL
        para: form data; an empty value is rejected
        headers: request headers; a fixed Firefox User-Agent is used when empty
        cookies: cookies to send
        proxy: proxies mapping handed to requests
        timeOut: request timeout in seconds
        timeOutRetry: how many more attempts are made after an exception
        return: response text for status 200/302; otherwise None
        '''
        if not url or not para:
            logging.error('PostError url or para not exit')
            return None
        logging.error('Post %s' % url)
        try:
            reqHeaders = headers
            if not reqHeaders:
                reqHeaders = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3'}
            response = requests.post(url, data=para, headers=reqHeaders, cookies=cookies, proxies=proxy, timeout=timeOut)
            if response.status_code in (200, 302):
                htmlCode = response.text
            else:
                htmlCode = None
            logging.error('Post %s %s' % (str(response.status_code), url))
        except Exception as e:
            logging.error('PostExcept %s' % str(e))
            if timeOutRetry > 0:
                # Retry with the caller's own headers/cookies/proxy/timeout;
                # dropping them would send a different request than asked for.
                htmlCode = self.post(url=url, para=para, headers=headers, cookies=cookies, proxy=proxy,
                                     timeOut=timeOut, timeOutRetry=(timeOutRetry - 1))
            else:
                logging.error('PostTimeOut %s' % url)
                htmlCode = None
        return htmlCode

    def confirm(self, htmlCode, url, headers, cookies, proxy, catch_retry=5):
        '''
        Placeholder for anti-crawler / verification-page handling.

        htmlCode: page source
        return: htmlCode unchanged; no ban detection is implemented yet
                (the intended approach is to inspect the page title)
        '''
        return htmlCode

    def urlprocess(self, items):
        '''
        Percent-escape the characters '/', '=', '+', ' ' and '?' in a string.

        Other reserved characters (including '%', '#' and '&') are left
        as they are.

        items: text to escape
        return: escaped text
        '''
        #   +      plus sign (means a space in URLs)      %2B
        #   space  may be written as + or encoded         %20
        #   /      separates directories                  %2F
        #   ?      separates the URL from its parameters  %3F
        #   =      separates a parameter from its value   %3D
        return items.translate(self._URL_ESCAPES)
# parse.py
import re
import demjson

class Parse:
    '''
    Decode one JSON search response and extract data from it.
    '''

    # Patterns removed by parseTool(), compiled once instead of on every call.
    _STRIP_PATTERNS = [re.compile(pattern, re.S) for pattern in (
        '<p.*?>', '</p.*?>', '<b.*?>', '</b.*?>', '<div.*?>', '</div.*?>',
        '</br>', '<br />', '<ul>', '</ul>', '<li>', '</li>', '<strong>',
        '</strong>', '<table.*?>', '<tr.*?>', '</tr>', '<td.*?>', '</td>',
        '\r', '\n', '&.*?;', '&', '#.*?;', '<em>', '</em>')]

    # (output key, key in the response's position record), in output order.
    _FIELD_MAP = (
        ('companyName', 'companyFullName'),
        ('positionType', 'firstType'),
        ('positionName', 'positionName'),
        ('companyStage', 'financeStage'),
        ('companySize', 'companySize'),
        ('positionEducation', 'education'),
        ('positionWorkYear', 'workYear'),
        ('positionSalary', 'salary'),
        ('district', 'district'),
        ('latitude', 'latitude'),
        ('longitude', 'longitude'),
        ('companyType', 'industryField'),
        ('positionLables', 'positionLables'),
        ('companyLabel', 'companyLabelList'),
        ('positionAdvantage', 'positionAdvantage'),
    )

    def __init__(self, htmlCode):
        '''
        htmlCode: response body text; it is decoded as JSON immediately, so
                  whatever demjson.decode raises for bad input propagates.
        '''
        self.htmlCode = htmlCode
        self.json = demjson.decode(htmlCode)

    def parseTool(self, content):
        '''
        Remove HTML tags, entities and line breaks from a string.

        content: text to clean; non-string values are returned unchanged
        return: cleaned text with surrounding whitespace stripped
        '''
        if not isinstance(content, str):
            return content
        for pattern in self._STRIP_PATTERNS:
            content = pattern.sub("", content).strip()
        return content

    def parsePage(self):
        '''
        Work out how many result pages the search has.

        return: ceil(totalCount / resultSize); 0 when either value is not
                positive (e.g. a search without results)
        '''
        result = self.json['content']['positionResult']
        totalCount = int(result['totalCount'])  # total number of positions
        resultSize = int(result['resultSize'])  # positions shown per page
        if totalCount <= 0 or resultSize <= 0:
            # Avoids dividing by zero on an empty result set.
            return 0
        # Ceiling division: an exact multiple must not add an extra, empty page.
        return (totalCount + resultSize - 1) // resultSize

    def parseInfo(self):
        '''
        Extract the fields of interest from every position on the page.

        return: list of dicts keyed by the output names in _FIELD_MAP
        raises: KeyError if a position lacks one of the expected keys
        '''
        return [
            {outKey: position[srcKey] for outKey, srcKey in self._FIELD_MAP}
            for position in self.json['content']['positionResult']['result']
        ]
# setting.py

# Request headers; main.getInfo passes this dict to Http.post for every search request.
# NOTE(review): the values look like a capture of one browser session (the
# Referer points at a specific search page) — confirm they are still accepted.
headers = {
    'Host': 'www.lagou.com',
    'Connection': 'keep-alive',
    # NOTE(review): fixed length that does not depend on the posted form — confirm it is harmless.
    'Content-Length': '23',
    'Origin': 'https://www.lagou.com',
    'X-Anit-Forge-Code': '0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'X-Anit-Forge-Token': 'None',
    'Referer': 'https://www.lagou.com/jobs/list_java?city=%E5%B9%BF%E5%B7%9E&cl=false&fromSearch=true&labelWords=&suginput=',
    # NOTE(review): advertises 'br'; presumably the client must be able to decode brotli — verify.
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
}

# Cookies; main.getInfo passes this dict to Http.post for every search request.
# NOTE(review): these are hard-coded session values, and several embed what
# look like 2017/2018 timestamps — presumably stale; confirm and refresh
# before relying on them.
cookies = {
    'user_trace_token': '20171011085044-36afc724-ae1e-11e7-947d-5254005c3644',
    'LGUID': '20171011085044-36afc9e6-ae1e-11e7-947d-5254005c3644',
    '_ga': 'GA1.2.1411877279.1507683044',
    'index_location_city': '%E5%B9%BF%E5%B7%9E',
    'JSESSIONID': 'ABAAABAAADEAAFI2466B2149D4B3E406932CAEA37FDF471',
    '_gid': 'GA1.2.1604143331.1517585155',
    'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1515000882,1515252738,1516984463,1517585156',
    'LGSID': '20180202232556-5ce93c91-082d-11e8-abfa-5254005c3644', 'PRE_UTM': '',
    'PRE_HOST': '',
    'PRE_SITE': '',
    'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
    'TG-TRACK-CODE': 'index_navigation',
    'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1517585322',
    'LGRID': '20180202232842-c0095589-082d-11e8-abfa-5254005c3644',
    'SEARCH_ID': '0a887843a48a49c7bb6dae915dabdcc1'
}

# IP proxy pool settings.
# Kind of proxy source: 0(pay) or 1(free) or 2(None).
# NOTE(review): no active code visible in this file reads TAGIP — confirm it is still needed.
TAGIP = 0

# Proxy addresses; currently empty, and no active code visible in this file reads it.
IP = []

# User-Agent pool; Http.get draws one at random when the caller passes no headers.
# Long entries are split into adjacent string literals rather than backslash
# continuations, so that source indentation does not leak into the header
# value. Every entry must end with a comma, otherwise neighbours are merged.
UA = [
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3',

    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; '
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1))',

    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; '
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)',

    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; '
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ;  QIHU 360EE)',

    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; '
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; 360SE)',

    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; 360SE)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',

    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; '
    'SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',

    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',

    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 '
    '(KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',

    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',

    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) '
    'Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',

    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) '
    'Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',

    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; '
    '.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',

    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; '
    '.NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',

    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',

    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) '
    'Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',

    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0',

    'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) '
    'Version/5.0.2 Mobile/8C148 Safari/6533.18.5',

    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)',
]