8684网站航班数据获取

原创

mb623d5db7b4a2f 2022-05-30 14:55:37 博主文章分类：python爬虫学习 ©著作权

©著作权归作者所有：来自51CTO博客作者mb623d5db7b4a2f的原创作品，请联系作者获取转载授权，否则将追究法律责任

文章目录

前言
一、首先分析网页
二、编写代码
总结

前言

看了这么多篇selenium，我们来一篇简单实用的。坐飞机是比较方便的一种出行方式了，价格一般情况下和高铁票也差距不大，甚至有时候会更便宜。这次我们就来通过出发地、目的地和出行时间这三个参数来获取航班的数据。来给大家看一看效果。
8684网站航班数据获取_python
嗯，北京到上海2021年5月10日的航班有110次，还是挺多的嘛。

编写代码

1.分析网页

通过抓包我们发现，所需要的数据位于一个内嵌的网页里。
8684网站航班数据获取_xml_02
将这个页面的网址进行测试发现紧跟域名后面的就是出发地和目的地的拼音，然后后面的godate参数就是我们出发的日期。
8684网站航班数据获取_爬虫_03
然后我们来看看这个网址的请求参数。请求头没啥注意的，能多带一些就多带一些。params的话需要注意一下的就是godate为出发日期。cookies里的arrCityPy是出发地的拼音，depCityPy是目的地的拼音。
8684网站航班数据获取_html_04

2.编写代码

首先是获取网页的函数。

def chaxun(start, end, date, timeout=10):
    """Fetch the flight-listing page for one route and date.

    Args:
        start: Pinyin of the departure city. Used as the first part of the
            URL path and sent as the ``arrCityPy`` cookie.
        end: Pinyin of the destination city. Used as the second part of the
            URL path and sent as the ``depCityPy`` cookie.
        date: Departure date, sent as the ``godate`` query parameter.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled server would block the call indefinitely.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # NOTE(review): "arr"/"dep" look swapped relative to start/end, but this
    # mirrors the captured browser request -- confirm before changing.
    cookies = {
        'arrCityPy': start,
        'depCityPy': end,
    }

    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        # NOTE(review): this value looks like a truncated copy of the browser
        # header -- confirm whether the server cares about it.
        'sec-ch-ua': '^\\^',
        'sec-ch-ua-mobile': '?0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    }

    params = (
        ('unionId', '427'),
        ('godate', date),
        ('searchType', '0'),
    )

    response = requests.get(
        f'https://jipiao.114piaowu.com/{start}-{end}.html',
        headers=headers,
        params=params,
        cookies=cookies,
        timeout=timeout,
    )
    return response.content.decode('utf-8')

然后是解析函数。

def jianxi(html):
    """Extract flight records from the flight-listing page.

    Args:
        html: Page markup as text.

    Returns:
        A list with one dict per flight. Each dict has the keys '起飞时间',
        '到达时间', '出发地点', '抵达地点', '航班' and '机票'. '机票' is a list
        of dicts with the keys '舱位', '价格' and '折扣'. Every text value has
        all whitespace removed.

    Raises:
        IndexError: If an expected text node is missing for a flight or a
            fare row.
    """
    def squash(text):
        # Remove every whitespace character, including those inside the text.
        return ''.join(text.split())

    tree = etree.HTML(html)
    flights = []
    # The number of "mainDiv66" nodes decides how many div[n] children of
    # "jp_list" are visited below.
    flight_total = len(tree.xpath('//*[@class="jp_list"]//*[@class="mainDiv66"]'))
    for row_no in range(1, flight_total + 1):
        row = f'//*[@class="jp_list"]/div[{row_no}]'
        info = f'{row}//*[@class="aboutfj"]'

        depart_time = squash(tree.xpath(f'{info}/li[1]/b/text()')[0])
        arrive_time = squash(tree.xpath(f'{info}/li[1]/p/text()')[0])
        depart_place = squash(tree.xpath(f'{info}/li[2]/p[1]/text()')[0])
        arrive_place = squash(tree.xpath(f'{info}/li[2]/p[2]/text()')[0])
        # The flight label is assembled from the first two text nodes.
        flight_parts = tree.xpath(f'{info}/li[3]/p[1]//text()')
        flight_name = squash(flight_parts[0]) + squash(flight_parts[1])
        # The airport-construction/fuel fee (li[4]/p[2]) is not collected.

        fare_total = len(tree.xpath(f'{row}//*[@class="yd_list"]'))
        tickets = []
        for fare_no in range(1, fare_total + 1):
            fare = f'{row}//*[@class="yd_list"][{fare_no}]'
            cabin = squash(tree.xpath(f'{fare}/li[2]/b/text()')[0])
            # The price is assembled from the first two text nodes.
            price_parts = tree.xpath(f'{fare}/li[3]/b//text()')
            price = squash(price_parts[0]) + squash(price_parts[1])
            discount = squash(tree.xpath(f'{fare}/li[3]/em/text()')[0])
            tickets.append({'舱位': cabin, '价格': price, '折扣': discount})

        flights.append({
            '起飞时间': depart_time,
            '到达时间': arrive_time,
            '出发地点': depart_place,
            '抵达地点': arrive_place,
            '航班': flight_name,
            '机票': tickets,
        })
    return flights

3.总的代码

#coding:utf-8
import requests
from lxml import etree
import re
import time

def chaxun(start, end, date, timeout=10):
    """Fetch the flight-listing page for one route and date.

    Args:
        start: Pinyin of the departure city. Used as the first part of the
            URL path and sent as the ``arrCityPy`` cookie.
        end: Pinyin of the destination city. Used as the second part of the
            URL path and sent as the ``depCityPy`` cookie.
        date: Departure date, sent as the ``godate`` query parameter.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled server would block the call indefinitely.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # NOTE(review): "arr"/"dep" look swapped relative to start/end, but this
    # mirrors the captured browser request -- confirm before changing.
    cookies = {
        'arrCityPy': start,
        'depCityPy': end,
    }

    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        # NOTE(review): this value looks like a truncated copy of the browser
        # header -- confirm whether the server cares about it.
        'sec-ch-ua': '^\\^',
        'sec-ch-ua-mobile': '?0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    }

    params = (
        ('unionId', '427'),
        ('godate', date),
        ('searchType', '0'),
    )

    response = requests.get(
        f'https://jipiao.114piaowu.com/{start}-{end}.html',
        headers=headers,
        params=params,
        cookies=cookies,
        timeout=timeout,
    )
    return response.content.decode('utf-8')

def jianxi(html):
    """Extract flight records from the flight-listing page.

    Args:
        html: Page markup as text.

    Returns:
        A list with one dict per flight. Each dict has the keys '起飞时间',
        '到达时间', '出发地点', '抵达地点', '航班' and '机票'. '机票' is a list
        of dicts with the keys '舱位', '价格' and '折扣'. Every text value has
        all whitespace removed.

    Raises:
        IndexError: If an expected text node is missing for a flight or a
            fare row.
    """
    def squash(text):
        # Remove every whitespace character, including those inside the text.
        return ''.join(text.split())

    tree = etree.HTML(html)
    flights = []
    # The number of "mainDiv66" nodes decides how many div[n] children of
    # "jp_list" are visited below.
    flight_total = len(tree.xpath('//*[@class="jp_list"]//*[@class="mainDiv66"]'))
    for row_no in range(1, flight_total + 1):
        row = f'//*[@class="jp_list"]/div[{row_no}]'
        info = f'{row}//*[@class="aboutfj"]'

        depart_time = squash(tree.xpath(f'{info}/li[1]/b/text()')[0])
        arrive_time = squash(tree.xpath(f'{info}/li[1]/p/text()')[0])
        depart_place = squash(tree.xpath(f'{info}/li[2]/p[1]/text()')[0])
        arrive_place = squash(tree.xpath(f'{info}/li[2]/p[2]/text()')[0])
        # The flight label is assembled from the first two text nodes.
        flight_parts = tree.xpath(f'{info}/li[3]/p[1]//text()')
        flight_name = squash(flight_parts[0]) + squash(flight_parts[1])
        # The airport-construction/fuel fee (li[4]/p[2]) is not collected.

        fare_total = len(tree.xpath(f'{row}//*[@class="yd_list"]'))
        tickets = []
        for fare_no in range(1, fare_total + 1):
            fare = f'{row}//*[@class="yd_list"][{fare_no}]'
            cabin = squash(tree.xpath(f'{fare}/li[2]/b/text()')[0])
            # The price is assembled from the first two text nodes.
            price_parts = tree.xpath(f'{fare}/li[3]/b//text()')
            price = squash(price_parts[0]) + squash(price_parts[1])
            discount = squash(tree.xpath(f'{fare}/li[3]/em/text()')[0])
            tickets.append({'舱位': cabin, '价格': price, '折扣': discount})

        flights.append({
            '起飞时间': depart_time,
            '到达时间': arrive_time,
            '出发地点': depart_place,
            '抵达地点': arrive_place,
            '航班': flight_name,
            '机票': tickets,
        })
    return flights

if __name__ == '__main__':
    # Interactive entry point: ask for the route and date, fetch and parse
    # the listing page, then print every flight found.
    # NOTE: the timer starts before the prompts, so the reported duration
    # includes the time spent typing the answers.
    start_time = time.time()
    start1 = input('请输入出发地的拼音：')
    end1 = input('请输入目的地的拼音：')
    date = input('请输入出发日期：').strip()
    # Re-prompt until the whole input is exactly YYYY-MM-DD. fullmatch rejects
    # strings that merely contain a date, and the raw string keeps "\d" from
    # being treated as an invalid string escape.
    while re.fullmatch(r'\d{4}-\d{2}-\d{2}', date) is None:
        date = input('请输入出发日期：').strip()
    print('正在查询')
    html = chaxun(start1, end1, date)
    datas = jianxi(html)
    print(len(datas))
    if datas:
        for data in datas:
            print(data)
    else:
        print('未查询到相关数据')
    end_time = time.time()
    print(f'运行了{end_time - start_time}秒')