Most of the crawlers I had written before could pull their data straight out of the fetched page source, or fell back on Selenium to simulate a real user — and Selenium is painfully slow. This time, while scraping the VenusEye threat intelligence center, I fetched the page source only to find that all the data I needed is loaded dynamically by JavaScript. The source looks like this:
<dl @click="search('domain')" v-show="headerEmail">
<dt>{{langMap['域名'][config.locale]}}:</dt>
<dd>{{headerkeyword.replace(/^(http|https|ftp)\:\/\//,'')}}</dd>
</dl>
<dl @click="search('url')" v-show="headerEmail">
<dt>URL:</dt>
<dd>{{headerkeyword}}</dd>
</dl>
<dl @click="search('haxi')" v-show="headerHash">
<dt>{{langMap['哈希'][config.locale]}}:</dt>
<dd>{{headerkeyword}}</dd>
</dl>
<dl @click="search('ip')" v-show="headerIp">
<dt>IP:</dt>
<dd>{{headerkeyword}}</dd>
</dl>
<dl @click="search('email')">
<dt>{{langMap['邮箱'][config.locale]}}:</dt>
<dd>{{headerkeyword}}</dd>
</dl>
There are two ways to deal with this. The first is to scrape with Selenium, since Selenium gives you exactly what the browser renders (WYSIWYG). But Selenium is clearly a poor fit when the volume of data is large. The second, which is the approach I took, is to analyze the page's network traffic.
First open the page you want to scrape, press F12, switch to the Network tab, click the XHR filter below, then refresh with F5 and the JS-initiated requests appear.
Click one of the entries — here I picked the ip request — and inspect the returned JSON: it is exactly the data we want.
All that remains is to issue the same JSON request directly from Python. In the request's Headers panel, the Request URL under General is the URL to call, and the target field under Form Data is the payload to send with the POST request. General also shows that the Request Method is POST; had it been a GET request there would be no data payload — the parameters would simply be appended to the URL.
With the request URL and data identified, we can start writing the crawler.
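As a quick sanity check, the XHR seen in DevTools can be reconstructed in a few lines. The sketch below only builds the request without sending it, to confirm that the payload matches what DevTools showed under Form Data (the sample IP is hypothetical):

```python
import requests

# Request URL copied from the DevTools Headers panel
url = 'https://www.venuseye.com.cn/ve/ip'
data = {'target': '8.8.8.8'}  # hypothetical sample IP for the 'target' form field

# Prepare the request without sending it, to inspect what goes on the wire
req = requests.Request('POST', url, data=data).prepare()
print(req.method)  # POST
print(req.body)    # target=8.8.8.8 -- same payload DevTools showed under Form Data
```

Calling `requests.post(url, data=data)` with the same arguments sends exactly this request, and `.json()` on the response parses the returned JSON.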
First comes initialization: importing packages, setting request headers, and writing the spreadsheet header row:
import requests, xlwt, time

# Initialization: globals, request headers, and the spreadsheet header row
def init():
    global url_1, url_2, headers, workbook, table, row_now
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        # 'X-Forwarded-For': '9.9.9.9',
        'Forwarded': '9.9.9.9'
    }
    url_1 = 'https://www.venuseye.com.cn/ve/ip'
    url_2 = 'https://www.venuseye.com.cn/ve/ip/ioc'
    workbook = xlwt.Workbook(encoding='utf-8')
    table = workbook.add_sheet("name", cell_overwrite_ok=True)
    value = [
        "ip", "location", "as", "update_time", "tags", "ports", "threat_score",
        "ioc_code", "ioc_update_time", "ioc_categories", "ioc_families", "ioc_organizations"
    ]
    for i in range(len(value)):
        table.write(0, i, value[i])
    row_now = 1
Next, define the function that fetches the data for a given IP — the core of the program. The many existence checks here are deliberate: they keep the crawler robust against fields that are missing from a particular response:
# Fetch the data for the current IP
def get_ip_data(ip_now, pro):
    data = {'target': ip_now}
    result_1 = requests.post(url_1, headers=headers, data=data, proxies=pro).json()
    global row_now
    if result_1['status_code'] == 200:
        if 'ip' in result_1['data']:
            table.write(row_now, 0, result_1['data']['ip'])
        else:
            return False
        # Assemble the location string: country, province, area, company (operator)
        location = ''
        if result_1['data']['cy'] != '':
            location = result_1['data']['cy']
        if result_1['data']['provincial'] != '':
            location = location + ',' + result_1['data']['provincial']
        if result_1['data']['area'] != '':
            location = location + ',' + result_1['data']['area']
        if result_1['data']['company'] != '':
            location = location + ',' + result_1['data']['company']
        if result_1['data']['operator'] != '':
            location = location + '(' + result_1['data']['operator'] + ')'
        table.write(row_now, 1, location)
        # AS number, plus the AS organization if present
        as_data = ''
        if 'asn' in result_1['data']:
            as_data = result_1['data']['asn']
        if 'aso' in result_1['data']:
            if result_1['data']['aso'] != '':
                as_data = str(as_data) + '(' + result_1['data']['aso'] + ')'
        table.write(row_now, 2, as_data)
        # active_time is a Unix timestamp; format it as a date
        if 'active_time' in result_1['data']:
            timeArray = time.localtime(result_1['data']['active_time'])
            table.write(row_now, 3, time.strftime("%Y-%m-%d", timeArray))
        tags = ''
        if 'tags' in result_1['data']:
            if len(result_1['data']['tags']) > 0:
                for now_data in result_1['data']['tags']:
                    tags = tags + now_data + ';'
            else:
                tags = result_1['data']['tags']
        table.write(row_now, 4, tags)
        ports = ''
        if 'ports' in result_1['data']:
            if len(result_1['data']['ports']) > 0:
                for now_data in result_1['data']['ports']:
                    ports = ports + str(now_data) + ';'
            else:
                ports = result_1['data']['ports']
        table.write(row_now, 5, ports)
        if 'threat_score' in result_1['data']:
            table.write(row_now, 6, str(result_1['data']['threat_score']))
    else:
        return False
    # Second request: the IOC endpoint for the same IP
    result_2 = requests.post(url_2, headers=headers, data=data, proxies=pro).json()
    if result_2['status_code'] == 200 and result_2['data']['ioc']:
        ioc = result_2['data']['ioc'][0]
        if 'code' in ioc:
            table.write(row_now, 7, ioc['code'])
        if 'update_time' in ioc:
            timeArray = time.localtime(ioc['update_time'])
            table.write(row_now, 8, time.strftime("%Y-%m-%d", timeArray))
        if 'categories' in ioc:
            categories = ''
            if len(ioc['categories']) > 0:
                for now_data in ioc['categories']:
                    categories = categories + now_data + ';'
            else:
                categories = ioc['categories']
            table.write(row_now, 9, categories)
        if 'families' in ioc:
            families = ''
            if len(ioc['families']) > 0:
                for now_data in ioc['families']:
                    families = families + now_data + ';'
            else:
                families = ioc['families']
            table.write(row_now, 10, families)
        if 'organizations' in ioc:
            organizations = ''
            if len(ioc['organizations']) > 0:
                for now_data in ioc['organizations']:
                    organizations = organizations + now_data + ';'
            else:
                organizations = ioc['organizations']
            table.write(row_now, 11, organizations)
    row_now = row_now + 1
    return True
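The repeated "concatenate list items with semicolons" pattern above could be collapsed into one helper; a small sketch (the helper name is mine, not from the original code — it mirrors the trailing `;` the loops produce and passes non-list values through unchanged):

```python
def join_field(value):
    """Join a list field such as tags/ports/categories into 'a;b;',
    matching the trailing ';' that the per-field loops produce.
    Non-list values (e.g. an already-flat string) pass through as-is."""
    if isinstance(value, list):
        return ''.join(str(item) + ';' for item in value)
    return value

print(join_field(['botnet', 'scanner']))  # botnet;scanner;
print(join_field([80, 443]))              # 80;443;
```

With this helper, each of the tags/ports/categories/families/organizations branches reduces to a single `table.write(row_now, col, join_field(field))` call.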
The target site has anti-crawling measures: once a single IP has made too many requests, the server refuses to serve it. I therefore route requests through proxy IPs. Since the proxy service I use is paid, its API URL is masked with *** below:
# Fetch a fresh proxy IP
def get_new_ip():
    pro = ''
    while True:
        try:
            ip_json = requests.post('***', headers=headers).json()  # call the proxy vendor's API, which returns JSON
            if ip_json['code'] == 0 and ip_json['success'] == 'true':
                pro = {
                    'http': ip_json['data'][0]['IP'],   # + ip_json['data'][0]['Port']
                    'https': ip_json['data'][0]['IP']   # + ip_json['data'][0]['Port']
                }
                # Verify the proxy works by asking httpbin to echo the request origin
                web_data = requests.get('http://httpbin.org/get', headers=headers, proxies=pro)
                print(web_data.text)
                break
            elif ip_json['code'] == 10000:
                print('10000')
                time.sleep(5)
        except:
            print('IP get, try again')
            time.sleep(5)
    return pro
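Note that requests expects each entry in the proxies dict to be a full proxy URL, scheme and port included (e.g. `http://1.2.3.4:8080`), which is presumably why the `Port` concatenation is sketched in the comments above. A safer way to build the dict, assuming the vendor response really carries `IP` and `Port` fields as shown:

```python
def build_proxies(ip, port):
    """Build a requests-style proxies dict from a bare IP and port.
    Both http and https traffic are routed through the same http:// proxy URL."""
    proxy_url = 'http://{}:{}'.format(ip, port)
    return {'http': proxy_url, 'https': proxy_url}

print(build_proxies('1.2.3.4', 8080))
# {'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}
```

A bare IP without scheme or port may be silently ignored or misinterpreted by requests, so building the URL explicitly avoids hard-to-debug proxy failures.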
Finally, the main routine: looping over the list of IPs, handling failures, and saving the results:
# Main routine
if __name__ == '__main__':
    init()
    pro = get_new_ip()
    for line in open("./venuseye_ip爬虫/test.txt"):
        ip_now = line.strip()  # drop the trailing newline before querying
        print(ip_now, end='\t')
        while True:
            try:
                if get_ip_data(ip_now, pro) == True:
                    print('end', end='\t')
                    break
                else:
                    time.sleep(3)
                    print('request failed', end='\t')
                    pro = get_new_ip()
            except:
                print('no response', end='\t')
                pro = get_new_ip()
    workbook.save('./venuseye_ip爬虫/test.xls')