我们做用户分析时离不开用户来源分析。在很多网站/交易平台,用户在没有注册或产生消费时是没有自行上传的来源信息的,所以我们程序员或数据分析师会根据用户访问的IP地址来做进一步划分判断,这样便能统计到访问我们平台的用户分布情况;甚至有些黑客利用国外IP恶意攻击我们平台,当我们确认IP来源后就可以事先屏蔽一些恶意攻击行为。

接口

urls = r"https://open.onebox.so.com/dataApi?callback=jQuery18309089439851148142_1533546946966&type=ip&src=onebox&tpl=0&num=1&query=ip&ip=" + str(ips) + "&url=ip" + str(int(time.time() * 1000))

python批量获取IP地址来源并做分类汇总_IP

python批量获取IP地址来源并做分类汇总_IP_02

案例数据源

python批量获取IP地址来源并做分类汇总_json_03

python解析代码

# coding: utf-8
# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
from urllib.error import  HTTPError ,URLError
import socket  #请求超时异常
from bs4 import BeautifulSoup
import requests
import time
import json
import csv
import re
import random
import datetime
import pymysql


def response(url):
    """Fetch IP geolocation info from the 360 onebox API and append one CSV row.

    Args:
        url: Full request URL including the target IP (built by the main loop).

    Side effects:
        Writes [ip_id, country, province, city, classification, local] through
        the module-level ``writer``; prints the row for progress feedback.
        On network/parse failure the IP is silently skipped (best-effort).
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
        }
        req = urllib.request.Request(url, headers=headers)
        raw = urllib.request.urlopen(req, timeout=60).read().decode("utf-8")
        # The API answers with a JSONP wrapper (jQuery callback); extract the
        # outermost {...} payload before parsing as JSON.
        text = str(BeautifulSoup(raw, "html.parser"))
        payload = " ".join(re.findall(r'\{.*\}', text))
        jsonobj = json.loads(payload)
        country = jsonobj.get("0")
        # Fall back to the next coarser region when a level is missing
        # (no province -> use country; no city -> use province, else country).
        province = jsonobj.get("1")
        if province is None:
            province = country
        city = jsonobj.get("2")
        if city is None:
            city = province if province is not None else country
        classification = jsonobj.get("5")
        ip_id = jsonobj.get("ip")
        local = jsonobj.get("4")
        print(ip_id, country, province, city, classification, local)
        # writerow (single row) instead of writerows wrapped in an extra list.
        writer.writerow([ip_id, country, province, city, classification, local])

    except (HTTPError, URLError, socket.timeout, AttributeError,
            UnicodeEncodeError, json.JSONDecodeError):
        # Best-effort scraping: skip this IP on any network or parse failure.
        return
if __name__ == '__main__':
    # Output CSV for the lookup results (header written first).
    # NOTE: the original called f.close() OUTSIDE the __main__ guard, which
    # raised NameError on import; `with` closes the file on all paths instead.
    file_name = r"D:\Case_data/360IP归属地查询" + ".csv"
    with open(file_name, "w+", newline='', encoding='gb18030') as f:
        writer = csv.writer(f, dialect='excel')
        # 先写入columns_name
        writer.writerow(['ip_id', 'country', 'province', 'city', 'classification', 'Local'])

        # 获取ip数据 — the IPs are in the 4th column of the source CSV.
        with open(r'D:\Case_data/ips.csv', 'rt', encoding='gb18030') as csvfile:
            reader = csv.reader(csvfile)
            column = [row[3] for row in reader]
            for ips in column:
                # Random 2-5 s delay so we do not hammer the API.
                rand = random.randint(2, 5)
                time.sleep(rand)
                urls = (r"https://open.onebox.so.com/dataApi?callback=jQuery18309089439851148142_1533546946966"
                        r"&type=ip&src=onebox&tpl=0&num=1&query=ip&ip=" + str(ips)
                        + "&url=ip" + str(int(time.time() * 1000)))
                response(urls)

python批量获取IP地址来源并做分类汇总_IP_04

分类汇总

## 按照省份聚合并倒序
import numpy as np
import pandas as pd 

# Load the lookup results and summarise by province, largest group first.
df = pd.read_csv(r"D:\Case_data/360IP归属地查询.csv", engine='python')
display(df.head(5))
# Count rows per province, then order by that count descending.
gd = df.groupby(["province"]).agg({"province": ["count"]})
gd.columns = ["数量"]
gd = gd.sort_values(by=["数量"], ascending=False)
# Share of each province over all non-null province rows, rendered as "xx.xx%".
total_rows = df["province"].count()
gd["占比"] = ((gd["数量"] / total_rows) * 100).round(2).astype("str") + "%"
display(gd.head(5))

python批量获取IP地址来源并做分类汇总_IP_05