做用户分析离不开用户来源分析。在很多网站/交易平台上,用户在注册或产生消费之前,并不会主动提供自己的来源信息,所以程序员或数据分析师通常会根据用户访问时的IP地址做进一步的划分和判断,这样就能统计出仍在观望(尚未注册或消费)的用户的地域分布情况;此外,有些黑客会利用国外IP对平台发起恶意攻击,确认IP来源之后,我们就可以提前屏蔽一部分恶意攻击行为。
接口
# Lookup URL for the open.onebox.so.com dataApi endpoint: the IP to query is
# appended after "&ip=", and the current epoch time in milliseconds is
# appended directly after "&url=ip".
urls = "".join((
    r"https://open.onebox.so.com/dataApi?callback=jQuery18309089439851148142_1533546946966&type=ip&src=onebox&tpl=0&num=1&query=ip&ip=",
    str(ips),
    "&url=ip",
    str(int(time.time() * 1000)),
))
案例数据源
Python解析代码
# coding: utf-8
# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
from urllib.error import HTTPError ,URLError
import socket #请求超时异常
from bs4 import BeautifulSoup
import requests
import time
import json
import csv
import re
import random
import datetime
import pymysql
def _parse_ip_record(payload_text):
    """Turn the raw lookup reply text into one output row.

    Returns ``[ip, country, province, city, classification, local]`` built
    from keys "ip", "0", "1", "2", "5" and "4" of the JSON object found in
    *payload_text*. Raises ``ValueError`` (``json.JSONDecodeError``) when no
    parseable JSON object is present, and ``AttributeError`` when the decoded
    JSON is not an object.
    """
    # The reply is wrapped in a callback call; keep only the {...} part(s).
    json_text = " ".join(re.findall(r'\{.*\}', payload_text))
    record = json.loads(json_text)

    country = record.get("0")
    province = record.get("1")
    city = record.get("2")
    # Fall back to the next coarser level when a finer one is missing.
    if province is None:
        province = country
    if city is None:
        city = province

    return [record.get("ip"), country, province, city,
            record.get("5"), record.get("4")]


def response(url):
    """Fetch one lookup URL, print the parsed row and append it to the CSV.

    The row is written through the module-level ``writer`` (a ``csv`` writer
    that the caller must create before the first call).

    Network errors, undecodable bodies and malformed/empty JSON are reported
    on stdout and the URL is skipped, so one bad reply does not abort a batch.
    Always returns ``None``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        # Context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(req, timeout=60) as resp:
            body = resp.read().decode("utf-8")
        # NOTE(review): this BeautifulSoup round-trip looks unnecessary for a
        # JSONP payload; kept to preserve the original text handling — confirm
        # before removing.
        body = str(BeautifulSoup(body, "html.parser"))
        row = _parse_ip_record(body)
        print(*row)
        writer.writerows([row])
    except (HTTPError, URLError, socket.timeout, AttributeError, ValueError) as e:
        # ValueError also covers json.JSONDecodeError, UnicodeDecodeError and
        # UnicodeEncodeError. !a keeps this message printable on any console.
        print("skipped {!a}: {!a}".format(url, e))
        return
if __name__ == '__main__':
    # Output table for the lookup results.
    file_name = r"D:\Case_data/360IP归属地查询" + ".csv"
    # `with` guarantees the output is flushed and closed even if a lookup
    # raises part-way through the batch.
    with open(file_name, "w+", newline='', encoding='gb18030') as f:
        # response() writes its rows through this module-level `writer`.
        writer = csv.writer(f, dialect='excel')
        # Write the column names first.
        writer.writerow(['ip_id', 'country', 'province', 'city', 'classification', 'Local'])

        # Load the IPs to query: the 4th column of the input CSV.
        # Rows with fewer than 4 cells (e.g. blank lines) are skipped instead
        # of raising IndexError.
        # NOTE(review): if ips.csv has a header row, its 4th cell is also sent
        # as an "IP" — confirm against the input file.
        with open(r'D:\Case_data/ips.csv', 'rt', encoding='gb18030') as csvfile:
            reader = csv.reader(csvfile)
            column = [row[3] for row in reader if len(row) > 3]

        for ips in column:
            # Random 2-5 second pause between requests.
            rand = random.randint(2, 5)
            time.sleep(rand)
            urls = r"https://open.onebox.so.com/dataApi?callback=jQuery18309089439851148142_1533546946966&type=ip&src=onebox&tpl=0&num=1&query=ip&ip=" + str(ips) + "&url=ip" + str(int(time.time() * 1000))
            response(urls)
分类汇总
## Count rows per province and list provinces in descending order of count
import numpy as np
import pandas as pd

# The results file is written with encoding='gb18030', so it must be read with
# the same encoding; relying on the reader's default breaks on Chinese text
# wherever that default is UTF-8.
df = pd.read_csv(r"D:\Case_data/360IP归属地查询.csv", engine='python', encoding='gb18030')
# `display` is the IPython/Jupyter rich-output helper; this snippet is meant
# to run in a notebook.
display(df.head(5))

# Number of non-null rows for each province, as a one-column frame.
gd = df.groupby("province")["province"].count().to_frame("数量")
gd = gd.sort_values(by=["数量"], ascending=False)
# Share of all rows that have a province, formatted as a percentage string.
gd["占比"] = (gd["数量"] / df["province"].count() * 100).round(2).astype("str") + "%"
display(gd.head(5))