一、前言
数据集包含了2017年11月25日至2017年12月3日之间,有行为的约一百万随机用户的所有行为(行为包括点击、购买、加购、喜欢)。数据集的每一行表示一条用户行为,由用户ID、商品ID、商品类目ID、行为类型和时间戳组成,并以逗号分隔。
二、数据结构
用户行为表ub结构如下:
列英文名 | 列中文名 | 说明 |
user_id | 用户ID | 整数类型,序列化后的用户ID |
item_id | 商品ID | 整数类型,序列化后的商品ID |
cate_id | 商品类型ID | 整数类型,序列化后的商品所属类目ID |
type | 行为类型 | 字符串,枚举类型,包括(‘pv’, ‘buy’, ‘cart’, ‘fav’) |
time | 时间戳 | 行为发生的时间戳 |
源数据中的时间是时间戳,这里将时间戳进行转换,并且增加日期和对应时段用于后续分析。样例数据如下:
user_id | item_id | cate_id | type | time | ts | date |
1 | 2268318 | 2520377 | pv | 1511544070 | 2017-11-25 01:21:10 | 2017-11-25 00:00:00 |
三、数据分析
1 用户访问行为模型
1.1 活跃用户
活跃用户:不计较使用的状况下,频繁打开应用的用户称为活跃用户。
1.1.1 日访问量( PV )
SELECT date,
COUNT(1) PV
FROM ub
GROUP BY date
ORDER BY date ASC
1.1.2 日活跃用户数( UV )
SELECT date,
COUNT(DISTINCT user_id) uv
FROM ub
GROUP BY date
ORDER BY date ASC
1.1.3 连续 n 天访问的用户数(n=1,2,3,…)
注意把 SQL 里的 n 替换为具体的天数
-- 方法1。思路:得到访问日期date前的最近日期newdate,如果两个日期相差n-1天,则为连续。
SELECT COUNT(DISTINCT user_id) AS "连续 n 天访问的用户"
FROM (
SELECT user_id, dt, LEAD(date,n-1,null)OVER(PARTITION BY user_id ORDER BY date ASC) AS newdt
FROM (
SELECT user_id,date
FROM ub
WHERE type='pv'
GROUP BY user_id,date
) a
) b
WHERE DATEDIFF(newdate,date)=n-1
-- 方法2。通过对user_id分组按日期排序后(rn),用访问日期减去rn得到flag_dt,如果连续的话,则flag_dt会相同。
SELECT COUNT(DISTINCT user_id) AS "连续 n 天访问的用户"
FROM (
SELECT user_id
FROM (
SELECT user_id, date, ROW_RUMBER()OVER(PARTITION BY user_id ORDER BY date ASC) AS rn
FROM ub
WHERE type='pv'
) a
GROUP BY user_id,
DATE_SUB(date,rn)
HAVING COUNT(1)>=n
) b
-- 方法3。使用自关联,n值比较小可以使用,如n=3
select count(*) AS "连续 n 天访问的用户"
from (
select t3.user_id
from ub t1
left join ub t2
on t1.user_id = t2.user_id
and t1.date = t2.date + 1
left join ub t3
on t2.user_id = t3.user_id
and t2.date = t3.date + 1
where type='pv'
group by t3.user_id
) t
where t.user_id is not null
1.1.4 连续多天访问的用户,每个用户连续天数不一样
-- 思路:首先利用窗口函数以user_id分组然后按照date排序给出每个date在排序中的位置,然后用求出date与位置的差(记为num)最后按照user_id和num做一个聚合,容易发现同一个num组内的date是连续的值,然后直接计数(count(*))就可以得出结果了。
SELECT user_id,
MAX(dt) - MIN(dt) diff,
COLLECT_SET(dt) dt_array
FROM (
SELECT a.user_id,
a.dt,
dt - rn num
FROM (
SELECT user_id,to_date(date,'yyyymmdd') dt,row_number () over (PARTITION BY user_id ORDER BY date) rn
FROM ub
WHERE type='pv'
GROUP BY user_id,dt
) a
) a
GROUP BY user_id,
num
输出结果如下:
USER_ID DIFF DT_ARRAY
1043736 3.0 {20140815 20140814 20140813 20140812}
1043736 0.0 {20140818}
1043736 1.0 {20140821 20140820}
1043844 0.0 {20140814}
1044090 1.0 {20140812 20140811}
1044090 2.0 {20140816 20140815 20140817}
1044090 0.0 {20140821}
1044264 0.0 {20140810}
1044264 3.0 {20140815 20140814 20140813 20140812}
1044264 5.0 {20140821 20140820 20140822 20140819 20140817 20140818}
结果中user_id = 1043736 的一共访问了7天,其中可以拆分成三个连续的访问模块,分别是连续访问1天、2天和4天
1.1.5 用户首个访问和第二次访问的日期间隔
SELECT user_id,
time,
next_time,
DATEDIFF(next_time,time) diff
FROM (
SELECT user_id,
time,
next_time,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY time ASC) AS rank
FROM (
SELECT user_id,
time,
LEAD(time, 1, NULL) OVER (PARTITION BY user_id ORDER BY time asc) AS next_time
FROM ub
WHERE type='pv'
) a
WHERE next_time IS NOT NULL
)b
WHERE rank=1
1.1.6 超级微笑曲线:30天内独立访问用户数
SELECT visit_days AS "访问天数",
COUNT(user_id) AS "独立用户数"
FROM (
SELECT user_id,
COUNT(DISTINCT date) AS visit_days
FROM ub
WHERE type='pv'
WHERE date BETWEEN CURRENT_DATE() - INTERVAL '30' DAY AND CURRENT_DATE() - INTERVAL '1' DAY
GROUP BY 1
) a
GROUP BY 1
1.2 留存用户
留存用户:某段时间内的新增用户,经过一段时间后,又继续使用应用的被认为是留存用户。
日留存数:前一天新增且今天活跃的用户数。
日留存率:留存用户数除以前一天活跃用户数。
1.2.1 日留存(率)
/* 次日留存:1月1日注册的新用户,在1月2日登陆了app。即登陆日期 - 注册日期 = 1天。
3日留存:1月1日注册的新用户,在1月3日登陆了app。即登陆日期 - 注册日期 = 2天。
7日留存:1月1日注册的新用户,在1月8日登陆了app。即登陆日期 - 注册日期 = 6天。*/
-- 方法1。思路:登录日期-初次登录日期,=0则为新增,=1为次日留存.
SELECT a.min_date dt,
SUM(IF(DATEDIFF(b.date,a.min_date)=0,1,0)) "DAU",
SUM(IF(DATEDIFF(b.date,a.min_date)=1,1,0)) "次日留存人数",
SUM(IF(DATEDIFF(b.date,a.min_date)=2,1,0)) "三日留存人数",
SUM(IF(DATEDIFF(b.date,a.min_date)=6,1,0)) "七日留存人数",
SUM(IF(DATEDIFF(b.date,a.min_date)=1,1,0))/SUM(IF(DATEDIFF(b.date,a.min_date)=0,1,0)) "次日留存率",
SUM(IF(DATEDIFF(b.date,a.min_date)=2,1,0))/SUM(IF(DATEDIFF(b.date,a.min_date)=0,1,0)) "三日留存率",
SUM(IF(DATEDIFF(b.date,a.min_date)=6,1,0))/SUM(IF(DATEDIFF(b.date,a.min_date)=0,1,0)) "七日留存率"
FROM (
SELECT user_id,
MIN(date) min_date
FROM ub
GROUP BY user_id
) a
LEFT JOIN ub b
on a.user_id=b.user_id
GROUP BY a.min_date
ORDER BY a.min_date
-- 方法2。思路:自关联
SELECT t.min_date,
SUM(t.user_id_d0) "DAU",
SUM(t.user_id_d1) "次日留存人数",
SUM(t.user_id_d3) "三日留存用人数",
SUM(t.user_id_d7) "七日留存用人数"
FROM (
SELECT a.min_date,
IF(a.user_id IS NULL,0,1) user_id_d0,
IF(b.user_id IS NULL,0,1) AS user_id_d1,
IF(c.user_id IS NULL,0,1) AS user_id_d3,
IF(d.user_id IS NULL,0,1) AS user_id_d7
FROM (
SELECT user_id,
MIN(date) min_date
FROM ub
GROUP BY user_id
) a
LEFT JOIN ub b
ON DATEDIFF(b.date,a.min_date) = 1
AND a.user_id = b.user_id
LEFT JOIN ub c
ON DATEDIFF(c.date,a.min_date) = 2
AND a.user_id = c.user_id
LEFT JOIN ub d
ON DATEDIFF(d.date,a.min_date) = 6
AND a.user_id = d.user_id
) t
GROUP BY min_date
ORDER BY min_date
输出结果如下:
1.3 沉默用户
沉默用户:只在当天访问过,且访问时间在7天前的用户。
1.3.1 只在当天访问过且访问时间在7天前
SELECT COUNT(*) AS "沉默用户数"
FROM (
SELECT user_id
FROM ub
WHERE type='pv'
GROUP BY user_id
HAVING COUNT(*)=1
AND MIN(date)<DATE_SUB(date,7)
)
1.3.2 过去30天中最近一次访问在7天前
SELECT COUNT(user_id) AS "沉默用户数"
FROM (
SELECT user_id,
DATEDIFF(now(), time) AS diff
FROM (
SELECT user_id,
MAX(time) AS time
FROM ub
WHERE type='pv'
and date BETWEEN '[baseTime]' - INTERVAL '30' DAY AND '[baseTime]' - INTERVAL '1' DAY
GROUP BY 1
) a
)
WHERE diff>7
1.4 流失用户
流失用户:访问时间在一段时间前的用户。
1.4.1 访问时间在30天前
SELECT COUNT(*) AS "流失用户数"
FROM (
SELECT user_id
FROM ub
WHERE type='pv'
GROUP BY user_id
HAVING MAX(date)<DATE_SUB(date,30)
)
1.4.2 最近两次访问时间间隔在30天以上
SELECT COUNT(user_id) AS "流失用户数"
FROM (
SELECT user_id,
DATEDIFF(t2,t1) AS diff
FROM (
SELECT user_id,
LAG(time,1) OVER(PARTITION BY user_id ORDER BY time ASC) AS t1,
time AS t2
FROM ub
WHERE type='pv'
) t
GROUP BY user_id
) r
WHERE diff>30
2 用户消费行为分析
2.1 各类行为随日期的变化
SELECT a.type,
MAX(CASE WHEN a.day = DATE_SUB('[baseTime]',7) THEN a.type_count ELSE 0 end) as "2017-11-25",
MAX(CASE WHEN a.day = DATE_SUB('[baseTime]',6) THEN a.type_count ELSE 0 end) as "2017-11-26",
MAX(CASE WHEN a.day = DATE_SUB('[baseTime]',5) THEN a.type_count ELSE 0 end) as "2017-11-27",
MAX(CASE WHEN a.day = DATE_SUB('[baseTime]',4) THEN a.type_count ELSE 0 end) as "2017-11-28",
MAX(CASE WHEN a.day = DATE_SUB('[baseTime]',3) THEN a.type_count ELSE 0 end) as "2017-11-29",
MAX(CASE WHEN a.day = DATE_SUB('[baseTime]',2) THEN a.type_count ELSE 0 end) as "2017-11-30",
MAX(CASE WHEN a.day = DATE_SUB('[baseTime]',1) THEN a.type_count ELSE 0 end) as "2017-12-01'
from (
SELECT TO_DATE(date,'yyyymmdd') date, type,
COUNT(user_id) as type_count
FROM ub
WHERE type='buy'
WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1'DAY
GROUP BY date, type
) as a
GROUP BY a.type
输出结果如下:
2.2 各类行为转化率
SELECT type,
COUNT(user_id) as count,
ROUND((COUNT(user_id)/(SELECT COUNT(user_id) FROM ub WHERE type='pv')),3)*100 as Proportion
FROM ub
GROUP BY type
ORDER BY Proportion asc
输出结果如下:
2.3 月复购率
SELECT a.umonth,
SUM(IF(cnt>1,1,0) AS repurchase_con,
COUNT(1) AS customs_con,
CONCAT(ROUND(SUM(IF(cnt>1,1,0))/COUNT(1)*100,2),'%') AS repurchase_ratio
FROM (
SELECT to_date(date,'yyyymm') umonth,
user_id,
count(user_id) as cnt
from ub
where type='buy'
group by 1,
2
) a
GROUP BY a.umonth
输出结果如下:
2.4 复购间隔天数分布
SELECT gp_days.user_id,
min(gp_days.day) as min_day,
max(gp_days.day) as max_day,
DATEDIFF(max(gp_days.day),min(gp_days.day)) as DATEDIFF
FROM (
SELECT buy_all.user_id,
buy_all.day
from (
SELECT user_id,
day
from ub
where type='buy'
) as buy_all
join (
SELECT user_id,
count(user_id) as count
from ub
where type='buy'
group by user_id
having count(user_id)>2
) as buy_s
on buy_all.user_id=buy_s.user_id
) as gp_days
GROUP BY gp_days.user_id
输出结果如下:
2.5 过去 7 天购买偏好的商品类型(前 3)
-- 返回值是 list 类型,需要创建为集合类型的标签
SELECT user_id,
MAX(cate_id) AS cate_id,
CONCAT_WS(',',COLLECT_SET(item_id)) AS item_ids
FROM (
SELECT user_id,
cate_id,
item_id,
RANK() OVER (PARTITION BY user_id ORDER BY cnt DESC) AS rank_num
FROM (
SELECT user_id,
item_id,
MAX(cate_id) AS cate_id,
COUNT(*) AS cnt
FROM ub
WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1'DAY
AND type = 'buy'
GROUP BY 1,
2
) a
) b
WHERE rank_num <= 3
GROUP BY 1