本人日常java开发一枚,由于公司产品需要开始做数据开发,用户留存分析,只有用户的访问日志:
访问时间,访问平台,设备类型,等 其他需要根据用户所有属性(国家、内外部用户,以及用户角色等来进行筛选统计)
用户留存 ---(日留存 周留存 月留存 )
某天注册的用户数,2日留存率, 7日留存率, 10日留存率, 30日留存率(比例)
2020-10-22 23 34% 45% 23%
2020-10-21 11 34% 45% 23%
2020-10-22 11 34% 45% 23%
刚开始不懂,打算按天去遍历查询汇总查询当前的新用户ids,再作为入参聚合不同查询条件维度的数据后来发现真是傻啊,咨询了厉害的越姐和网上查询的资料 分析得出 如下的表设计模式
1、首先 聚合出用户uid visitdate 作为唯一主键的date_visit_form_log; 记录每天用户访问记录
-- 用户访问天数记录表
drop table if EXISTS `uat_db`.t_basf_user_rention_form_log;
CREATE EXTERNAL TABLE `uat_db`.t_basf_user_rention_form_log(
uid bigint,
visit_date int,
form_visit_date String
);
insert overwrite table `uat_db`.t_basf_user_rention_form_log
select distinct uid,visit_date,from_unixtime(unix_timestamp(cast(visit_date as string),'yyyymmdd'),'yyyy-mm-dd') form_visit_date from `uat_db`.t_basf_user_data_form_log; -- 用户访问周记录表
drop table if EXISTS `uat_db`.t_basf_user_rention_form_week_log;
CREATE EXTERNAL TABLE `uat_db`.t_basf_user_rention_form_week_log(
uid bigint,
visit_weekth int
);
insert overwrite table `uat_db`.t_basf_user_rention_form_week_log
select distinct uid,visit_weekth from `uat_db`.t_basf_user_data_form_log;2、基于上面的记录表 来统计1天 3天 7天 30天的用户记录
-- 用户留存数据表 (hive脚本 由于不支持left join的后面的and关键词 导致left join了很多内容,如果有网友看出问题能更优化一下脚本 不胜感激)
select temp.*, h_basf_user.is_internal, h_basf_user.country_id ,user_role_list.role_ids from (select c.visit_date visit_date,
c.uid new_uid,
c.sub_platform,
d.uid oneday_uid,
e.uid threeDay_uid,
f.uid sevenDay_uid,
g.uid thirtyDay_uid
from (select temp.*,log.sub_platform from (select distinct a.uid, a.visit_date,a.form_visit_date
from `uat_db`.t_basf_user_rention_form_log a
left join `default`.h_basf_user b on a.uid = b.id
where datediff(a.form_visit_date,cast(b.register_date as String))=0) temp
left join (select * from (
select
uid , visit_date ,visit_time, sub_platform,
row_number() over ( partition by uid order by visit_time asc ) num
from
`uat_db`.t_basf_user_data_form_log
) last
where last.num = 1 ) as log on temp.uid = log.uid
where temp.visit_date = log.visit_date
) c
left join
(select a.*
from `uat_db`.t_basf_user_rention_form_log a
left join `default`.h_basf_user b on a.uid = b.id
where datediff(a.form_visit_date,cast(b.register_date as String))=1
) d on c.uid = d.uid
left join
( select a.*
from `uat_db`.t_basf_user_rention_form_log a
left join `default`.h_basf_user b on a.uid = b.id
where datediff(a.form_visit_date,cast(b.register_date as String))=3
) e on c.uid = e.uid
left join
( select a.*
from `uat_db`.t_basf_user_rention_form_log a
left join `default`.h_basf_user b on a.uid = b.id
where datediff(a.form_visit_date,cast(b.register_date as String))=7
) f on c.uid = f.uid
left join
( select a.*
from `uat_db`.t_basf_user_rention_form_log a
left join `default`.h_basf_user b on a.uid = b.id
where datediff(a.form_visit_date,cast(b.register_date as String))=30
) g on c.uid = g.uid) temp left join `default`.h_basf_user on temp.new_uid = h_basf_user.id
left join (select
user_id, concat_ws(',', collect_list(cast (role_id as string))) as role_ids
from
`default`.t_basf_user_role_relation
group by
user_id) as user_role_list on temp.new_uid = user_role_list.user_id;
得到的结果集推送到ES,根据visit_date 聚合 不同的时间间隔访问人数;基本符合需求了
周留存和月留存类似 ,只是时间维度的切换 其他都类似的处理流程;
随笔一个 ,如果有遇到类似的问题 可以借鉴下
3、后来组内讨论发现我这个脚本太复杂,优化方案如下:
---3.1、用户留存新的基础表 日 周 月间隔数据
drop table ${toDB}.t_user_visit_day;
set hive.execution.engine=mr;
create table ${toDB}.t_user_visit_day as
select
a.*,
datediff(a.form_visit_date, cast(b.register_date as String)) as datediff_day,
if( b.register_date IS NULL ,99999999,`default`.diffweek(a.form_visit_date, b.register_date)) as datediff_week,
floor(months_between(a.form_visit_date, b.register_date)) as datediff_month,
concat(year(a.form_visit_date), lpad(weekofyear(a.form_visit_date),2,0)) as visit_weekth,
date_format(a.form_visit_date,'yyyyMM') visit_month,
b.register_date,
concat(year(b.register_date), lpad(weekofyear(b.register_date),2,0)) as register_weekth,
date_format(b.register_date,'yyyyMM') register_month
from
${toDB}.t_basf_user_rention_form_log a
left join ${baseDB}.h_basf_user b on
a.uid = b.id
where a.uid > 0 ;
直接按照不同的访问日期计算和注册时间直接的天数差、周数差、月数差 作为一个大的明细表;
---2、索引的数据源
选取需要的差数,按照日、周、月聚合组合数据推送到ES中
select
u.register_date_num,
u.register_week,
u.register_month,
u.id,
user_re.sub_platform,
u.is_internal as internal_id,
u.country_id,
u.company_id,
u.role_array,
user_datediff_day.datediff_day_array,
user_datediff_week.datediff_week_array,
user_datediff_month.datediff_month_array
from
uat_db.t_basf_user_base u
left join(
select
uid,
collect_list(datediff_day) as datediff_day_array
from
uat_db.t_user_visit_day
where
datediff_day in(1, 2, 3, 4, 5, 6, 30)
group by
uid ) as user_datediff_day on
user_datediff_day.uid = u.id
left join(
select
uid,
collect_list(distinct datediff_week) as datediff_week_array
from
uat_db.t_user_visit_day
where
datediff_week in(1,2,3, 4,5,6,7,8,9, 10)
group by
uid ) as user_datediff_week on
user_datediff_week.uid = u.id
left join(
select
uid,
collect_list(distinct datediff_month) as datediff_month_array
from
uat_db.t_user_visit_day
where
datediff_month in(1, 2, 3)
group by
uid ) as user_datediff_month on
user_datediff_month.uid = u.id
left join (
select uid,visit_date , visit_time, sub_platform from (
select
uid , visit_date , visit_time, sub_platform, row_number() over ( partition by uid order by visit_time asc ) num
from
uat_db.t_basf_user_data_form_log ) last
where
last.num = 1) as user_re on user_re.uid = u.id where user_re.uid = 252 limit 10;
这个表结构 精简了数据脚本,合并了中间表 不失为一个好方法
4、附上 如果能left join 加上and的条件脚本 简单易懂
select
c.visit_date visit_date,
count(distinct c.uid) dayNewUser,
count(distinct d.uid) newDateNewUser,
count(distinct e.uid) threeDateNewUser,
count(distinct f.uid) sevenDateNewUser,
count(distinct g.uid) thirtyDateNewUser
from
(
select a.*, b.visit_date bvisit_date
from `uat_db`.t_basf_user_rention_form_log a
left join `uat_db`.t_basf_user_rention_form_log b on a.uid = b.uid and b.visit_date < a.visit_date
where b.visit_date is null
) c
left join `uat_db`.t_basf_user_data_form_log d on c.uid = d.uid and DATEDIFF(from_unixtime(unix_timestamp(cast(d.visit_date as string),'yyyymmdd'),'yyyy-mm-dd'),from_unixtime(unix_timestamp(cast(c.visit_date as string),'yyyymmdd'),'yyyy-mm-dd'))= 1
left join `uat_db`.t_basf_user_data_form_log e on c.uid = e.uid and DATEDIFF(from_unixtime(unix_timestamp(cast(e.visit_date as string),'yyyymmdd'),'yyyy-mm-dd'),from_unixtime(unix_timestamp(cast(c.visit_date as string),'yyyymmdd'),'yyyy-mm-dd'))= 3
left join `uat_db`.t_basf_user_data_form_log f on c.uid = f.uid and DATEDIFF(from_unixtime(unix_timestamp(cast(f.visit_date as string),'yyyymmdd'),'yyyy-mm-dd'),from_unixtime(unix_timestamp(cast(c.visit_date as string),'yyyymmdd'),'yyyy-mm-dd'))= 7
left join `uat_db`.t_basf_user_data_form_log g on c.uid = g.uid and DATEDIFF(from_unixtime(unix_timestamp(cast(g.visit_date as string),'yyyymmdd'),'yyyy-mm-dd'),from_unixtime(unix_timestamp(cast(c.visit_date as string),'yyyymmdd'),'yyyy-mm-dd'))= 30
group by c.visit_date;