1、原始表
CREATE EXTERNAL TABLE ods.user (
user_num STRING COMMENT '用户编号',
mobile STRING COMMENT '手机号码',
reg_date STRING COMMENT '注册日期'
)
COMMENT '用户资料表'
PARTITIONED BY (dt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
STORED AS ORC
LOCATION '/ods/user';
2、每日新增及更新表
CREATE EXTERNAL TABLE ods.user_update (
user_num STRING COMMENT '用户编号',
mobile STRING COMMENT '手机号码',
reg_date STRING COMMENT '注册日期'
COMMENT '每日用户资料更新表'
)
PARTITIONED BY (dt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
STORED AS ORC
LOCATION '/ods/user_update';
3、拉链表
CREATE EXTERNAL TABLE dws.user_his (
user_num STRING COMMENT '用户编号',
mobile STRING COMMENT '手机号码',
reg_date STRING COMMENT '用户编号',
t_start_date ,
t_end_date
)COMMENT '用户资料拉链表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
STORED AS ORC
LOCATION '/dws/user_his';
4、增量比对算法
算法说明:
ods.user_update表里存放新增的及有变更的数据,对于新删除了的数据没法记录,故叫做“增量比对”算法。
1、union前面的是按已有历史拉链表里计算,包含关链的(与update关联上的)及不需要改变的(与update关联不上的,包含end_time是9999-12-31的及end_time是其他时间的,即状态正常的及已经结束的)。
2、union后面的是新增开链的数据,包含纯新开链的及有变更的开链(在union之前,这部分已经关链)。
3、这里感觉应该使用union all,以提高效率,两边应该不会有重复数据。
INSERT OVERWRITE TABLE dws.user_his
SELECT * FROM
(
SELECT A.user_num,
A.mobile,
A.reg_date,
A.t_start_time,
CASE
WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NOT NULL THEN '2017-01-01'
ELSE A.t_end_time
END AS t_end_time
FROM dws.user_his AS A
LEFT JOIN ods.user_update AS B
ON A.user_num = B.user_num
UNION
SELECT C.user_num,
C.mobile,
C.reg_date,
'2017-01-02' AS t_start_time,
'9999-12-31' AS t_end_time
FROM ods.user_update AS C
) AS T;
5、增量比对算法带参
INSERT OVERWRITE TABLE dws.user_his
SELECT * FROM
(
SELECT A.user_num,
A.mobile,
A.reg_date,
A.modified_time
A.t_start_time,
CASE
WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NOT NULL THEN date_add($date,-1)
ELSE A.t_end_time
END AS t_end_time
FROM dws.user_his AS A
LEFT JOIN ods.user_update AS B
ON A.user_num = B.user_num
UNION
SELECT C.user_num,
C.mobile,
C.reg_date,
C.modified_time,
$date AS t_start_time,
'9999-12-31' AS t_end_time
FROM ods.user_update AS C
) AS T;
6、全量比对算法
ods.user_full是当日全量表,表结构略去。
INSERT OVERWRITE TABLE dws.user_his
SELECT * FROM
(
SELECT A.user_num,
A.mobile,
A.reg_date,
A.t_start_time,
CASE
WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NULL THEN date_add($date,-1) --老拉链表里有,当天全量表里没有,则数据关链;新增不重开链
WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NOT NULL and (A.mobile <> B.mobile or A.reg_date<>B.reg_date) THEN date_add($date,-1) --老拉链表和当天全量表都有,但值发生了变化,则数据关链;同时新增开链操作
ELSE A.t_end_time --其他(1、已经关链的数据;2、未关链但老拉链表和当天全量对比信息没有变化的数据)end_time不变
END t_end_time
FROM dws.user_his A
LEFT JOIN ods.user_full B
ON A.user_num = B.user_num
UNION ALL
SELECT A.user_num,
A.mobile,
A.reg_date,
$date t_start_time,
'9999-12-31' t_end_time
FROM ods.user_full A
INNER JOIN (select b1.*,row_number()over(partition by b1.user_num order by b1.t_end_time desc) rn from dws.user_his b1 where t_start_time<=date_add($date,-1)) B
ON A.user_num = B.user_num
WHERE b.rn=1 and (A.mobile <> B.mobile or A.reg_date<>B.reg_date) --比对值变更关链又新开链数据
UNION ALL
SELECT A.user_num,
A.mobile,
A.reg_date,
$date t_start_time,
'9999-12-31' t_end_time
FROM ods.user_full A
LEFT JOIN dws.user_his B
ON A.user_num = B.user_num
WHERE B.user_num IS NULL --纯新增开链数据
) T;
7、全量比对算法考虑回滚
拉链数据支持从历史上某一天跑到当天。跑数时需谨慎,因为一旦重跑,那天之后的数据将消失。
INSERT OVERWRITE TABLE dws.user_his
SELECT * FROM
(
SELECT A.user_num,
A.mobile,
A.reg_date,
A.t_start_time,
CASE
WHEN A.t_end_time = '9999-12-31' AND B.user_num IS NOT NULL and (A.mobile <> B.mobile or A.reg_date<>B.reg_date) THEN date_add($date,-1) --老拉链表和当天全量表都有,但值发生了变化,则数据关链;同时新增开链操作
WHEN A.t_end_time >= date_add($date,-1) AND B.user_num IS NULL THEN date_add($date,-1) --老拉链表里有,当天全量表里没有(包含状态正常的需要关链,及关链日期在date_add($date,-1)之后的两种情况),则数据关链到date_add($date,-1);新增不重开链
ELSE A.t_end_time --其他(1、已经关链且关链日期小于date_add($date,-1)的数据;2、未关链但老拉链表和当天全量对比信息没有变化的数据)end_time不变
END t_end_time
FROM dws.user_his A
LEFT JOIN ods.user_full B
ON A.user_num = B.user_num
WHERE A.t_start_time<=date_add($date,-1) --限制跑数$date的前一天数据日期,$date及其之后日期的数据都将删掉,所以,日期选择要慎重
UNION ALL
SELECT A.user_num,
A.mobile,
A.reg_date,
$date t_start_time,
'9999-12-31' t_end_time
FROM ods.user_full A
INNER JOIN (select b1.*,row_number()over(partition by b1.user_num order by b1.t_end_time desc) rn from dws.user_his b1 where t_start_time<=date_add($date,-1)) B
ON A.user_num = B.user_num
WHERE B.rn=1 and ((A.mobile <> B.mobile or A.reg_date<>B.reg_date) or B.t_end_time<date_add($date,-1)) --当前全量表数据与历史开始日期小于date_add($date,-1)的比对值变更关链又新开链数据;另外,包含了一部分内容没变化中间少若干天数据,后来又重新来过,重新开链的数据
UNION ALL
SELECT A.user_num,
A.mobile,
A.reg_date,
$date t_start_time,
'9999-12-31' t_end_time
FROM ods.user_full A
LEFT JOIN (select * from dws.user_his where t_start_time<=date_add($date,-1)) B --限制历史拉链表的数据为date_add($date,-1)及之前的
ON A.user_num = B.user_num
WHERE B.user_num IS NULL --纯新增开链数据
) T;