8. 数仓开发之 DIM 层

  • 商品维度表
  • 流程汇总图
  • SKU信息表(sku_info)
  • SPU信息表(spu_info)
  • 一级分类表(base_category1)
  • 二级分类表(base_category2)
  • 三级分类表(base_category3)
  • 品牌表(base_trademark)
  • SKU平台属性值表(sku_attr_value)
  • SKU销售属性表(sku_sale_attr_value)
  • 建表语句
  • 数据装载
  • 优惠券维度表
  • 流程汇总图
  • 建表语句
  • 数据装载
  • 活动维度表
  • 流程汇总图
  • 建表语句
  • 数据装载
  • 地区维度表
  • 流程汇总图
  • 建表语句
  • 数据装载
  • 日期维度表
  • 流程汇总图
  • 建表语句
  • 数据装载
  • 用户维度表
  • 建表语句
  • 分区规划
  • 数据装载
  • 数据装载过程
  • 数据流向
  • 首日装载
  • 每日装载
  • 数据装载脚本
  • 首日装载脚本
  • 每日装载脚本


DIM 层设计要点:

  • DIM 层的设计依据 : 维度建模理论,该层存储维度模型维度表
  • DIM 层的数据存储格式 : orc 列式存储 + snappy 压缩
  • DIM 层表名的命名规范为 dim_表名_全量表 或 拉链表标识( full / zip )

商品维度表

流程汇总图

数据仓库表名规范 数据仓库dim_hadoop

SKU信息表(sku_info)

字段名

字段说明

类型

id

库存id(itemID)

bigint(20)

spu_id

商品id

bigint(20)

price

价格

decimal(10,0)

sku_name

sku名称

varchar(200)

sku_desc

商品规格描述

varchar(2000)

weight

重量

decimal(10,2)

tm_id

品牌(冗余)

bigint(20)

category3_id

三级分类id(冗余)

bigint(20)

sku_default_img

默认显示图片(冗余)

varchar(300)

is_sale

是否销售(1:是 0:否)

tinyint(3)

create_time

创建时间

datetime

SPU信息表(spu_info)

字段名

字段说明

类型

id

商品id

bigint(20)

spu_name

商品名称

varchar(200)

description

商品描述(后台简述)

varchar(1000)

category3_id

三级分类id

bigint(20)

tm_id

品牌id

bigint(20)

一级分类表(base_category1)

字段名

字段说明

类型

id

编号

bigint(20)

name

分类名称

varchar(10)

二级分类表(base_category2)

字段名

字段说明

类型

id

编号

bigint(20)

name

二级分类名称

varchar(200)

category1_id

一级分类编号

bigint(20)

三级分类表(base_category3)

字段名

字段说明

类型

id

编号

bigint(20)

name

三级分类名称

varchar(200)

category2_id

二级分类编号

bigint(20)

品牌表(base_trademark)

字段名

字段说明

类型

id

编号

bigint(20)

tm_name

属性值

varchar(100)

logo_url

品牌logo的图片路径

varchar(200)

SKU平台属性值表(sku_attr_value)

字段名

字段说明

类型

id

编号

bigint(20)

attr_id

属性id(冗余)

bigint(20)

value_id

属性值id

bigint(20)

sku_id

skuid

bigint(20)

attr_name

属性名称

varchar(30)

value_name

属性值名称

varchar(30)

SKU销售属性表(sku_sale_attr_value)

字段名

字段说明

类型

id

id

bigint(20)

sku_id

库存单元id

bigint(20)

spu_id

spu_id(冗余)

int(11)

sale_attr_value_id

销售属性值id

bigint(20)

sale_attr_id

销售属性id

bigint(20)

sale_attr_name

销售属性名

varchar(30)

sale_attr_value_name

销售属性值名称

varchar(30)

建表语句

-- 删除 商品维度表

DROP TABLE IF EXISTS dim_sku_full;

-- 创建 商品维度表

CREATE EXTERNAL TABLE dim_sku_full
(
    `id`                   STRING COMMENT 'sku_id',
    `price`                DECIMAL(16, 2) COMMENT '商品价格',
    `sku_name`             STRING COMMENT '商品名称',
    `sku_desc`             STRING COMMENT '商品描述',
    `weight`               DECIMAL(16, 2) COMMENT '重量',
    `is_sale`              BOOLEAN COMMENT '是否在售',
    `spu_id`               STRING COMMENT 'spu编号',
    `spu_name`             STRING COMMENT 'spu名称',
    `category3_id`         STRING COMMENT '三级分类id',
    `category3_name`       STRING COMMENT '三级分类名称',
    `category2_id`         STRING COMMENT '二级分类id',
    `category2_name`       STRING COMMENT '二级分类名称',
    `category1_id`         STRING COMMENT '一级分类id',
    `category1_name`       STRING COMMENT '一级分类名称',
    `tm_id`                STRING COMMENT '品牌id',
    `tm_name`              STRING COMMENT '品牌名称',
    `sku_attr_values`      ARRAY<STRUCT<attr_id :STRING, value_id :STRING, attr_name :STRING, value_name
                                        :STRING>> COMMENT '平台属性',
    `sku_sale_attr_values` ARRAY<STRUCT<sale_attr_id :STRING,sale_attr_value_id :STRING,sale_attr_name :STRING,sale_attr_value_name
                                        :STRING>> COMMENT '销售属性',
    `create_time`          STRING COMMENT '创建时间'
) COMMENT '商品维度表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dim/dim_sku_full/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

数据装载

-- 装载 商品维度表

with sku as (
    select id,
           spu_id,
           price,
           sku_name,
           sku_desc,
           weight,
           tm_id,
           category3_id,
           is_sale,
           create_time
    from ods_sku_info_full
    where dt = '2020-06-14'
),
     spu as (
         select id,
                spu_name
         from ods_spu_info_full
         where dt = '2020-06-14'
     ),
     c3 as (
         select id,
                name,
                category2_id
         from ods_base_category3_full
         where dt = '2020-06-14'
     ),
     c2 as (
         select id,
                name,
                category1_id
         from ods_base_category2_full
         where dt = '2020-06-14'
     ),
     c1 as (
         select id,
                name
         from ods_base_category1_full
         where dt = '2020-06-14'
     ),
     tm as (
         select id,
                tm_name
         from ods_base_trademark_full
         where dt = '2020-06-14'
     ),
     attr as (
         select sku_id,
                collect_set(named_struct('attr_id', attr_id, 'value_id', value_id, 'attr_name', attr_name, 'value_name',
                                         value_name)) attrs
         from ods_sku_attr_value_full
         where dt = '2020-06-14'
         group by sku_id
     ),
     sale_attr as (
         select sku_id,
                collect_set(named_struct('sale_attr_id', sale_attr_id, 'sale_attr_value_id', sale_attr_value_id,
                                         'sale_attr_name', sale_attr_name, 'sale_attr_value_name',
                                         sale_attr_value_name)) sale_attrs
         from ods_sku_sale_attr_value_full
         where dt = '2020-06-14'
         group by sku_id
     )
insert
overwrite
table
dim_sku_full
partition
(
dt = '2020-06-14'
)
select sku.id,
       sku.price,
       sku.sku_name,
       sku.sku_desc,
       sku.weight,
       sku.is_sale,
       sku.spu_id,
       spu.spu_name,
       sku.category3_id,
       c3.name,
       c3.category2_id,
       c2.name,
       c2.category1_id,
       c1.name,
       sku.tm_id,
       tm.tm_name,
       attr.attrs,
       sale_attr.sale_attrs,
       sku.create_time
from sku
         left join spu on sku.spu_id = spu.id
         left join c3 on sku.category3_id = c3.id
         left join c2 on c3.category2_id = c2.id
         left join c1 on c2.category1_id = c1.id
         left join tm on sku.tm_id = tm.id
         left join attr on sku.id = attr.sku_id
         left join sale_attr on sku.id = sale_attr.sku_id;

优惠券维度表

流程汇总图

数据仓库表名规范 数据仓库dim_大数据_02

建表语句

-- 建 优惠券维度表

DROP TABLE IF EXISTS dim_coupon_full;

CREATE EXTERNAL TABLE dim_coupon_full
(
    `id`               STRING COMMENT '购物券编号',
    `coupon_name`      STRING COMMENT '购物券名称',
    `coupon_type_code` STRING COMMENT '购物券类型编码',
    `coupon_type_name` STRING COMMENT '购物券类型名称',
    `condition_amount` DECIMAL(16, 2) COMMENT '满额数',
    `condition_num`    BIGINT COMMENT '满件数',
    `activity_id`      STRING COMMENT '活动编号',
    `benefit_amount`   DECIMAL(16, 2) COMMENT '减金额',
    `benefit_discount` DECIMAL(16, 2) COMMENT '折扣',
    `benefit_rule`     STRING COMMENT '优惠规则:满元*减*元,满*件打*折',
    `create_time`      STRING COMMENT '创建时间',
    `range_type_code`  STRING COMMENT '优惠范围类型编码',
    `range_type_name`  STRING COMMENT '优惠范围类型名称',
    `limit_num`        BIGINT COMMENT '最多领取次数',
    `taken_count`      BIGINT COMMENT '已领取次数',
    `start_time`       STRING COMMENT '可以领取的开始日期',
    `end_time`         STRING COMMENT '可以领取的结束日期',
    `operate_time`     STRING COMMENT '修改时间',
    `expire_time`      STRING COMMENT '过期时间'
) COMMENT '优惠券维度表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dim/dim_coupon_full/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

数据装载

-- 装载 优惠券维度表
insert overwrite table dim_coupon_full
    partition (dt = '2020-06-14')
select ci.id,
       ci.coupon_name,
       ci.coupon_type,
       coupon_dic.dic_name,
       ci.condition_amount,
       ci.condition_num,
       ci.activity_id,
       ci.benefit_amount,
       ci.benefit_discount,
       case coupon_type
           when '3201' then concat('满', condition_amount, '元减', benefit_amount, '元')
           when '3202' then concat('满', condition_num, '件打', 10 * (1 - benefit_discount), '折')
           when '3203' then concat('减', benefit_amount, '元')
           end benefit_rule,
       ci.create_time,
       ci.range_type,
       range_dic.dic_name,
       ci.limit_num,
       ci.taken_count,
       ci.start_time,
       ci.end_time,
       ci.operate_time,
       ci.expire_time
from (
         select id,
                coupon_name,
                coupon_type,
                condition_amount,
                condition_num,
                activity_id,
                benefit_amount,
                benefit_discount,
                create_time,
                range_type,
                limit_num,
                taken_count,
                start_time,
                end_time,
                operate_time,
                expire_time
         from ods_coupon_info_full
         where dt = '2020-06-14'
     ) ci
         left join (
    select dic_code,
           dic_name
    from ods_base_dic_full
    where dt = '2020-06-14'
      and parent_code = '32'
) coupon_dic
                   on
                       ci.coupon_type = coupon_dic.dic_code
         left join(
    select dic_code,
           dic_name
    from ods_base_dic_full
    where dt = '2020-06-14'
      and parent_code = '33'
) range_dic
                  on
                      range_dic.dic_code = ci.range_type;

活动维度表

流程汇总图

数据仓库表名规范 数据仓库dim_hdfs_03

建表语句

-- 创建 活动维度表

DROP TABLE IF EXISTS dim_activity_full;

CREATE EXTERNAL TABLE dim_activity_full
(
    `activity_rule_id`   STRING COMMENT '活动规则ID',
    `activity_id`        STRING COMMENT '活动ID',
    `activity_name`      STRING COMMENT '活动名称',
    `activity_type_code` STRING COMMENT '活动类型编码',
    `activity_type_name` STRING COMMENT '活动类型名称',
    `activity_desc`      STRING COMMENT '活动描述',
    `start_time`         STRING COMMENT '开始时间',
    `end_time`           STRING COMMENT '结束时间',
    `create_time`        STRING COMMENT '创建时间',
    `condition_amount`   DECIMAL(16, 2) COMMENT '满减金额',
    `condition_num`      BIGINT COMMENT '满减件数',
    `benefit_amount`     DECIMAL(16, 2) COMMENT '优惠金额',
    `benefit_discount`   DECIMAL(16, 2) COMMENT '优惠折扣',
    `benefit_rule`       STRING COMMENT '优惠规则',
    `benefit_level`      STRING COMMENT '优惠级别'
) COMMENT '活动信息表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dim/dim_activity_full/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

数据装载

-- 装载 活动维度表
insert overwrite table dim_activity_full
    partition (dt = '2020-06-14')
select rule.id,
       info.id,
       activity_name,
       rule.activity_type,
       dic.dic_name,
       activity_desc,
       start_time,
       end_time,
       create_time,
       condition_amount,
       condition_num,
       benefit_amount,
       benefit_discount,
       case rule.activity_type
           when '3101' then concat('满', condition_amount, '元减', benefit_amount, '元')
           when '3102' then concat('满', condition_num, '件打', 10 * (1 - benefit_discount), '折')
           when '3103' then concat('打', 10 * (1 - benefit_discount), '折')
           end benefit_rule,
       benefit_level
from (
         select id,
                activity_id,
                activity_type,
                condition_amount,
                condition_num,
                benefit_amount,
                benefit_discount,
                benefit_level
         from ods_activity_rule_full
         where dt = '2020-06-14'
     ) rule
         left join (
    select id,
           activity_name,
           activity_type,
           activity_desc,
           start_time,
           end_time,
           create_time
    from ods_activity_info_full
    where dt = '2020-06-14'
) info
                   on rule.activity_id = info.id
         left join (
    select dic_code,
           dic_name
    from ods_base_dic_full
    where dt = '2020-06-14'
      and parent_code = '31'
) dic
                   on rule.activity_type = dic.dic_code;

地区维度表

流程汇总图

建表语句

-- 地区维度表

DROP TABLE IF EXISTS dim_province_full;

CREATE EXTERNAL TABLE dim_province_full
(
    `id`            STRING COMMENT 'id',
    `province_name` STRING COMMENT '省市名称',
    `area_code`     STRING COMMENT '地区编码',
    `iso_code`      STRING COMMENT '旧版ISO-3166-2编码,供可视化使用',
    `iso_3166_2`    STRING COMMENT '新版IOS-3166-2编码,供可视化使用',
    `region_id`     STRING COMMENT '地区id',
    `region_name`   STRING COMMENT '地区名称'
) COMMENT '地区维度表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dim/dim_province_full/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

数据装载

-- 地区维度表

insert overwrite table dim_province_full
    partition (dt = '2020-06-14')
select province.id,
       province.name,
       province.area_code,
       province.iso_code,
       province.iso_3166_2,
       province.region_id,
       region.region_name
from (
         select id,
                name,
                region_id,
                area_code,
                iso_code,
                iso_3166_2
         from ods_base_province_full
         where dt = '2020-06-14') province
         left join (
    select id,
           region_name
    from ods_base_region_full
    where dt = '2020-06-14'
) region
                   on
                       province.region_id = region.id;

日期维度表

流程汇总图

建表语句

-- 创建 日期维度表

DROP TABLE IF EXISTS dim_date;

CREATE EXTERNAL TABLE dim_date
(
    `date_id`    STRING COMMENT '日期ID',
    `week_id`    STRING COMMENT '周ID,一年中的第几周',
    `week_day`   STRING COMMENT '周几',
    `day`        STRING COMMENT '每月的第几天',
    `month`      STRING COMMENT '一年中的第几月',
    `quarter`    STRING COMMENT '一年中的第几季度',
    `year`       STRING COMMENT '年份',
    `is_workday` STRING COMMENT '是否是工作日',
    `holiday_id` STRING COMMENT '节假日'
) COMMENT '时间维度表'
    STORED AS ORC
    LOCATION '/warehouse/gmall/dim/dim_date/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

数据装载

时间维度表的数据并不是来自于业务系统,而是手动写入,并且由于时间维度表数据的可预见性,无须每日导入,一般可一次性导入一年的数据

创建临时表

-- 创建 日期临时表
DROP TABLE IF EXISTS tmp_dim_date_info;

CREATE EXTERNAL TABLE tmp_dim_date_info
(
    `date_id`    STRING COMMENT '日',
    `week_id`    STRING COMMENT '周ID',
    `week_day`   STRING COMMENT '周几',
    `day`        STRING COMMENT '每月的第几天',
    `month`      STRING COMMENT '第几月',
    `quarter`    STRING COMMENT '第几季度',
    `year`       STRING COMMENT '年',
    `is_workday` STRING COMMENT '是否是工作日',
    `holiday_id` STRING COMMENT '节假日'
) COMMENT '时间维度表'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/tmp/tmp_dim_date_info/';

将数据文件上传到HFDS上临时表路径 /warehouse/gmall/tmp/tmp_dim_date_info

数据仓库表名规范 数据仓库dim_hdfs_04

hadoop fs -put /opt/software/date_info.txt /warehouse/gmall/tmp/tmp_dim_date_info

数据仓库表名规范 数据仓库dim_大数据_05

将其导入时间维度表

-- 导入时间维度表

insert overwrite table dim_date
select *
from tmp_dim_date_info;

检查数据是否导入成功

-- 查询日期维度
select *
from dim_date;

用户维度表

建表语句

-- 用户维度表

DROP TABLE IF EXISTS dim_user_zip;

CREATE EXTERNAL TABLE dim_user_zip
(
    `id`           STRING COMMENT '用户id',
    `login_name`   STRING COMMENT '用户名称',
    `nick_name`    STRING COMMENT '用户昵称',
    `name`         STRING COMMENT '用户姓名',
    `phone_num`    STRING COMMENT '手机号码',
    `email`        STRING COMMENT '邮箱',
    `user_level`   STRING COMMENT '用户等级',
    `birthday`     STRING COMMENT '生日',
    `gender`       STRING COMMENT '性别',
    `create_time`  STRING COMMENT '创建时间',
    `operate_time` STRING COMMENT '操作时间',
    `start_date`   STRING COMMENT '开始日期',
    `end_date`     STRING COMMENT '结束日期'
) COMMENT '用户表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dim/dim_user_zip/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

分区规划

用户拉链表分区

数据仓库表名规范 数据仓库dim_hdfs_06

数据装载

数据装载过程

数据仓库表名规范 数据仓库dim_hdfs_07

数据流向

数据仓库表名规范 数据仓库dim_大数据_08

首日装载

-- 首日装载
insert overwrite table dim_user_zip
    partition (dt = '9999-12-31')
select data.id,
       data.login_name,
       data.nick_name,
       md5(data.name),
       md5(data.phone_num),
       md5(data.email),
       data.user_level,
       data.birthday,
       data.gender,
       data.create_time,
       data.operate_time,
       '2020-06-14' start_date,
       '9999-12-31' end_date
from ods_user_info_inc
where dt = '2020-06-14'
  and type = 'bootstrap-insert';

每日装载

装载思路

数据仓库表名规范 数据仓库dim_hadoop_09

装载语句

-- 每日装载

with tmp as (
    select old.id           old_id,
           old.login_name   old_login_name,
           old.nick_name    old_nick_name,
           old.name         old_name,
           old.phone_num    old_phone_num,
           old.email        old_email,
           old.user_level   old_user_level,
           old.birthday     old_birthday,
           old.gender       old_gender,
           old.create_time  old_create_time,
           old.operate_time old_operate_time,
           old.start_date   old_start_date,
           old.end_date     old_end_date,
           new.id           new_id,
           new.login_name   new_login_name,
           new.nick_name    new_nick_name,
           new.name         new_name,
           new.phone_num    new_phone_num,
           new.email        new_email,
           new.user_level   new_user_level,
           new.birthday     new_birthday,
           new.gender       new_gender,
           new.create_time  new_create_time,
           new.operate_time new_operate_time,
           new.start_date   new_start_date,
           new.end_date     new_end_date
    from (
             select id,
                    login_name,
                    nick_name,
                    name,
                    phone_num,
                    email,
                    user_level,
                    birthday,
                    gender,
                    create_time,
                    operate_time,
                    start_date,
                    end_date
             from dim_user_zip
             where dt = '9999-12-31'
         ) old
             full outer join(
        select id,
               login_name,
               nick_name,
               md5(name)      name,
               md5(phone_num) phone_num,
               md5(email)     email,
               user_level,
               birthday,
               gender,
               create_time,
               operate_time,
               '2020-06-15'   start_date,
               '9999-12-31'   end_date
        from (select data.id,
                     data.login_name,
                     data.nick_name,
                     data.name,
                     data.phone_num,
                     data.email,
                     data.user_level,
                     data.birthday,
                     data.gender,
                     data.create_time,
                     data.operate_time,
                     row_number() over (partition by data.id order by ts desc) rn
              from ods_user_info_inc
              where dt = '2020-06-15'
             ) t1
        where rn = 1
    ) new
                            on old.id = new.id
)
insert
overwrite
table
dim_user_zip
partition
(
dt
)
select if(new_id is not null, new_id, old_id),
       if(new_id is not null, new_login_name, old_login_name),
       if(new_id is not null, new_nick_name, old_nick_name),
       if(new_id is not null, new_name, old_name),
       if(new_id is not null, new_phone_num, old_phone_num),
       if(new_id is not null, new_email, old_email),
       if(new_id is not null, new_user_level, old_user_level),
       if(new_id is not null, new_birthday, old_birthday),
       if(new_id is not null, new_gender, old_gender),
       if(new_id is not null, new_create_time, old_create_time),
       if(new_id is not null, new_operate_time, old_operate_time),
       if(new_id is not null, new_start_date, old_start_date),
       if(new_id is not null, new_end_date, old_end_date),
       if(new_id is not null, new_end_date, old_end_date) dt
from tmp
union all
select old_id,
       old_login_name,
       old_nick_name,
       old_name,
       old_phone_num,
       old_email,
       old_user_level,
       old_birthday,
       old_gender,
       old_create_time,
       old_operate_time,
       old_start_date,
       cast(date_add('2020-06-15', -1) as string) old_end_date,
       cast(date_add('2020-06-15', -1) as string) dt
from tmp
where old_id is not null
  and new_id is not null;

数据装载脚本

首日装载脚本

在 cpu101 的 /home/cpu/bin 目录下创建 ods_to_dim_init.sh

vim ods_to_dim_init.sh

内容 :

#!/bin/bash

APP=gmall

if [ -n "$2" ] ;then
   do_date=$2
else 
   echo "请传入日期参数"
   exit
fi 

dim_user_zip="
insert overwrite table ${APP}.dim_user_zip partition (dt='9999-12-31')
select
    data.id,
    data.login_name,
    data.nick_name,
    md5(data.name),
    md5(data.phone_num),
    md5(data.email),
    data.user_level,
    data.birthday,
    data.gender,
    data.create_time,
    data.operate_time,
    '$do_date' start_date,
    '9999-12-31' end_date
from ${APP}.ods_user_info_inc
where dt='$do_date'
and type='bootstrap-insert';
"

dim_sku_full="
with
sku as
(
    select
        id,
        price,
        sku_name,
        sku_desc,
        weight,
        is_sale,
        spu_id,
        category3_id,
        tm_id,
        create_time
    from ${APP}.ods_sku_info_full
    where dt='$do_date'
),
spu as
(
    select
        id,
        spu_name
    from ${APP}.ods_spu_info_full
    where dt='$do_date'
),
c3 as
(
    select
        id,
        name,
        category2_id
    from ${APP}.ods_base_category3_full
    where dt='$do_date'
),
c2 as
(
    select
        id,
        name,
        category1_id
    from ${APP}.ods_base_category2_full
    where dt='$do_date'
),
c1 as
(
    select
        id,
        name
    from ${APP}.ods_base_category1_full
    where dt='$do_date'
),
tm as
(
    select
        id,
        tm_name
    from ${APP}.ods_base_trademark_full
    where dt='$do_date'
),
attr as
(
    select
        sku_id,
        collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
    from ${APP}.ods_sku_attr_value_full
    where dt='$do_date'
    group by sku_id
),
sale_attr as
(
    select
        sku_id,
        collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
    from ${APP}.ods_sku_sale_attr_value_full
    where dt='$do_date'
    group by sku_id
)
insert overwrite table ${APP}.dim_sku_full partition(dt='$do_date')
select
    sku.id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.is_sale,
    sku.spu_id,
    spu.spu_name,
    sku.category3_id,
    c3.name,
    c3.category2_id,
    c2.name,
    c2.category1_id,
    c1.name,
    sku.tm_id,
    tm.tm_name,
    attr.attrs,
    sale_attr.sale_attrs,
    sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;
"

dim_province_full="
insert overwrite table ${APP}.dim_province_full partition(dt='$do_date')
select
    province.id,
    province.name,
    province.area_code,
    province.iso_code,
    province.iso_3166_2,
    region_id,
    region_name
from
(
    select
        id,
        name,
        region_id,
        area_code,
        iso_code,
        iso_3166_2
    from ${APP}.ods_base_province_full
    where dt='$do_date'
)province
left join
(
    select
        id,
        region_name
    from ${APP}.ods_base_region_full
    where dt='$do_date'
)region
on province.region_id=region.id;
"

dim_coupon_full="
insert overwrite table ${APP}.dim_coupon_full partition(dt='$do_date')
select
    id,
    coupon_name,
    coupon_type,
    coupon_dic.dic_name,
    condition_amount,
    condition_num,
    activity_id,
    benefit_amount,
    benefit_discount,
    case coupon_type
        when '3201' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3202' then concat('满',condition_num,'件打',10*(1-benefit_discount),'折')
        when '3203' then concat('减',benefit_amount,'元')
    end benefit_rule,
    create_time,
    range_type,
    range_dic.dic_name,
    limit_num,
    taken_count,
    start_time,
    end_time,
    operate_time,
    expire_time
from
(
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ${APP}.ods_coupon_info_full
    where dt='$do_date'
)ci
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='32'
)coupon_dic
on ci.coupon_type=coupon_dic.dic_code
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='33'
)range_dic
on ci.range_type=range_dic.dic_code;
"

dim_activity_full="
insert overwrite table ${APP}.dim_activity_full partition(dt='$do_date')
select
    rule.id,
    info.id,
    activity_name,
    rule.activity_type,
    dic.dic_name,
    activity_desc,
    start_time,
    end_time,
    create_time,
    condition_amount,
    condition_num,
    benefit_amount,
    benefit_discount,
    case rule.activity_type
        when '3101' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3102' then concat('满',condition_num,'件打',10*(1-benefit_discount),'折')
        when '3103' then concat('打',10*(1-benefit_discount),'折')
    end benefit_rule,
    benefit_level
from
(
    select
        id,
        activity_id,
        activity_type,
        condition_amount,
        condition_num,
        benefit_amount,
        benefit_discount,
        benefit_level
    from ${APP}.ods_activity_rule_full
    where dt='$do_date'
)rule
left join
(
    select
        id,
        activity_name,
        activity_type,
        activity_desc,
        start_time,
        end_time,
        create_time
    from ${APP}.ods_activity_info_full
    where dt='$do_date'
)info
on rule.activity_id=info.id
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='31'
)dic
on rule.activity_type=dic.dic_code;
"

case $1 in
"dim_user_zip")
    hive -e "$dim_user_zip"
;;
"dim_sku_full")
    hive -e "$dim_sku_full"
;;
"dim_province_full")
    hive -e "$dim_province_full"
;;
"dim_coupon_full")
    hive -e "$dim_coupon_full"
;;
"dim_activity_full")
    hive -e "$dim_activity_full"
;;
"all")
    hive -e "$dim_user_zip$dim_sku_full$dim_province_full$dim_coupon_full$dim_activity_full"
;;
esac

脚本执行权限

chmod 777 ods_to_dim_init.sh

数据仓库表名规范 数据仓库dim_hadoop_10

脚本用法

ods_to_dim_init.sh all 2020-06-14

每日装载脚本

在 cpu101 的 /home/cpu/bin 目录下创建 ods_to_dim.sh

vim ods_to_dim.sh

内容 :

#!/bin/bash

APP=gmall

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$2" ] ;then
    do_date=$2
else 
    do_date=`date -d "-1 day" +%F`
fi

dim_user_zip="
set hive.exec.dynamic.partition.mode=nonstrict;
with
tmp as
(
    select
        old.id old_id,
        old.login_name old_login_name,
        old.nick_name old_nick_name,
        old.name old_name,
        old.phone_num old_phone_num,
        old.email old_email,
        old.user_level old_user_level,
        old.birthday old_birthday,
        old.gender old_gender,
        old.create_time old_create_time,
        old.operate_time old_operate_time,
        old.start_date old_start_date,
        old.end_date old_end_date,
        new.id new_id,
        new.login_name new_login_name,
        new.nick_name new_nick_name,
        new.name new_name,
        new.phone_num new_phone_num,
        new.email new_email,
        new.user_level new_user_level,
        new.birthday new_birthday,
        new.gender new_gender,
        new.create_time new_create_time,
        new.operate_time new_operate_time,
        new.start_date new_start_date,
        new.end_date new_end_date
    from
    (
        select
            id,
            login_name,
            nick_name,
            name,
            phone_num,
            email,
            user_level,
            birthday,
            gender,
            create_time,
            operate_time,
            start_date,
            end_date
        from ${APP}.dim_user_zip
        where dt='9999-12-31'
    )old
    full outer join
    (
        select
            id,
            login_name,
            nick_name,
            md5(name) name,
            md5(phone_num) phone_num,
            md5(email) email,
            user_level,
            birthday,
            gender,
            create_time,
            operate_time,
            '$do_date' start_date,
            '9999-12-31' end_date
        from
        (
            select
                data.id,
                data.login_name,
                data.nick_name,
                data.name,
                data.phone_num,
                data.email,
                data.user_level,
                data.birthday,
                data.gender,
                data.create_time,
                data.operate_time,
                row_number() over (partition by data.id order by ts desc) rn
            from ${APP}.ods_user_info_inc
            where dt='$do_date'
        )t1
        where rn=1
    )new
    on old.id=new.id
)
insert overwrite table ${APP}.dim_user_zip partition(dt)
select
    if(new_id is not null,new_id,old_id),
    if(new_id is not null,new_login_name,old_login_name),
    if(new_id is not null,new_nick_name,old_nick_name),
    if(new_id is not null,new_name,old_name),
    if(new_id is not null,new_phone_num,old_phone_num),
    if(new_id is not null,new_email,old_email),
    if(new_id is not null,new_user_level,old_user_level),
    if(new_id is not null,new_birthday,old_birthday),
    if(new_id is not null,new_gender,old_gender),
    if(new_id is not null,new_create_time,old_create_time),
    if(new_id is not null,new_operate_time,old_operate_time),
    if(new_id is not null,new_start_date,old_start_date),
    if(new_id is not null,new_end_date,old_end_date),
    if(new_id is not null,new_end_date,old_end_date) dt
from tmp
union all
select
    old_id,
    old_login_name,
    old_nick_name,
    old_name,
    old_phone_num,
    old_email,
    old_user_level,
    old_birthday,
    old_gender,
    old_create_time,
    old_operate_time,
    old_start_date,
    cast(date_add('$do_date',-1) as string) old_end_date,
    cast(date_add('$do_date',-1) as string) dt
from tmp
where old_id is not null
and new_id is not null;
"

dim_sku_full="
with
sku as
(
    select
        id,
        price,
        sku_name,
        sku_desc,
        weight,
        is_sale,
        spu_id,
        category3_id,
        tm_id,
        create_time
    from ${APP}.ods_sku_info_full
    where dt='$do_date'
),
spu as
(
    select
        id,
        spu_name
    from ${APP}.ods_spu_info_full
    where dt='$do_date'
),
c3 as
(
    select
        id,
        name,
        category2_id
    from ${APP}.ods_base_category3_full
    where dt='$do_date'
),
c2 as
(
    select
        id,
        name,
        category1_id
    from ${APP}.ods_base_category2_full
    where dt='$do_date'
),
c1 as
(
    select
        id,
        name
    from ${APP}.ods_base_category1_full
    where dt='$do_date'
),
tm as
(
    select
        id,
        tm_name
    from ${APP}.ods_base_trademark_full
    where dt='$do_date'
),
attr as
(
    select
        sku_id,
        collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
    from ${APP}.ods_sku_attr_value_full
    where dt='$do_date'
    group by sku_id
),
sale_attr as
(
    select
        sku_id,
        collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
    from ${APP}.ods_sku_sale_attr_value_full
    where dt='$do_date'
    group by sku_id
)
insert overwrite table ${APP}.dim_sku_full partition(dt='$do_date')
select
    sku.id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.is_sale,
    sku.spu_id,
    spu.spu_name,
    sku.category3_id,
    c3.name,
    c3.category2_id,
    c2.name,
    c2.category1_id,
    c1.name,
    sku.tm_id,
    tm.tm_name,
    attr.attrs,
    sale_attr.sale_attrs,
    sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;
"

dim_province_full="
insert overwrite table ${APP}.dim_province_full partition(dt='$do_date')
select
    province.id,
    province.name,
    province.area_code,
    province.iso_code,
    province.iso_3166_2,
    region_id,
    region_name
from
(
    select
        id,
        name,
        region_id,
        area_code,
        iso_code,
        iso_3166_2
    from ${APP}.ods_base_province_full
    where dt='$do_date'
)province
left join
(
    select
        id,
        region_name
    from ${APP}.ods_base_region_full
    where dt='$do_date'
)region
on province.region_id=region.id;
"

dim_coupon_full="
insert overwrite table ${APP}.dim_coupon_full partition(dt='$do_date')
select
    id,
    coupon_name,
    coupon_type,
    coupon_dic.dic_name,
    condition_amount,
    condition_num,
    activity_id,
    benefit_amount,
    benefit_discount,
    case coupon_type
        when '3201' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3202' then concat('满',condition_num,'件打',10*(1-benefit_discount),'折')
        when '3203' then concat('减',benefit_amount,'元')
    end benefit_rule,
    create_time,
    range_type,
    range_dic.dic_name,
    limit_num,
    taken_count,
    start_time,
    end_time,
    operate_time,
    expire_time
from
(
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ${APP}.ods_coupon_info_full
    where dt='$do_date'
)ci
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='32'
)coupon_dic
on ci.coupon_type=coupon_dic.dic_code
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='33'
)range_dic
on ci.range_type=range_dic.dic_code;
"

dim_activity_full="
insert overwrite table ${APP}.dim_activity_full partition(dt='$do_date')
select
    rule.id,
    info.id,
    activity_name,
    rule.activity_type,
    dic.dic_name,
    activity_desc,
    start_time,
    end_time,
    create_time,
    condition_amount,
    condition_num,
    benefit_amount,
    benefit_discount,
    case rule.activity_type
        when '3101' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3102' then concat('满',condition_num,'件打',10*(1-benefit_discount),'折')
        when '3103' then concat('打',10*(1-benefit_discount),'折')
    end benefit_rule,
    benefit_level
from
(
    select
        id,
        activity_id,
        activity_type,
        condition_amount,
        condition_num,
        benefit_amount,
        benefit_discount,
        benefit_level
    from ${APP}.ods_activity_rule_full
    where dt='$do_date'
)rule
left join
(
    select
        id,
        activity_name,
        activity_type,
        activity_desc,
        start_time,
        end_time,
        create_time
    from ${APP}.ods_activity_info_full
    where dt='$do_date'
)info
on rule.activity_id = info.id
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='31'
)dic
on rule.activity_type = dic.dic_code;
"

case $1 in
"dim_user_zip")
    hive -e "$dim_user_zip"
;;
"dim_sku_full")
    hive -e "$dim_sku_full"
;;
"dim_province_full")
    hive -e "$dim_province_full"
;;
"dim_coupon_full")
    hive -e "$dim_coupon_full"
;;
"dim_activity_full")
    hive -e "$dim_activity_full"
;;
"all")
    hive -e "$dim_user_zip$dim_sku_full$dim_province_full$dim_coupon_full$dim_activity_full"
;;
esac

数据仓库表名规范 数据仓库dim_big data_11

脚本执行权限 :

chmod 777 ods_to_dim.sh

数据仓库表名规范 数据仓库dim_hdfs_12

脚本用法 :

ods_to_dim.sh all 2020-06-14