概括

一个数据分析、离线跑批的一种画像。数据来源信息以及交易数据,进行统计分析为出指导性策略意见。分别有三个数据库在doris里:dim提供数据字典以及翻译,dwd是原数据从数仓抽取数据存放,tdm是清洗过后的数据供打标签使用。

从数仓抽取数据进行保存,针对数据进行加工清洗,对用户的业务数据完成定义标签跑批,机构如下图:


数据开发过程

总体分为三步:

1、 抽取数据例如: (需要对dolphinscheduler、datax、doris进行了解)

dolphinscheduler使用datax进行抽取数据到doris数据库中,作为加工元数据存储。

{

    "job": {

        "content": [

            {

                "reader": {

                    "name": "postgresqlreader",

                    "parameter": {

                        "connection": [

                            {

                                "jdbcUrl": [

        "jdbc:postgresql://${gp_ip}:${gp_port}/appwarehouse2olap"

        ],

                                "querySql": [

        "select corpid_dwh,data_date,load_date,client_no,address_type,contact_form,address,contact_tel,postal_code,country_code,prov_code,city_code,mobile_phone,email,pref_flag,contacts_name,contacts_tel,update_date,update_teller_no from dwds.ds_cif_address_info where corpid_dwh = '${corpid_dwh}' "

        ]

                            }

                        ],

                        "password": "${gp_password}",

                        "username": "${gp_admin}",

                    }

                },

                "writer": {

                    "name": "doriswriter",

                    "parameter": {

                        "loadUrl": ["${ip}:8030"],

                        "loadProps": {

                        },

                        "column": ["corpid_dwh","data_date","load_date","client_no","address_type","contact_form","address","contact_tel","postal_code","country_code","prov_code","city_code","mobile_phone","email","pref_flag","contacts_name","contacts_tel","update_date","update_teller_no"],

                        "username": "root",

                        "password": "",

                        "postSql": ["select count(1) from dwd_outside_ds_cif_address_info"],

                        "preSql": [],

                        "flushInterval":30000,

                        "connection": [

                          {

                            "jdbcUrl": "jdbc:mysql://${ip}:9030/dwd",

                            "selectedDatabase": "dwd",

                            "table": ["dwd_outside_ds_cif_address_info"]

                          }

                        ],

                        "loadProps": {

                            "format": "json",

                            "strip_outer_array": true

                        }

                    }

                }


            }

        ],

        "setting": {

            "speed": {  

                "channel": "5"

            }

        }

    }

}


2、 数据清洗

根据元数据结构进行加工清洗到tdm

insert into tdm.tdm_institution_relation_info

select

  t2.client_name  as main_body_name ,

  t1.relation_name as stakeholders,

  -- t1.relation_type as relation ,

  t3.property as relation ,

  str_to_date(t1.data_date , '%Y%m%d') as relate_date,

  '对公' as `type` ,

  t1.corpid_dwh as tenant_id,

 --- 目前原始数据就缺 机构字段   open_branch_no

  t4.branch_name as create_branch

from (select * from dwd.dwd_outside_ds_cif_corp_relation where corpid_dwh = ${corpid_dwh}) t1

left join (select * from dwd.dwd_outside_ds_cif_corp_info where corpid_dwh = ${corpid_dwh}) t2

on nvl(t1.client_no, 100* rand()) = t2.client_no


left join dim.dim_outside_relation_type_dict t3

on nvl(t1.relation_type, 100* rand() ) = t3.id


left join (


  select 

   t3.branch_name ,

   t4.client_no


  from dim.dim_outside_o_enszhongning_fm_branch_tbl t3

  left join (select * from dwd.dwd_outside_ds_cif_corp_info where corpid_dwh = ${corpid_dwh})t4

  on t3.branch = t4.open_branch_no  

) t4 on t1.client_no = t4.client_no

where t2.client_name is not null and t1.relation_name is not null and t1.relation_name !='' and t2.client_name != '';


3、 原子标签制作

插入label_base标签定义,归属分类

INSERT INTO `dm_label`.`label_base`(`LABEL_CODE`, `LABEL_NAME`, `CATEGORY_ID`, `LABEL_TYPE`, `SERVICE_DESCRIPTION`, `LABEL_STATE`, `LABEL_VALUE_TYPE`, `CALCULATION_METHOD`, `LABEL_VALUE_DOMAIN_DESCRIPTION`, `LABEL_COMPANY`, `MASKING_FLAG`, `CREATION_METHOD`, `START_TIME`, `UPDATE_RULE`, `UPDATE_FREQUENCY`, `LABEL_RUN_TIME`, `ASCRIPTION_ADMINISTRATION`, `ASCRIPTION_DEPARTMENT`, `APPROVER`, `LABEL_ENTITY_ID`, `LB_EXPRESSION`, `LABEL_RULE_SYMBOL`, `TENANT_ID`, `DEL_FLAG`, `CREATOR`, `CREATE_BRANCH`, `CREATE_TIME`, `LAST_UPDATE_USER`, `LAST_UPDATE_TIME`, `PUBLIC_STATUS`) VALUES ('F100U0182', '5年便民储蓄-125', 'X100U', '离线标签', '客户持有5年便民储蓄产品情况', '4', 'string', '离线计算', '客户持有5年便民储蓄产品情况', '', '0', 'base', '0', 'auto', '0', '', '', '', '', '1', '', '', 'XXZY', '0', '标签系统账号', '498199', '2023-08-30 10:42', '标签系统账号', '2023-08-30 10:42', '0');

标签加工过程LABEL_CODE关联label_base标签定义

INSERT INTO `dm_label`.`label_calculation_grade`(`LABEL_CODE`, `LABEL_COUNT_NO`, `LABEL_PROCESS_SQL`, `LABEL_RULE_ID`, `LABEL_RULE_RESULT`, `RULE_VALUE_CODE`, `PROCESS_DATE`, `TENANT_ID`, `DEL_FLAG`, `CREATOR`, `CREATE_BRANCH`, `CREATE_TIME`, `LAST_UPDATE_USER`, `LAST_UPDATE_TIME`) VALUES ('F100U0182', '0', 'select customer_code, date_add( CURDATE(),-1 ) shift_date, tenant_id, create_branch_id,\'F100U0182\' as label_code, \'是\' as label_value ,\'\' label_company,\'boolean\' as label_value_type, \'1\' entity_id from tdm.tdm_custom_product where p_cur_5y_conv_peo_sav_125 = \'1\' and tenant_id IN (?)', '001821', '是', '是', '2023-08-29 13:47:10', '000000', '0', '标签系统账号', 'LB001', '2023-08-29 13:47:10', 'dmlabel', '2023-08-29 13:47:10');


标签加工使用sql加工从清洗的tdm的业务表中加工,‘?’ 用来做saas的替换法人信息,完成数据隔离

SELECT

customer_code,

date_add( CURDATE(),- 1 ) shift_date,

tenant_id,

create_branch_id,

'F100U0182' AS label_code,

'是' AS label_value,

'' label_company,

'boolean' AS label_value_type,

'1' entity_id

FROM

tdm.tdm_custom_product

WHERE

p_cur_5y_conv_peo_sav_125 = '1'

AND tenant_id IN (?)

核心逻辑开发

衍生标签制作模型


/** * 生成SQL1.1 * @param json * @param labelCode * @param labelCompany * @return */public static String  Derived_Tag_Sql1(String json, String labelCode,String  labelCompany,                                       String labelValueType,String labelEntityId){    ExpressionGenerator2 generator = new ExpressionGenerator2();    Expression2 expression = generator.jsonToExpression(JSON.parseObject(json));    String strWhere = generator.expressionToWhere(expression);    //sql生成,拼接多租户    strWhere = strWhere + " AND  (tenant_id in (?)) ";    String strMax = generator.expressionToMax(expression);    String labelCodeWhere = generator.expressionToMaxLabel(expression);    labelCodeWhere = labelCodeWhere.substring(0, labelCodeWhere.length()-1);    //取出标签值,一个title对应一个标签值    String labelValue = (String) JSON.parseObject(json).get("title");    String sql = "";    String insert  = "insert  into  tdm.tdm_label_history_info (customer_code,shift_date,tenant_id,create_branch,label_code,label_value,label_company,label_value_type,entity_id)      ";    sql = "select   customer_code,shift_date,tenant_id,create_branch,'"+labelCode+"' " +            " label_code,'"+labelValue+"'  label_value,'"+labelCompany+"'   label_company ,'"+ labelValueType +            "' label_value_type,'"+ labelEntityId +"' entity_id  from  (select  customer_code," +            "date_add( CURDATE(),- 1 )  shift_date,tenant_id,create_branch,"+strMax+"  from  tdm.tdm_label_history_info where label_code in( " + labelCodeWhere + " ) group  by " +            "customer_code,tenant_id,create_branch) a" +            "  where  "+ strWhere+" ";    log.debug("解析【{}】后的SQL为【{}】",labelCode,sql);    return sql;}


生成衍生标签的加工sql 用户标签存储在tdm.tdm_label_history_info。


public String expressionToWhere(Expression2 expression) {    String labelCode = expression.getTag();    String labelCode2 = supp.get(whereIndex) + labelCode;    whereIndex++;    String op = expression.getSymbol();    String value = expression.getText();    String after = expression.getAfter();    String logic = expression.getLogic();    List<Expression2> params = expression.getArr();    StringBuilder builder = new StringBuilder();    int num = 0;    if (null != labelCode && null != op && null != value) {        num++;        builder.append(labelCode2);        builder.append(" ");        builder.append(op);        builder.append(" ");        //先判断op是 = 还是 between        if ("between".equals(op)) {            if (value.contains("[")) {                value = value.replace("[", "").replace("]", "");            }            if (isNumeric(value)) {                // "18","20"                int t = Integer.parseInt(value);                int t1 = Integer.parseInt(after);                if (t > t1){                    String t2 = value;                    value = after;                    after = t2;                }                builder.append(value).append(" and ").append(after);            } else {                value = value.replace("\"","");                String[] split = value.split(",");                // "2023-07-21","2023-07-25"                //日期,text1 > text2 则替换                SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");                try {                    Date date1 = format.parse(split[0]);                    Date date2 = format.parse(split[1]);                    if (date1.after(date2)) {                        String s3 = split[0];                        split[0] = split[1];                        split[1] = s3;                    }                } catch (ParseException e) {                    throw BizExceptionHandler.of("日期格式转换错误",e);                }                builder.append(split[0]).append(" and ").append(split[1]);            }        } else if(op.contains("null")){//是否为 is not null、is null        }else {            if (isNumeric(value)) {                builder = new StringBuilder(builder.toString().replace("in", "="));                builder.append("" + value + "");            } else {                String replace = "";                if (value.contains("[")) {                    replace = value.replace("[", "").replace("]", "");                } else {                    replace = "'" + value + "'";                }                builder.append("(");                String[] split = replace.split(",");                for (int i = 0; i < split.length; i++) {                    if (i == 0) {                        builder.append(split[i]);                    } else {                        builder.append("," + split[i]);                    }                }                builder.append(")");            }        }    }    // where (A0001 > 50)  and  (A00002 < 30) and (A00003 between 10 and 20)    for (Expression2 param : params) {        num++;        if (num > 1) {            builder.append(" ");            builder.append(logic);            builder.append(" ");        }        builder.append("(");        builder.append(expressionToWhere(param));        builder.append(")");    }    //return "(" + builder.toString() + ")";    return builder.toString();}

处理标签关系生成条件。

针对问题排查步骤

1、 确定哪一个标签出现问题:

例如:标签编号:A100U0047 ,本异地客户标志

在label_base中找到label_code等于A100U0047的标签,然后在label_calculation_grade中找到label_code等于A100U0047的所有加工逻辑,

INSERT INTO `dm_label`.`label_calculation_grade`(`LABEL_CODE`, `LABEL_COUNT_NO`, `LABEL_PROCESS_SQL`, `LABEL_RULE_ID`, `LABEL_RULE_RESULT`, `RULE_VALUE_CODE`, `PROCESS_DATE`, `TENANT_ID`, `DEL_FLAG`, `CREATOR`, `CREATE_BRANCH`, `CREATE_TIME`, `LAST_UPDATE_USER`, `LAST_UPDATE_TIME`) VALUES ('A100U0047', '0', 'select customer_code, date_add( CURDATE(),-1 ) shift_date, tenant_id, create_branch_id,\'A100U0047\' as label_code, \'本地\' as label_value ,\'\' label_company,\'string\' as label_value_type, \'1\' entity_id from tdm.tdm_customer_basic_info where is_local_cus_flg = \'01\' and tenant_id IN (?)', '000471', '本地', '本地', '2023-08-29 13:47:10', '000000', '0', '标签系统账号', 'LB001', '2023-08-29 13:47:10', 'dmlabel', '2023-08-29 13:47:10');

INSERT INTO `dm_label`.`label_calculation_grade`(`LABEL_CODE`, `LABEL_COUNT_NO`, `LABEL_PROCESS_SQL`, `LABEL_RULE_ID`, `LABEL_RULE_RESULT`, `RULE_VALUE_CODE`, `PROCESS_DATE`, `TENANT_ID`, `DEL_FLAG`, `CREATOR`, `CREATE_BRANCH`, `CREATE_TIME`, `LAST_UPDATE_USER`, `LAST_UPDATE_TIME`) VALUES ('A100U0047', '0', 'select customer_code, date_add( CURDATE(),-1 ) shift_date, tenant_id, create_branch_id,\'A100U0047\' as label_code, \'异地\' as label_value ,\'\' label_company,\'string\' as label_value_type, \'1\' entity_id from tdm.tdm_customer_basic_info where is_local_cus_flg = \'02\' and tenant_id IN (?)', '000472', '异地', '异地', '2023-08-29 13:47:10', '000000', '0', '标签系统账号', 'LB001', '2023-08-29 13:47:10', 'dmlabel', '2023-08-29 13:47:10');

加工逻辑例如:

SELECT

customer_code,

date_add( CURDATE(),- 1 ) shift_date,

tenant_id,

create_branch_id,

'A100U0047' AS label_code,

'本地' AS label_value,

'' label_company,

'string' AS label_value_type,

'1' entity_id

FROM

tdm.tdm_customer_basic_info

WHERE

is_local_cus_flg = '01'

AND tenant_id IN (?)

分析标签来源于那张业务表那个字段: tdm.tdm_customer_basic_info >is_local_cus_flg

查看加工逻辑是否正确,如果正确请查询数仓数据是否正确,进行问题点修改。


数仓数据模型变动修改

例如:

{

    "job": {

        "content": [

            {

                "reader": {

                    "name": "postgresqlreader",

                    "parameter": {

                        "connection": [

                            {

                                "jdbcUrl": [

        "jdbc:postgresql://${gp_ip}:${gp_port}/appwarehouse2olap"

        ],

                                "querySql": [

        "select corpid_dwh,data_date,load_date,client_no,address_type,contact_form,address,contact_tel,postal_code,country_code,prov_code,city_code,mobile_phone,email,pref_flag,contacts_name,contacts_tel,update_date,update_teller_no from dwds.ds_cif_address_info where corpid_dwh = '${corpid_dwh}' "

        ]

                            }

                        ],

                        "password": "${gp_password}",

                        "username": "${gp_admin}",

                    }

                },

                "writer": {

                    "name": "doriswriter",

                    "parameter": {

                        "loadUrl": ["${ip}:8030"],

                        "loadProps": {

                        },

                        "column": ["corpid_dwh","data_date","load_date","client_no","address_type","contact_form","address","contact_tel","postal_code","country_code","prov_code","city_code","mobile_phone","email","pref_flag","contacts_name","contacts_tel","update_date","update_teller_no"],

                        "username": "root",

                        "password": "",

                        "postSql": ["select count(1) from dwd_outside_ds_cif_address_info"],

                        "preSql": [],

                        "flushInterval":30000,

                        "connection": [

                          {

                            "jdbcUrl": "jdbc:mysql://${ip}:9030/dwd",

                            "selectedDatabase": "dwd",

                            "table": ["dwd_outside_ds_cif_address_info"]

                          }

                        ],

                        "loadProps": {

                            "format": "json",

                            "strip_outer_array": true

                        }

                    }

                }


            }

        ],

        "setting": {

            "speed": {  

                "channel": "5"

            }

        }

    }

}



数仓ds_cif_address_info对应dwd. dwd_outside_ds_cif_address_info针对字段添加变更进行sql语句维护。以及针对下游tdm对应表的字段添加变更进行sql语句维护。使用修改添加字段、修改字段属性的SQL语句。