概括
一个数据分析、离线跑批的一种画像。数据来源信息以及交易数据,进行统计分析为出指导性策略意见。分别有三个数据库在doris里:dim提供数据字典以及翻译,dwd是原数据从数仓抽取数据存放,tdm是清洗过后的数据供打标签使用。
从数仓抽取数据进行保存,针对数据进行加工清洗,对用户的业务数据完成定义标签跑批,机构如下图:
数据开发过程
总体分为三步:
1、 抽取数据例如: (需要对dolphinscheduler、datax、doris进行了解)
dolphinscheduler使用datax进行抽取数据到doris数据库中,作为加工元数据存储。
{
"job": {
"content": [
{
"reader": {
"name": "postgresqlreader",
"parameter": {
"connection": [
{
"jdbcUrl": [
"jdbc:postgresql://${gp_ip}:${gp_port}/appwarehouse2olap"
],
"querySql": [
"select corpid_dwh,data_date,load_date,client_no,address_type,contact_form,address,contact_tel,postal_code,country_code,prov_code,city_code,mobile_phone,email,pref_flag,contacts_name,contacts_tel,update_date,update_teller_no from dwds.ds_cif_address_info where corpid_dwh = '${corpid_dwh}' "
]
}
],
"password": "${gp_password}",
"username": "${gp_admin}",
}
},
"writer": {
"name": "doriswriter",
"parameter": {
"loadUrl": ["${ip}:8030"],
"loadProps": {
},
"column": ["corpid_dwh","data_date","load_date","client_no","address_type","contact_form","address","contact_tel","postal_code","country_code","prov_code","city_code","mobile_phone","email","pref_flag","contacts_name","contacts_tel","update_date","update_teller_no"],
"username": "root",
"password": "",
"postSql": ["select count(1) from dwd_outside_ds_cif_address_info"],
"preSql": [],
"flushInterval":30000,
"connection": [
{
"jdbcUrl": "jdbc:mysql://${ip}:9030/dwd",
"selectedDatabase": "dwd",
"table": ["dwd_outside_ds_cif_address_info"]
}
],
"loadProps": {
"format": "json",
"strip_outer_array": true
}
}
}
}
],
"setting": {
"speed": {
"channel": "5"
}
}
}
}
2、 数据清洗
根据元数据结构进行加工清洗到tdm
insert into tdm.tdm_institution_relation_info
select
t2.client_name as main_body_name ,
t1.relation_name as stakeholders,
-- t1.relation_type as relation ,
t3.property as relation ,
str_to_date(t1.data_date , '%Y%m%d') as relate_date,
'对公' as `type` ,
t1.corpid_dwh as tenant_id,
--- 目前原始数据就缺 机构字段 open_branch_no
t4.branch_name as create_branch
from (select * from dwd.dwd_outside_ds_cif_corp_relation where corpid_dwh = ${corpid_dwh}) t1
left join (select * from dwd.dwd_outside_ds_cif_corp_info where corpid_dwh = ${corpid_dwh}) t2
on nvl(t1.client_no, 100* rand()) = t2.client_no
left join dim.dim_outside_relation_type_dict t3
on nvl(t1.relation_type, 100* rand() ) = t3.id
left join (
select
t3.branch_name ,
t4.client_no
from dim.dim_outside_o_enszhongning_fm_branch_tbl t3
left join (select * from dwd.dwd_outside_ds_cif_corp_info where corpid_dwh = ${corpid_dwh})t4
on t3.branch = t4.open_branch_no
) t4 on t1.client_no = t4.client_no
where t2.client_name is not null and t1.relation_name is not null and t1.relation_name !='' and t2.client_name != '';
3、 原子标签制作
插入label_base标签定义,归属分类
INSERT INTO `dm_label`.`label_base`(`LABEL_CODE`, `LABEL_NAME`, `CATEGORY_ID`, `LABEL_TYPE`, `SERVICE_DESCRIPTION`, `LABEL_STATE`, `LABEL_VALUE_TYPE`, `CALCULATION_METHOD`, `LABEL_VALUE_DOMAIN_DESCRIPTION`, `LABEL_COMPANY`, `MASKING_FLAG`, `CREATION_METHOD`, `START_TIME`, `UPDATE_RULE`, `UPDATE_FREQUENCY`, `LABEL_RUN_TIME`, `ASCRIPTION_ADMINISTRATION`, `ASCRIPTION_DEPARTMENT`, `APPROVER`, `LABEL_ENTITY_ID`, `LB_EXPRESSION`, `LABEL_RULE_SYMBOL`, `TENANT_ID`, `DEL_FLAG`, `CREATOR`, `CREATE_BRANCH`, `CREATE_TIME`, `LAST_UPDATE_USER`, `LAST_UPDATE_TIME`, `PUBLIC_STATUS`) VALUES ('F100U0182', '5年便民储蓄-125', 'X100U', '离线标签', '客户持有5年便民储蓄产品情况', '4', 'string', '离线计算', '客户持有5年便民储蓄产品情况', '', '0', 'base', '0', 'auto', '0', '', '', '', '', '1', '', '', 'XXZY', '0', '标签系统账号', '498199', '2023-08-30 10:42', '标签系统账号', '2023-08-30 10:42', '0');
标签加工过程LABEL_CODE关联label_base标签定义
INSERT INTO `dm_label`.`label_calculation_grade`(`LABEL_CODE`, `LABEL_COUNT_NO`, `LABEL_PROCESS_SQL`, `LABEL_RULE_ID`, `LABEL_RULE_RESULT`, `RULE_VALUE_CODE`, `PROCESS_DATE`, `TENANT_ID`, `DEL_FLAG`, `CREATOR`, `CREATE_BRANCH`, `CREATE_TIME`, `LAST_UPDATE_USER`, `LAST_UPDATE_TIME`) VALUES ('F100U0182', '0', 'select customer_code, date_add( CURDATE(),-1 ) shift_date, tenant_id, create_branch_id,\'F100U0182\' as label_code, \'是\' as label_value ,\'\' label_company,\'boolean\' as label_value_type, \'1\' entity_id from tdm.tdm_custom_product where p_cur_5y_conv_peo_sav_125 = \'1\' and tenant_id IN (?)', '001821', '是', '是', '2023-08-29 13:47:10', '000000', '0', '标签系统账号', 'LB001', '2023-08-29 13:47:10', 'dmlabel', '2023-08-29 13:47:10');
标签加工使用sql加工从清洗的tdm的业务表中加工,‘?’ 用来做saas的替换法人信息,完成数据隔离
SELECT
customer_code,
date_add( CURDATE(),- 1 ) shift_date,
tenant_id,
create_branch_id,
'F100U0182' AS label_code,
'是' AS label_value,
'' label_company,
'boolean' AS label_value_type,
'1' entity_id
FROM
tdm.tdm_custom_product
WHERE
p_cur_5y_conv_peo_sav_125 = '1'
AND tenant_id IN (?)
核心逻辑开发
衍生标签制作模型
/** * 生成SQL1.1 * @param json * @param labelCode * @param labelCompany * @return */public static String Derived_Tag_Sql1(String json, String labelCode,String labelCompany, String labelValueType,String labelEntityId){ ExpressionGenerator2 generator = new ExpressionGenerator2(); Expression2 expression = generator.jsonToExpression(JSON.parseObject(json)); String strWhere = generator.expressionToWhere(expression); //sql生成,拼接多租户 strWhere = strWhere + " AND (tenant_id in (?)) "; String strMax = generator.expressionToMax(expression); String labelCodeWhere = generator.expressionToMaxLabel(expression); labelCodeWhere = labelCodeWhere.substring(0, labelCodeWhere.length()-1); //取出标签值,一个title对应一个标签值 String labelValue = (String) JSON.parseObject(json).get("title"); String sql = ""; String insert = "insert into tdm.tdm_label_history_info (customer_code,shift_date,tenant_id,create_branch,label_code,label_value,label_company,label_value_type,entity_id) "; sql = "select customer_code,shift_date,tenant_id,create_branch,'"+labelCode+"' " + " label_code,'"+labelValue+"' label_value,'"+labelCompany+"' label_company ,'"+ labelValueType + "' label_value_type,'"+ labelEntityId +"' entity_id from (select customer_code," + "date_add( CURDATE(),- 1 ) shift_date,tenant_id,create_branch,"+strMax+" from tdm.tdm_label_history_info where label_code in( " + labelCodeWhere + " ) group by " + "customer_code,tenant_id,create_branch) a" + " where "+ strWhere+" "; log.debug("解析【{}】后的SQL为【{}】",labelCode,sql); return sql;}
生成衍生标签的加工sql 用户标签存储在tdm.tdm_label_history_info。
public String expressionToWhere(Expression2 expression) { String labelCode = expression.getTag(); String labelCode2 = supp.get(whereIndex) + labelCode; whereIndex++; String op = expression.getSymbol(); String value = expression.getText(); String after = expression.getAfter(); String logic = expression.getLogic(); List<Expression2> params = expression.getArr(); StringBuilder builder = new StringBuilder(); int num = 0; if (null != labelCode && null != op && null != value) { num++; builder.append(labelCode2); builder.append(" "); builder.append(op); builder.append(" "); //先判断op是 = 还是 between if ("between".equals(op)) { if (value.contains("[")) { value = value.replace("[", "").replace("]", ""); } if (isNumeric(value)) { // "18","20" int t = Integer.parseInt(value); int t1 = Integer.parseInt(after); if (t > t1){ String t2 = value; value = after; after = t2; } builder.append(value).append(" and ").append(after); } else { value = value.replace("\"",""); String[] split = value.split(","); // "2023-07-21","2023-07-25" //日期,text1 > text2 则替换 SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); try { Date date1 = format.parse(split[0]); Date date2 = format.parse(split[1]); if (date1.after(date2)) { String s3 = split[0]; split[0] = split[1]; split[1] = s3; } } catch (ParseException e) { throw BizExceptionHandler.of("日期格式转换错误",e); } builder.append(split[0]).append(" and ").append(split[1]); } } else if(op.contains("null")){//是否为 is not null、is null }else { if (isNumeric(value)) { builder = new StringBuilder(builder.toString().replace("in", "=")); builder.append("" + value + ""); } else { String replace = ""; if (value.contains("[")) { replace = value.replace("[", "").replace("]", ""); } else { replace = "'" + value + "'"; } builder.append("("); String[] split = replace.split(","); for (int i = 0; i < split.length; i++) { if (i == 0) { builder.append(split[i]); } else { builder.append("," + split[i]); } } builder.append(")"); } } } // where (A0001 > 50) and (A00002 < 30) and (A00003 between 10 and 20) for (Expression2 param : params) { num++; if (num > 1) { builder.append(" "); builder.append(logic); builder.append(" "); } builder.append("("); builder.append(expressionToWhere(param)); builder.append(")"); } //return "(" + builder.toString() + ")"; return builder.toString();}
处理标签关系生成条件。
针对问题排查步骤
1、 确定哪一个标签出现问题:
例如:标签编号:A100U0047 ,本异地客户标志
在label_base中找到label_code等于A100U0047的标签,然后在label_calculation_grade中找到label_code等于A100U0047的所有加工逻辑,
INSERT INTO `dm_label`.`label_calculation_grade`(`LABEL_CODE`, `LABEL_COUNT_NO`, `LABEL_PROCESS_SQL`, `LABEL_RULE_ID`, `LABEL_RULE_RESULT`, `RULE_VALUE_CODE`, `PROCESS_DATE`, `TENANT_ID`, `DEL_FLAG`, `CREATOR`, `CREATE_BRANCH`, `CREATE_TIME`, `LAST_UPDATE_USER`, `LAST_UPDATE_TIME`) VALUES ('A100U0047', '0', 'select customer_code, date_add( CURDATE(),-1 ) shift_date, tenant_id, create_branch_id,\'A100U0047\' as label_code, \'本地\' as label_value ,\'\' label_company,\'string\' as label_value_type, \'1\' entity_id from tdm.tdm_customer_basic_info where is_local_cus_flg = \'01\' and tenant_id IN (?)', '000471', '本地', '本地', '2023-08-29 13:47:10', '000000', '0', '标签系统账号', 'LB001', '2023-08-29 13:47:10', 'dmlabel', '2023-08-29 13:47:10');
INSERT INTO `dm_label`.`label_calculation_grade`(`LABEL_CODE`, `LABEL_COUNT_NO`, `LABEL_PROCESS_SQL`, `LABEL_RULE_ID`, `LABEL_RULE_RESULT`, `RULE_VALUE_CODE`, `PROCESS_DATE`, `TENANT_ID`, `DEL_FLAG`, `CREATOR`, `CREATE_BRANCH`, `CREATE_TIME`, `LAST_UPDATE_USER`, `LAST_UPDATE_TIME`) VALUES ('A100U0047', '0', 'select customer_code, date_add( CURDATE(),-1 ) shift_date, tenant_id, create_branch_id,\'A100U0047\' as label_code, \'异地\' as label_value ,\'\' label_company,\'string\' as label_value_type, \'1\' entity_id from tdm.tdm_customer_basic_info where is_local_cus_flg = \'02\' and tenant_id IN (?)', '000472', '异地', '异地', '2023-08-29 13:47:10', '000000', '0', '标签系统账号', 'LB001', '2023-08-29 13:47:10', 'dmlabel', '2023-08-29 13:47:10');
加工逻辑例如:
SELECT
customer_code,
date_add( CURDATE(),- 1 ) shift_date,
tenant_id,
create_branch_id,
'A100U0047' AS label_code,
'本地' AS label_value,
'' label_company,
'string' AS label_value_type,
'1' entity_id
FROM
tdm.tdm_customer_basic_info
WHERE
is_local_cus_flg = '01'
AND tenant_id IN (?)
分析标签来源于那张业务表那个字段: tdm.tdm_customer_basic_info >is_local_cus_flg
查看加工逻辑是否正确,如果正确请查询数仓数据是否正确,进行问题点修改。
数仓数据模型变动修改
例如:
{
"job": {
"content": [
{
"reader": {
"name": "postgresqlreader",
"parameter": {
"connection": [
{
"jdbcUrl": [
"jdbc:postgresql://${gp_ip}:${gp_port}/appwarehouse2olap"
],
"querySql": [
"select corpid_dwh,data_date,load_date,client_no,address_type,contact_form,address,contact_tel,postal_code,country_code,prov_code,city_code,mobile_phone,email,pref_flag,contacts_name,contacts_tel,update_date,update_teller_no from dwds.ds_cif_address_info where corpid_dwh = '${corpid_dwh}' "
]
}
],
"password": "${gp_password}",
"username": "${gp_admin}",
}
},
"writer": {
"name": "doriswriter",
"parameter": {
"loadUrl": ["${ip}:8030"],
"loadProps": {
},
"column": ["corpid_dwh","data_date","load_date","client_no","address_type","contact_form","address","contact_tel","postal_code","country_code","prov_code","city_code","mobile_phone","email","pref_flag","contacts_name","contacts_tel","update_date","update_teller_no"],
"username": "root",
"password": "",
"postSql": ["select count(1) from dwd_outside_ds_cif_address_info"],
"preSql": [],
"flushInterval":30000,
"connection": [
{
"jdbcUrl": "jdbc:mysql://${ip}:9030/dwd",
"selectedDatabase": "dwd",
"table": ["dwd_outside_ds_cif_address_info"]
}
],
"loadProps": {
"format": "json",
"strip_outer_array": true
}
}
}
}
],
"setting": {
"speed": {
"channel": "5"
}
}
}
}
数仓ds_cif_address_info对应dwd. dwd_outside_ds_cif_address_info针对字段添加变更进行sql语句维护。以及针对下游tdm对应表的字段添加变更进行sql语句维护。使用修改添加字段、修改字段属性的SQL语句。