MySQL table structure migration
- First, query MySQL for all column types used by the tables to be migrated;
--list every column type present across all tables in the tjbb schema
select
distinct data_type
from information_schema.columns
where table_schema = 'tjbb'
;
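It can also help to see how many columns use each type, to prioritise the mapping work; a variant of the same dictionary query does that:
--count columns per type in the tjbb schema
select data_type, count(*) column_cnt
from information_schema.columns
where table_schema = 'tjbb'
group by data_type
order by column_cnt desc;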
- Look up the MySQL-to-PostgreSQL type mapping, for example in the MySQL Workbench manual:
MySQL :: MySQL Workbench Manual :: 10.6.4 PostgreSQL Type Mapping
- Generate the Greenplum CREATE TABLE statements;
--first raise the group_concat length limit so long column lists are not truncated
SET SESSION group_concat_max_len=102400;
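The default limit is only 1024 bytes, which would silently truncate the DDL of wide tables; the effective value can be verified with:
show session variables like 'group_concat_max_len';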
--generate the GP DDL for every table in the tjbb schema
select table_name, table_schema,
concat('create table ', concat('"',table_schema,'"'), '.', concat('"',table_name,'"'), ' (
', group_concat(
    concat(concat('"',lower(column_name),'"'), ' ',
        -- type mapping: extend these branches for any additional source types
        case when data_type = 'int' then data_type
             when data_type = 'varchar' then replace(column_type, 'varchar(0)', 'varchar(1)')
             when data_type = 'char' then column_type
             when data_type = 'date' then column_type
             when data_type = 'datetime' then replace(column_type, data_type, 'timestamp')
             when data_type = 'timestamp' then column_type
             when data_type = 'bigint' then data_type
             when data_type = 'double' then 'double precision'
             when data_type = 'smallint' then 'smallint'
             when data_type = 'decimal' then replace(column_type, 'unsigned zerofill', '')
             when data_type = 'longtext' then 'text'
             when data_type = 'text' then 'text'
             when data_type = 'tinyint' then 'int'
             when data_type = 'longblob' then 'bytea'
             when data_type = 'blob' then 'bytea'
             when data_type = 'float' then 'real'
             when data_type = 'tinytext' then 'text'
             when data_type = 'mediumtext' then 'text'
        end,
        -- nullability suffix
        case when is_nullable = 'NO' then ' not null' else '' end)
    order by ORDINAL_POSITION separator ',
'), '
) distributed by (',
    -- distribution key: primary key first, else a unique key, else the first column
    coalesce(
        group_concat(case when column_key = 'PRI' then concat('"',lower(column_name),'"') end order by ORDINAL_POSITION separator ','),
        group_concat(case when column_key = 'UNI' then concat('"',lower(column_name),'"') end order by ORDINAL_POSITION separator ','),
        group_concat(case when ORDINAL_POSITION = 1 then concat('"',lower(column_name),'"') end separator ',')),
    ') ;'
) psql_ddl
from information_schema.columns
where table_schema = 'tjbb'
group by table_name, table_schema;
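As a quick sanity check: for a hypothetical source table tjbb.t_order (id int primary key, order_no varchar(32), amount decimal(12,2), created_at datetime, everything but id nullable), the query above would emit a psql_ddl value like:
create table "tjbb"."t_order" (
"id" int not null,
"order_no" varchar(32),
"amount" decimal(12,2),
"created_at" timestamp
) distributed by ("id") ;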
Notes:
① Add a mapping branch to the case when expression for every column type that actually occurs in the source;
② Column defaults may contain MySQL keywords, so default values are deliberately ignored;
③ The table's primary key is preferred as the GP distribution key; if there is no primary key, a unique index is used; if there is no unique index either, the first column is used;
④ By default MySQL treats table, schema, and database names as case-sensitive but column names as case-insensitive, so table and schema names are taken verbatim from the data dictionary and double-quoted in GP, while column names are lowercased and then double-quoted;
- Generate COMMENT ON TABLE statements
--generate the GP comment on table statements for every table in the tjbb schema
select
table_schema,
table_name,
concat('comment on table ', concat('"', table_schema, '"'), '.', concat('"', table_name, '"'), ' is ''', replace(table_comment, '''', ''''''), ''';') comment
from
information_schema.tables
where
table_comment != ''
and table_comment is not null
and table_schema = 'tjbb';
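The replace doubles any single quote inside a comment so the generated statement stays valid; e.g. a hypothetical table comment of order table (it's legacy) becomes:
comment on table "tjbb"."t_order" is 'order table (it''s legacy)';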
- Generate COMMENT ON COLUMN statements
--generate the GP comment on column statements for every column in the tjbb schema
select
table_schema,
table_name,
column_name,
concat('comment on column ', concat('"', table_schema, '"'), '.', concat('"', table_name, '"'), '.', concat('"', lower(column_name), '"'), ' is ''', replace(column_comment, '''', ''''''), ''';') comment
from
information_schema.columns
where
column_comment != ''
and column_comment is not null
and table_schema = 'tjbb';
Oracle table structure migration
Use the ora2pg migration tool directly to migrate the table structures;
Ora2pg installation and usage
Create the matching user and schema
Here the createuser stored procedure is called directly to create the tjbb user and the tjbb schema;
select tool.createuser('tjbb','tjbb123456');
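tool.createuser is a site-local helper, not a Greenplum built-in; a minimal sketch of what it presumably does, assuming it just creates a login role plus a same-named schema:
create or replace function tool.createuser(v_user text, v_pwd text) returns void as $$
begin
    -- create a login role and a schema of the same name owned by it
    execute 'create role ' || quote_ident(v_user) || ' login password ' || quote_literal(v_pwd);
    execute 'create schema ' || quote_ident(v_user) || ' authorization ' || quote_ident(v_user);
end;
$$ language plpgsql;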
Execute the generated statements
MySQL: save the generated psql statements to a .psql file and run it through the psql client, or paste the generated statements directly into a GP client; after this, all table structures exist;
Oracle: run the PostgreSQL script generated by ora2pg directly with psql;
[grpcic@gr_vmuat_sjzt_gp01_041115 ~]$ psql -U tjbb -d etl -W -f tjbb_ddl.psql
Batch-create tables in the ODS layer (db_ods)
Naming rule: mdb/odb_${schemaname}_${tablename}
mdb: MySQL source
odb: Oracle source
${schemaname}: source schema name
${tablename}: source table name
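For example, an Oracle source table tjbb.t_order lands in db_ods as odb_tjbb_t_order, while the same table from a MySQL source becomes "mdb_tjbb_t_order" (double-quoted in the DDL below, since MySQL identifiers keep their case).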
Oracle (run under db_ods)
do $$
declare
i record;
begin
for i in (
select
pg_get_table_distributedby(b.oid) dis,
schemaname::varchar,
tablename::varchar,
'create table db_ods.odb_' || schemaname || '_' || tablename || ' (like ' || schemaname || '.' || tablename || ' including all) ' || pg_get_table_distributedby(b.oid) v_sql,
cast('grant select on table ' || schemaname || '.' || tablename || ' to db_ods' as varchar) v_grant
from
pg_tables a,
(
select
a.oid,
a.relname,
b.nspname
from
pg_class a,
pg_namespace b
where
a.relnamespace = b.oid
) b
where
a.schemaname = b.nspname
and a.tablename = b.relname
and schemaname = 'tjbb'
) loop
perform tool.sp_execsql(i.v_grant,i.schemaname);
execute i.v_sql;
end loop;
end$$;
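tool.sp_execsql is another site-local helper whose definition is not shown here; conceptually it has to run the GRANT with the privileges of the schema's owner, roughly like this sketch (an assumption, not the actual code):
create or replace function tool.sp_execsql(v_sql varchar, v_owner varchar) returns void as $$
begin
    execute 'set role ' || quote_ident(v_owner); -- act as the table owner so the grant succeeds
    execute v_sql;
    execute 'reset role';
end;
$$ language plpgsql;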
MySQL (run as the tool user)
do $$
declare
i record;
begin
for i in (
select
pg_get_table_distributedby(b.oid) dis,
schemaname::varchar,
tablename::varchar,
'create table db_ods."mdb_' || schemaname || '_' || tablename || '" (like "' || schemaname || '"."' || tablename || '" including all) ' || pg_get_table_distributedby(b.oid) v_sql,
cast('grant select on table "' || schemaname || '"."' || tablename || '" to db_ods' as varchar) v_grant
from
pg_tables a,
(
select
a.oid,
a.relname,
b.nspname
from
pg_class a,
pg_namespace b
where
a.relnamespace = b.oid
) b
where
a.schemaname = b.nspname
and a.tablename = b.relname
and schemaname = 'tjbb'
) loop
execute i.v_grant;
perform db_ods.sp_exec(i.v_sql);
end loop;
end$$;
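db_ods.sp_exec is similar; since the loop runs as tool but the tables must be created under db_ods, it is presumably a security definer wrapper owned by db_ods, along these lines (again an assumption):
create or replace function db_ods.sp_exec(v_sql varchar) returns void as $$
begin
    execute v_sql; -- runs with db_ods's privileges thanks to security definer
end;
$$ language plpgsql security definer;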
Data migration (full load with DataX)
- Create a table to hold the production databases' connection details;
drop table if exists proddb_conf;
create table proddb_conf (
schemaname varchar(100), --schema name
dbname varchar(100), --database name
dbip varchar(100), --database IP
dbport varchar(100), --database port
dbuser varchar(100), --database user used for extraction
dbpwd varchar(100) --password of that user
) distributed by (schemaname);
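Populate it with one row per source schema; all values below are placeholders:
insert into proddb_conf values ('tjbb', 'tjbbdb', '192.168.0.2', '3306', 'datax_reader', '******');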
- Generate the DataX JSON configs (run under tool);
drop table if exists datax_json;
create table datax_json as
select
a.*,
'{
"job": {
"setting": {
"speed": {
"channel": 3
},
"errorLimit": {
"record": 0,
"percentage": 0.02
}
},
"content": [
{
"reader": {
"name": "oraclereader",
"parameter": {
"username": "' || b.dbuser || '",
"password": "' || coalesce(b.dbpwd, '') || '",
"column": [
' || a.columnname_old || '
],
"splitPk": "",
"connection": [
{
"table": [
"' || tablename_old || '"
],
"jdbcUrl": [
"jdbc:oracle:thin:@//' || coalesce(b.dbip, '') || ':' || coalesce(b.dbport, '') || '/' || coalesce(b.dbname, '') || '"
]
}
]
}
},
"writer": {
"name": "postgresqlwriter",
"parameter": {
"username": "db_ods",
"password": "db_ods123456",
"column": [
' || a.columnname || '
],
"preSql": [
"delete from ' || tablename || '"
],
"connection": [
{
"table": [
"' || tablename || '"
],
"jdbcUrl": "jdbc:postgresql://${gpip}:${gpport}/${gpdbname}"
}
]
}
}
}
]
}
}'::varchar json_config
from
(
select
b.nspname schemaname,
a.relname tablename,
f.typname,
e.attname,
case
when f.typname like '%int%'
or f.typname like '%char%' then e.attname
else ''
end splitpk,
string_agg('"' || c.attname || '"', ',
' order by c.attnum) columnname,
d.schemaname schemaname_old,
upper(d.tablename) tablename_old,
string_agg('"' || decode(c.attname, 'ctid_new', upper('ctid'), upper(c.attname))|| '"', ',
' order by c.attnum) columnname_old
from
pg_class a
inner join pg_namespace b
on
(a.relnamespace = b.oid)
inner join pg_attribute c
on
(a.oid = c.attrelid)
inner join pg_tables d
on
(a.relname = 'odb_' || schemaname || '_' || tablename)
left join pg_attribute e
on
(a.oid = e.attrelid
and
case
when pg_get_table_distributedby(a.oid) not like '%,%'
and pg_get_table_distributedby(a.oid) not like '%"%' then '"' || replace (replace (pg_get_table_distributedby(a.oid),
'DISTRIBUTED BY (',
''),
')',
'') || '"'
when pg_get_table_distributedby(a.oid) not like '%,%'
and pg_get_table_distributedby(a.oid) like '%"%' then replace (replace (pg_get_table_distributedby(a.oid),
'DISTRIBUTED BY (',
''),
')',
'')
end = '"' || e.attname || '"')
left join pg_type f
on
(e.atttypid = f.oid)
where
c.attnum > 0
and relname like 'odba_%' escape 'a' -- 'a' escapes the underscore, so this matches names starting with odb_
and nspname = 'db_ods'
group by
b.nspname,
a.relname,
d.schemaname,
d.tablename,
e.attname,
f.typname
) a
inner join proddb_conf b
on
(a.schemaname_old = b.schemaname)
union all
--MySQL sources
select
a.*,
'{
"job": {
"setting": {
"speed": {
"channel": 3
},
"errorLimit": {
"record": 0,
"percentage": 0.02
}
},
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"username": "' || b.dbuser || '",
"password": "' || coalesce(b.dbpwd, '') || '",
"column": [
' || a.columnname_old || '
],
"splitPk": "' || a.splitpk || '",
"connection": [
{
"table": [
"`' || a.tablename_old || '`"
],
"jdbcUrl": [
"jdbc:mysql://' || coalesce(b.dbip, '') || ':' || coalesce(b.dbport, '') || '/' || coalesce(b.dbname, '') || '"
]
}
]
}
},
"writer": {
"name": "postgresqlwriter",
"parameter": {
"username": "db_ods",
"password": "db_ods123456",
"column": [
' || a.columnname || '
],
"preSql": [
"delete from \"' || tablename || '\""
],
"connection": [
{
"table": [
"\"' || tablename || '\""
],
"jdbcUrl": "jdbc:postgresql://${gpip}:${gpport}/${gpdbname}"
}
]
}
}
}
]
}
}'::varchar json_config
from
(
select
b.nspname schemaname,
a.relname tablename,
f.typname,
e.attname,
case
when f.typname like '%int%' then e.attname
else ''
end splitpk,
string_agg('"\"' || c.attname || '\""', ',
' order by c.attnum) columnname,
d.schemaname schemaname_old,
d.tablename tablename_old,
string_agg('"`' || c.attname || '`"', ',
' order by c.attnum) columnname_old
from
pg_class a
inner join pg_namespace b
on
(a.relnamespace = b.oid)
inner join pg_attribute c
on
(a.oid = c.attrelid)
inner join pg_tables d
on
(a.relname = 'mdb_' || schemaname || '_' || tablename)
left join pg_attribute e
on
(a.oid = e.attrelid
and
case
when pg_get_table_distributedby(a.oid) not like '%,%'
and pg_get_table_distributedby(a.oid) not like '%"%' then '"' || replace (replace (pg_get_table_distributedby(a.oid),
'DISTRIBUTED BY (',
''),
')',
'') || '"'
when pg_get_table_distributedby(a.oid) not like '%,%'
and pg_get_table_distributedby(a.oid) like '%"%' then replace (replace (pg_get_table_distributedby(a.oid),
'DISTRIBUTED BY (',
''),
')',
'')
end = '"' || e.attname || '"')
left join pg_type f
on
(e.atttypid = f.oid)
where
c.attnum > 0
and relname like 'mdba_%' escape 'a' -- 'a' escapes the underscore, so this matches names starting with mdb_
and nspname = 'db_ods'
group by
b.nspname,
a.relname,
d.schemaname,
d.tablename,
e.attname,
f.typname
) a
inner join proddb_conf b
on
(a.schemaname_old = b.schemaname)
distributed by (schemaname,
tablename);
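Before generating any files, it is worth spot-checking that every ODS table got a config and eyeballing one of them:
--row count should match the number of odb_/mdb_ tables in db_ods
select count(*) from datax_json;
select tablename, json_config from datax_json order by tablename limit 1;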
- Write a Java program to batch-generate the JSON files;
package createdataxjson;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
*
* @author pn21070310
*/
public class CreateDataxJson {
public static Connection getConnection(String url, String username, String password) {
Connection conn = null;
try {
Class.forName("org.postgresql.Driver");
conn = DriverManager.getConnection(url, username, password);
System.out.println("连接数据库成功!");
} catch (ClassNotFoundException ex) {
Logger.getLogger(CreateDataxJson.class.getName()).log(Level.SEVERE, null, ex);
} catch (SQLException ex) {
Logger.getLogger(CreateDataxJson.class.getName()).log(Level.SEVERE, null, ex);
}
return conn;
}
public static void createFile(String filepath, String filename, String contents, String type) {
    // append mode when type is "append", otherwise the file is overwritten
    boolean append = "append".equals(type);
    File dir = new File(filepath);
    // 1. make sure the target directory exists (mkdirs creates parents too)
    if (!dir.exists()) {
        dir.mkdirs();
    }
    File targetFile = new File(filepath + filename);
    FileWriter writer = null;
    try {
        // 2. create the target file if it does not exist yet
        if (!targetFile.exists()) {
            targetFile.createNewFile();
        }
        // 3. write the contents, honouring the append flag
        writer = new FileWriter(targetFile, append);
        writer.append(contents);
        writer.flush();
    } catch (IOException ex) {
        Logger.getLogger(CreateDataxJson.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        if (null != writer) {
            try {
                writer.close();
            } catch (IOException ex) {
                Logger.getLogger(CreateDataxJson.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
}
public static void main(String[] args) {
Connection cn = getConnection("jdbc:postgresql://192.168.0.1:5432/etl", "tool", "tool123456");
if (cn == null) {
    return; // connection failed; getConnection already logged the cause
}
String tablename = null;
String json_config = null;
ResultSet rs = null;
PreparedStatement ps = null;
try {
ps = cn.prepareStatement("select * from datax_json");
rs = ps.executeQuery();
while (rs.next()) {
    tablename = rs.getString("tablename");
    json_config = rs.getString("json_config");
    createFile("/home/grpcic/datax_json/", tablename + ".json", json_config, "overwrite");
}
System.out.println("Generated all JSON files!");
} catch (SQLException ex) {
Logger.getLogger(CreateDataxJson.class.getName()).log(Level.SEVERE, null, ex);
} finally {
if (rs != null) {
try {
rs.close();
} catch (SQLException ex) {
Logger.getLogger(CreateDataxJson.class.getName()).log(Level.SEVERE, null, ex);
}
}
if (ps != null) {
try {
ps.close();
} catch (SQLException ex) {
Logger.getLogger(CreateDataxJson.class.getName()).log(Level.SEVERE, null, ex);
}
}
if (cn != null) {
try {
cn.close();
} catch (SQLException ex) {
Logger.getLogger(CreateDataxJson.class.getName()).log(Level.SEVERE, null, ex);
}
}
System.out.println("数据库连接关闭!");
}
}
}
Note: download the Greenplum JDBC driver yourself;
- Package the Java program as a jar and deploy it to Linux;
- Change to the deployment directory and run the Java program;
[grpcic@gr_vmuat_sjzt_gp01_041115 dist]$ java -jar CreateDataxJson.jar
- The JSON files are now generated in batch;
- Change to the DataX bin directory and run DataX to load the data;
[grpcic@gr_vmuat_sjzt_gp01_041115 bin]$ datax.py xxxx.json
Configure scheduled jobs
After the batch DataX full load completes, reconfigure the JSON for incremental loads and schedule the jobs. DataX only supports where-clause based incremental extraction, so the source needs a column the predicate can filter on; common options (see the example after this list):
- Add a last-updated timestamp column to the production source table and maintain it in the application-level SQL;
- Use an existing auto-increment column for the incremental filter; this only works if the table is insert- and delete-only and rows are never updated;
- Create DML triggers on the production source table; this affects the production system and is not recommended;
- For Oracle, the ORA_ROWSCN pseudo-column records the latest change (as an SCN, convertible to a timestamp) of the data block a row lives in;
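For the first approach, the incremental condition is just a predicate over the last-updated column; both the column name and the ${...} placeholders below are hypothetical, and the predicate would go into the reader's where option:
--rows changed within the current extraction window
select * from t_order
where last_update_time >= '${last_etl_time}'
  and last_update_time < '${this_etl_time}';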