create hive table 分隔符 hive分隔符有哪些

转载

mob6454cc73c728 2023-08-18 18:22:33

文章标签 python html5 java 分隔符 apache 文章分类 Hive 大数据

（一）多字节分隔符

应用场景

1、Hive中的分隔符

Hive中默认使用单字节分隔符来加载文本数据，例如逗号、制表符、空格等等，默认的分隔符为\001。根据不同文件的不同分隔符，我们可以通过在创建表时使用 row format delimited fields terminated by '单字节分隔符' 来指定文件中的分隔符，确保正确将表中的每一列与文件中的每一列实现一一对应的关系。

create hive table 分隔符 hive分隔符有哪些_python

create hive table 分隔符 hive分隔符有哪些_html5_02

特殊数据

在实际工作中，我们遇到的数据往往不是非常规范化的数据，例如我们会遇到以下的两种情况

情况一：每一行数据的分隔符是多字节分隔符，例如：“||”、“--”等

create hive table 分隔符 hive分隔符有哪些_python_03

上图中每列的分隔符为||，为多字节分隔符

情况二：数据的字段中包含了分隔符

create hive table 分隔符 hive分隔符有哪些_python_04

上图中每列的分隔符为空格，但是数据本身也包含了分隔符：时间字段中也有空格

192.168.88.134 [08/Nov/2020:10:44:32 +0800]

情况一：加载数据的分隔符为多字节分隔符

创建表

-- Start from a clean slate so the example can be re-run.
DROP TABLE IF EXISTS singer;

-- Singer records; every column is loaded as a plain string.
CREATE TABLE singer (
    id       STRING,  -- singer id
    name     STRING,  -- singer name
    country  STRING,  -- country
    province STRING,  -- province
    gender   STRING,  -- gender
    works    STRING   -- works
)
-- Declares the two-character sequence '||' as the column delimiter.
-- This is the article's failing example: the load that follows comes out
-- misaligned because only single-byte delimiters are supported here.
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '||';

加载数据

-- Copy the local sample file into the singer table.
LOAD DATA LOCAL INPATH '/home/offcn/tmp/test01.txt'
INTO TABLE singer;

查看结果

-- Show every column to check how the rows were split.
SELECT *
FROM singer;

数据发生了错位，没有正确的加载每一列的数据

原因

Hive中默认只支持单字节分隔符，无法识别多字节分隔符

情况二：数据中包含了分隔符

创建表

-- Start from a clean slate so the example can be re-run.
DROP TABLE IF EXISTS apachelog;

-- One row per access-log line; every column is loaded as a plain string.
CREATE TABLE apachelog (
    ip     STRING,  -- client IP address
    stime  STRING,  -- request time
    mothed STRING,  -- request method (column name kept as in the original)
    url    STRING,  -- requested URL
    policy STRING,  -- request protocol
    stat   STRING,  -- response status
    body   STRING   -- size in bytes
)
-- Declares a single space as the column delimiter.
-- This is the article's failing example: the timestamp itself contains a
-- space, so it is split into two columns and the rest are shifted.
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' ';

加载数据

-- Copy the local access log into the apachelog table.
LOAD DATA LOCAL INPATH '/home/offcn/tmp/apache_web_access.log'
INTO TABLE apachelog;

查看结果

-- Show every column to check how the rows were split.
SELECT *
FROM apachelog;

create hive table 分隔符 hive分隔符有哪些_java_05

问题

时间字段被切分成了两个字段，后面所有的字段出现了错位

原因

时间数据中包含了分隔符，导致Hive认为这是两个字段，但实际业务需求中，为一个字段

解决方案一：替换分隔符

面对情况一，如果数据中的分隔符是多字节分隔符，可以使用程序提前将数据中的多字节分隔符替换为单字节分隔符，然后使用Hive加载，就可以实现正确加载对应的数据。

例如：原始数据中的分隔符为“||”

create hive table 分隔符 hive分隔符有哪些_java_06

程序开发

可以在ETL阶段通过一个MapReduce程序，将“||”替换为单字节的分隔符“|”，示例程序如下：

在mapreduce项目，创建包com.bigdata.hiveetl

package com.bigdata.hiveetl;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * Map-only MapReduce job that rewrites the two-character field delimiter
 * "||" in each input line as the single character "|", so the result can be
 * loaded into a Hive table declared with a one-byte delimiter.
 *
 * <p>Usage: {@code ChangeSplitCharMR [inputPath [outputPath]]}. A path that is
 * not supplied falls back to the location that used to be hard-coded, so
 * running the job without arguments behaves as before.
 */
public class ChangeSplitCharMR  extends Configured implements Tool {
    /** Input path used when none is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "D:/test/hiveetl/input/test01.txt";
    /** Output directory used when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "D:/test/hiveetl/out";

    /**
     * Builds, configures and submits the job, then waits for it to finish.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries
     *             fall back to the defaults above
     * @return 0 when the job succeeds, -1 otherwise
     * @throws Exception if the job cannot be created or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String output = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        // Build the job.
        Job job = Job.getInstance(this.getConf(), "changeSplit");
        job.setJarByClass(ChangeSplitCharMR.class);

        // Input: the text file whose delimiter has to be converted.
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(input));

        // Map: the mapper emits the rewritten line as key with a null value.
        job.setMapperClass(ChangeSplitMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Reduce: not needed. With zero reducers the mapper output is the job
        // output, so declare the job-level output types to match.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Output: plain text files under the output directory.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(output));

        // Submit and block until completion.
        return job.waitForCompletion(true) ? 0 : -1;
    }

    /**
     * Program entry point: runs the tool through {@link ToolRunner} and exits
     * with the status returned by {@link #run(String[])}.
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new ChangeSplitCharMR(), args);
        System.exit(status);
    }

    /**
     * Mapper that replaces every "||" in a line with "|" and emits the
     * rewritten line as the output key.
     */
    public static class ChangeSplitMapper extends Mapper<LongWritable,Text,Text,NullWritable> {
        // Reused output key holding the rewritten line.
        private final Text outputKey = new Text();
        // Output value: the NullWritable singleton.
        private final NullWritable outputValue = NullWritable.get();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // Literal replacement: no regex is needed for a fixed delimiter.
            String newLine = line.replace("||", "|");
            this.outputKey.set(newLine);
            context.write(this.outputKey, this.outputValue);
        }
    }


}

重新建表加载数据

重新创建Hive表

-- Start from a clean slate so the example can be re-run.
DROP TABLE IF EXISTS singer;

-- Singer records; every column is loaded as a plain string.
CREATE TABLE singer (
    id       STRING,  -- singer id
    name     STRING,  -- singer name
    country  STRING,  -- country
    province STRING,  -- province
    gender   STRING,  -- gender
    works    STRING   -- works
)
-- The column delimiter is now the single character '|', matching the data
-- after "||" has been replaced during ETL.
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|';

总结

在ETL阶段可以直接对数据进行分隔符的替换，通过替换分隔符将多字节分隔符更改为单字节分隔符，就可以解决数据加载的问题，但是这种方式有对应的优缺点，并不是所有的场景适用于该方法。

优点：实现方式较为简单，基于字符串替换即可

缺点：无法满足情况二的需求

解决方案二：RegexSerDe正则加载

面对情况一和情况二的问题，Hive中提供了一种特殊的方式来解决，Hive提供了一种特殊的Serde来加载特殊数据的问题，使用正则匹配来加载数据，匹配每一列的数据。

官网地址：https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-ApacheWeblogData

什么是SerDe?

Hive的SerDe提供了序列化和反序列化两个功能，SerDe是英文Serialize和Deserialize的组合缩写，用于实现将Hive中的对象进行序列化和将数据进行反序列化。

Serialize就是序列化，用于将Hive中使用的java object转换成能写入hdfs的字节序列，或者其他系统能识别的流文件。Hive中的insert语句用于将数据写入HDFS，所以就会调用序列化实现。Deserialize就是反序列化，用于将HDFS文件中的字节序列解析成Hive能够处理的行对象，Hive中的select语句需要从HDFS读取数据，所以就会调用反序列化实现。Hive中的调用过程如下：

create hive table 分隔符 hive分隔符有哪些_apache_07

Hive中包含的SerDe

官网地址：https://cwiki.apache.org/confluence/display/Hive/SerDe （SerDe - Apache Hive - Apache Software Foundation）

RegexSerDe的功能

RegexSerde是Hive中专门为了满足复杂数据场景所提供的正则加载和解析数据的接口，使用RegexSerde可以指定正则表达式加载数据，根据正则表达式匹配每一列数据。上述过程中遇到的情况一和情况二的问题，都可以通过RegexSerDe使用正则表达式来加载实现

RegexSerDe解决多字节分隔符

原始数据格式

01||周杰伦||中国||台湾||男||七里香

正则表达式定义每一列

([0-9]*)\\|\\|(.*)\\|\\|(.*)\\|\\|(.*)\\|\\|(.*)\\|\\|(.*)

正则校验

https://c.runoob.com/front-end/854/

create hive table 分隔符 hive分隔符有哪些_分隔符_08

基于正则表达式，使用RegexSerde建表

-- Drop the table if it already exists so the example can be re-run.
drop table if exists singer;
-- Singer table whose lines are parsed by a regular expression instead of a
-- delimiter; each capture group fills one column, in order.
create table singer(
 id string,--singer id
 name string,--singer name
 country string,--country
 province string,--province
 gender string,--gender
 works string--works
)
-- Use RegexSerDe to parse each line.
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
-- Six capture groups separated by a literal "||" (written "\\|\\|" inside the
-- string literal). The text groups use (.*), the same pattern the article
-- gives for this data, so a "}" inside a field no longer prevents a match.
WITH SERDEPROPERTIES (
  "input.regex" = "([0-9]*)\\|\\|(.*)\\|\\|(.*)\\|\\|(.*)\\|\\|(.*)\\|\\|(.*)"
);

加载数据

-- Copy the local sample file (still "||"-delimited) into the singer table.
LOAD DATA LOCAL INPATH '/home/offcn/tmp/test01.txt'
INTO TABLE singer;

查看数据结果

-- Show every column to verify the regex-based load.
SELECT *
FROM singer;

RegexSerDe解决数据中包含分隔符
分析数据格式，构建正则表达式

原始数据格式

192.168.88.100 [08/Nov/2020:10:44:33 +0800] "GET /hpsk_sdk/index.html HTTP/1.1" 200 328

正则表达式定义每一列

([^ ]*) ([^}]*) ([^ ]*) ([^ ]*) ([^ ]*) ([0-9]*) ([^ ]*)

正则校验

create hive table 分隔符 hive分隔符有哪些_分隔符_09

基于正则表达式，使用RegexSerde建表

-- Start from a clean slate so the example can be re-run.
DROP TABLE IF EXISTS apachelog;

-- One row per access-log line; every column is loaded as a plain string.
CREATE TABLE apachelog (
    ip     STRING,  -- client IP address
    stime  STRING,  -- request time
    mothed STRING,  -- request method (column name kept as in the original)
    url    STRING,  -- requested URL
    policy STRING,  -- request protocol
    stat   STRING,  -- response status
    body   STRING   -- size in bytes
)
-- Parse each line with RegexSerDe rather than a fixed delimiter.
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
-- Seven space-separated capture groups, one per column.
-- NOTE(review): the second group, [^}]*, can itself match spaces; it appears
-- to isolate the bracketed timestamp only because the remaining groups cannot.
-- Consider anchoring it on the brackets instead. Confirm against real data.
WITH SERDEPROPERTIES (
    "input.regex" = "([^ ]*) ([^}]*) ([^ ]*) ([^ ]*) ([^ ]*) ([0-9]*) ([^ ]*)"
);

加载数据

-- Copy the local access log into the apachelog table.
LOAD DATA LOCAL INPATH '/home/offcn/tmp/apache_web_access.log'
INTO TABLE apachelog;

查看数据结果

-- Check the parsed columns; the method and protocol columns are not selected.
SELECT
    ip,
    stime,
    url,
    stat,
    body
FROM apachelog;

RegexSerde使用简单，对于各种复杂的数据场景，都可以通过正则定义匹配每行中的每个字段，基本上可以满足大多数场景的需求，工作中推荐使用该方式来实现对于复杂数据的加载。

总结

当数据文件中出现多字节分隔符或者数据中包含了分隔符时，会导致数据加载与实际表的字段不匹配的问题，基于这个问题我们提供了三种方案：替换分隔符、正则加载及自定义InputFormat来实现，其中替换分隔符无法解决数据中存在分隔符的问题，自定义InputFormat的开发成本较高，所以整体推荐使用正则加载的方式来实现对于特殊数据的处理。

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。