1. Integrating HBase with MapReduce
Requirement: read the name and age columns of the f1 column family from the myuser table and write them into the f1 column family of the myuser2 table.
ImmutableBytesWritable: HBase's serializable byte-array type; here it carries the rowkey.
NullWritable: a placeholder type that carries no data.
The context object acts as a bridge: it passes the data produced by the map phase on to the reduce phase.
(1) Add the HBase-MapReduce integration dependencies to the existing Maven project
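The exact coordinates depend on the HBase version installed on the cluster; a minimal sketch, assuming HBase 2.x, where TableMapper/TableReducer ship in the hbase-mapreduce module (on 1.x they live in hbase-server instead), and the version number below is only a placeholder:
<!-- sketch only: match artifact and version to your cluster -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-mapreduce</artifactId>
<version>2.0.0</version>
</dependency>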
(2) Run the code locally
1. Create the Mapper class: read the name and age columns from the myuser table and write them to the context
package cn.it.hbase.demo;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.util.List;
/**
 * Reads data from the myuser table.
 * A mapper that reads data from an HBase table must extend TableMapper.
 */
public class HBaseSourceMapper extends TableMapper<Text, Put> {
/**
 * @param key the rowkey
 * @param value a Result object wrapping one row of data
 * @param context the context object
 * @throws IOException
 * @throws InterruptedException
 *
 * Requirement: read the name and age columns of the f1 column family from the myuser table.
 */
@Override
//the key is the rowkey, serialized as an ImmutableBytesWritable
protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
//get the rowkey as a byte array
byte[] bytes = key.get();
String rowkey = Bytes.toString(bytes);
Put put = new Put(bytes);
//get all cells of this row
List<Cell> cells = value.listCells();
for (Cell cell : cells) {
//get the cell's column family
byte[] familyBytes = CellUtil.cloneFamily(cell);
//get the cell's column qualifier
byte[] qualifierBytes = CellUtil.cloneQualifier(cell);
//keep only the name and age columns of the f1 column family
if (Bytes.toString(familyBytes).equals("f1") && (Bytes.toString(qualifierBytes).equals("name") || Bytes.toString(qualifierBytes).equals("age"))){
put.add(cell);
}
}
//emit the data
if (! put.isEmpty()){
//k2,v2
context.write(new Text(rowkey),put);
}
}
}
2. Create the Reducer class
package cn.it.hbase.demo;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.Text;
import java.io.IOException;
/**
 * Writes the data into the myuser2 table.
 */
public class HBaseSinkReducer extends TableReducer<Text, Put, ImmutableBytesWritable> {
@Override
protected void reduce(Text key, Iterable<Put> values, Context context) throws IOException, InterruptedException {
for (Put put : values) {
//k3,v3
context.write(new ImmutableBytesWritable(key.toString().getBytes()),put);
}
}
}
3. Create the program entry point (driver class)
package cn.it.hbase.demo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class HBaseMain extends Configured implements Tool {
@Override
public int run(String[] strings) throws Exception {
Job job = Job.getInstance(super.getConf(), "hbaseMR");
//when running from a packaged jar, the class containing the main method must be set
job.setJarByClass(HBaseMain.class);
Scan scan = new Scan();
//configure the mapper and reducer classes
/*
String table,
Scan scan,
Class<? extends TableMapper> mapper,
Class<?> outputKeyClass,
Class<?> outputValueClass, Job job,
boolean addDependencyJars
*/
//use the utility class to initialize the mapper
TableMapReduceUtil.initTableMapperJob("myuser",
scan,
HBaseSourceMapper.class,
Text.class,
Put.class,
job,
false);
//use the utility class to initialize the reducer
TableMapReduceUtil.initTableReducerJob("myuser2",
HBaseSinkReducer.class,
job);
boolean b = job.waitForCompletion(true);
return b?0:1;
}
//program entry point
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
int run = ToolRunner.run(configuration, new HBaseMain(),args);
System.exit(run);
}
}
(3) Run on the cluster
1. Add a shade packaging plugin to the Maven pom
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<minimizeJar>true</minimizeJar>
</configuration>
</execution>
</executions>
</plugin>
2. Package the project, upload the jar to the cluster, and run it:
yarn jar My-hbase-1.0-SNAPSHOT.jar cn.it.hbase.demo.HBaseMain
2. Reading data from HDFS and writing it into HBase
(1) Prepare a data file and upload it to HDFS
cd /export/servers
vim user.txt
Each line holds a rowkey, a name and an age separated by tabs, for example:
0007	zhangsan	18
0008	lisi	25
0009	wangwu	20
hdfs dfs -mkdir -p /hbase/input
hdfs dfs -put user.txt /hbase/input
(2) Develop the MR program that reads from HDFS and stores into HBase
1. Create the Mapper class
package cn.it.yuge.hdfsToHbase;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Reads the file on HDFS and hands each line on for processing.
 */
public class HDFSMapper extends Mapper<LongWritable,Text,Text,NullWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//pass each line through unchanged; all processing happens in the reducer
context.write(value,NullWritable.get());
}
}
2. Create the Reducer class
package cn.it.yuge.hdfsToHbase;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import java.io.IOException;
/**
 * Input line format (tab-separated): 0007	zhangsan	18
 */
public class HDFSReduce extends TableReducer<Text, NullWritable, ImmutableBytesWritable> {
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
String[] split = key.toString().split("\t");
Put put = new Put(split[0].getBytes());//the rowkey
//for this rowkey, set the column family and the columns under it
put.addColumn("f1".getBytes(),"name".getBytes(),split[1].getBytes());
put.addColumn("f1".getBytes(),"age".getBytes(),split[2].getBytes());
context.write(new ImmutableBytesWritable(split[0].getBytes()),put);
}
}
3. Create the program entry point (driver class)
package cn.it.yuge.hdfsToHbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class JobMain extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(super.getConf(),"Hdfs2HBase");
job.setJarByClass(JobMain.class);
//the standard MR steps
//1. specify the input format and the input path
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node01:8020/hbase/input"));
//2. specify the mapper class and its output key/value (K2, V2) types
job.setMapperClass(HDFSMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
//partitioning, sorting, combining, grouping: the shuffle phase (defaults are used here)
//configure the reducer
TableMapReduceUtil.initTableReducerJob("myuser2",HDFSReduce.class,job);
boolean b = job.waitForCompletion(true);
return b?0:1;
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
int run = ToolRunner.run(configuration, new JobMain(), args);
System.exit(run);
}
}
After the job finishes, the data has been written into myuser2.
4. Summary
- To read data from an HBase table, the mapper class must extend TableMapper.
- To read plain text files from HDFS, the mapper class must extend Mapper.
- To save the reducer output into HBase, the reducer class must extend TableReducer.
5. Bulk-loading data into HBase with BulkLoad
(1) Create the Mapper class
package cn.it.cn.it.BulkLoad;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class HDFSMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
Put put = new Put(split[0].getBytes());
put.addColumn("f1".getBytes(),"name".getBytes(),split[2].getBytes());
put.addColumn("f1".getBytes(),"age".getBytes(),split[2].getBytes());
ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable(split[0].getBytes());
context.write(immutableBytesWritable,put);
}
}
(2) Create the driver class
package cn.it.cn.it.BulkLoad;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class BulkLoadMain extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(super.getConf(),"BulkLoad");
//set the input format and the input path
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node01:8020/hbase/input"));
Connection connection = ConnectionFactory.createConnection(super.getConf());
Table table = connection.getTable(TableName.valueOf("myuser2"));
//set the mapper class
job.setMapperClass(HDFSMapper.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Put.class);
//write the output as HFiles and configure the job for incremental (bulk) load
HFileOutputFormat2.configureIncrementalLoad(job,table,connection.getRegionLocator(TableName.valueOf("myuser2")));
job.setOutputFormatClass(HFileOutputFormat2.class);
HFileOutputFormat2.setOutputPath(job,new Path("hdfs://node01:8020/hbase/hfile_out"));
//wait for the job to finish
boolean b = job.waitForCompletion(true);
return b?0:1;
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
int run = ToolRunner.run(configuration, new BulkLoadMain(),args);
System.exit(run);
}
}
(3) Load the HFiles under the output path into the HBase table
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;
//in HBase 2.x this class lives in org.apache.hadoop.hbase.tool instead
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
public class LoadData {
public static void main(String[] args) throws Exception {
Configuration configuration = HBaseConfiguration.create();
configuration.set("hbase.zookeeper.property.clientPort", "2181");
configuration.set("hbase.zookeeper.quorum", "node01,node02,node03");
Connection connection = ConnectionFactory.createConnection(configuration);
Admin admin = connection.getAdmin();
Table table = connection.getTable(TableName.valueOf("myuser2"));
LoadIncrementalHFiles load = new LoadIncrementalHFiles(configuration);
//the path must match the HFile output directory of the BulkLoad job above
load.doBulkLoad(new Path("hdfs://node01:8020/hbase/hfile_out"), admin,table,connection.getRegionLocator(TableName.valueOf("myuser2")));
}
}
3. Integrating Hive with HBase
…
4. HBase pre-splitting
(1) Use cases
- When a region's HFile reaches the 10 GB threshold, the region and its HFile are split in two, which reduces the size of any single HFile. But however many times it splits, everything still sits on the single node node01, so the data-skew problem is not solved: the amount of data is unchanged.
- Pre-splitting is therefore mainly about avoiding HBase hotspots and the single-region point of failure.
Beyond that, it also:
- improves data read/write efficiency
- balances load and prevents data skew
- makes it easier to schedule regions for cluster disaster recovery
- optimizes the number of map tasks
(2) How do you pre-split?
Define the split rules in advance when creating the table.
(3) Ways to specify pre-splits
1. Specify the split points manually
create 'staff','info','partition1',SPLITS => ['1000','2000','3000','4000']
2. Generate the splits with the hex-string algorithm
create 'staff2','info','partition2',{NUMREGIONS => 15, SPLITALGO => 'HexStringSplit'}
NUMREGIONS: how many regions to create
SPLITALGO: which algorithm to use to compute the split points
HexStringSplit: split points are generated as hexadecimal strings
3. Create pre-split tables with the Java API
/**
 * Create an HBase table with pre-split regions through the Java API.
 */
@Test
public void hbaseSplit() throws IOException {
//get a connection
Configuration configuration = HBaseConfiguration.create();
configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
Connection connection = ConnectionFactory.createConnection(configuration);
Admin admin = connection.getAdmin();
//custom split points: a set of byte arrays held in a two-dimensional array
byte[][] splitKeys = {{1,2,3,4,5},{'a','b','c','d','e'}};
//HTableDescriptor holds the table parameters: table name, column families, and so on
HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf("staff3"));
//add a column family
hTableDescriptor.addFamily(new HColumnDescriptor("f1"));
//add another column family
hTableDescriptor.addFamily(new HColumnDescriptor("f2"));
admin.createTable(hTableDescriptor,splitKeys);
admin.close();
}
5. Rowkey design tips for HBase
- Keep rowkeys reasonably short.
- Spread rowkeys evenly so the data lands across different regions instead of piling into a single region and creating a hotspot. For example, prepend a few random digits to the rowkey so the load is distributed evenly.
5.1 Avoiding rowkey hotspots (a code sketch follows this list):
- Salting: prepend a random component to the rowkey, i.e. give each rowkey a prefix so it no longer starts the way the original key does. The number of distinct prefixes should match the number of regions you want the data spread over; salted rowkeys then scatter across the regions and avoid hotspots.
- Hashing: prefix the rowkey with a hash code computed from it.
- Reversing: reverse the rowkey.
- Timestamp reversal: store the timestamp reversed (for example Long.MAX_VALUE - timestamp) so the newest data sorts first.
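A minimal sketch of the salting, hashing and reversing ideas; RowKeyUtil and its method names are our own illustration (HBase itself provides no such helper), and the 4-byte MD5 prefix length is an arbitrary choice:
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class RowKeyUtil {

    // Salting: prepend one of N fixed prefixes so rows spread over N pre-split regions.
    // The prefix is derived from the original key, so readers can recompute it.
    public static String saltedKey(String originalKey, int regionCount) {
        int salt = (originalKey.hashCode() & Integer.MAX_VALUE) % regionCount;
        return salt + "_" + originalKey;
    }

    // Hashing: use part of an MD5 digest of the key as the prefix,
    // which scatters monotonically increasing keys across regions.
    public static String hashedKey(String originalKey) {
        try {
            MessageDigest md5 = MessageDigest.getInstance("MD5");
            byte[] digest = md5.digest(originalKey.getBytes(StandardCharsets.UTF_8));
            StringBuilder prefix = new StringBuilder();
            for (int i = 0; i < 4; i++) {
                prefix.append(String.format("%02x", digest[i]));
            }
            return prefix + "_" + originalKey;
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
    }

    // Reversing: reverse the key (e.g. a phone number) so the most variable part comes first.
    public static String reversedKey(String originalKey) {
        return new StringBuilder(originalKey).reverse().toString();
    }

    public static void main(String[] args) {
        System.out.println(saltedKey("20231001_user0001", 4));   // e.g. 2_20231001_user0001
        System.out.println(hashedKey("20231001_user0001"));      // e.g. 9f1c3a7b_20231001_user0001
        System.out.println(reversedKey("13812345678"));          // 87654321831
    }
}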
6. HBase coprocessors
The idea: write a piece of code, package it into a jar, and load it onto a particular table.
(1) Develop a coprocessor so that, before data is inserted into the proc1 table, a copy of the data is also saved into the proc2 table
1. In HBase, create the first table proc1 and the second table proc2
create 'proc1','info'
create 'proc2','info'
2. The coprocessor code (the imports below assume the HBase 2.x coprocessor API; the package matches the class name used in the alter command further down)
package cn.itcast.hbasemr.demo4;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.wal.WALEdit;
public class MyProcessor implements RegionObserver,RegionCoprocessor {
static Connection connection = null;
static Table table = null;
//create the connection in a static block
static{
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum","node01:2181");
try {
connection = ConnectionFactory.createConnection(conf);
table = connection.getTable(TableName.valueOf("proc2"));
} catch (Exception e) {
e.printStackTrace();
}
}
private RegionCoprocessorEnvironment env = null;
//column family name
private static final String FAMAILLY_NAME = "info";
//column (qualifier) name
private static final String QUALIFIER_NAME = "name";
//required from HBase 2.0 on, otherwise the observer does not take effect
@Override
public Optional<RegionObserver> getRegionObserver() {
// Extremely important to be sure that the coprocessor is invoked as a RegionObserver
return Optional.of(this);
}
//initialize the coprocessor environment
@Override
public void start(CoprocessorEnvironment e) throws IOException {
env = (RegionCoprocessorEnvironment) e;
}
//stop hook: release the shared resources created in the static block
@Override
public void stop(CoprocessorEnvironment e) throws IOException {
if (table != null) table.close();
if (connection != null) connection.close();
}
/**
 * Override prePut to intercept the data before it is inserted.
 * @param e
 * @param put the Put object wrapping the data being inserted into the target table
 * @param edit
 * @param durability
 * @throws IOException
 */
@Override
public void prePut(final ObserverContext<RegionCoprocessorEnvironment> e,
final Put put, final WALEdit edit, final Durability durability)
throws IOException {
try {
//get the rowkey of the inserted data from the Put object
byte[] rowBytes = put.getRow();
String rowkey = Bytes.toString(rowBytes);
//get the value of the name column from the inserted data
List<Cell> list = put.get(Bytes.toBytes(FAMAILLY_NAME), Bytes.toBytes(QUALIFIER_NAME));
//if the info:name column is not present, simply return
if (list == null || list.size() == 0) {
return;
}
//get the cell of the info column family, name column
Cell cell2 = list.get(0);
//get the value from the cell
String nameValue = Bytes.toString(CellUtil.cloneValue(cell2));
//build a Put and insert the data into the proc2 table
Put put2 = new Put(rowkey.getBytes());
put2.addColumn(Bytes.toBytes(FAMAILLY_NAME), Bytes.toBytes(QUALIFIER_NAME), nameValue.getBytes());
table.put(put2);
//do not close the shared table here; it stays open for later puts and is closed in stop()
} catch (Exception e1) {
return ;
}
}
}
3. Package the jar and upload it to HDFS
hdfs dfs -mkdir -p /processor
hdfs dfs -put processor.jar /processor
4. Attach the packaged jar to the proc1 table
alter 'proc1',METHOD => 'table_att','Coprocessor'=>'hdfs://node01:8020/processor/processor.jar|cn.itcast.hbasemr.demo4.MyProcessor|1001|'
table_att: set table attributes
Coprocessor: the coprocessor attribute
hdfs://node01:8020/processor/processor.jar: the path of the jar
cn.itcast.hbasemr.demo4.MyProcessor: the fully qualified class name of the coprocessor
1001: a sequence number used to distinguish this coprocessor from others
5. Insert data into the proc1 table
Open the hbase shell and run the following commands to insert data into proc1:
put 'proc1','0001','info:name','zhangsan'
put 'proc1','0001','info:age','28'
put 'proc1','0002','info:name','lisi'
put 'proc1','0002','info:age','25'
scan 'proc2'
You will see that proc2 now contains data as well, and only the name column of the info column family.
Note: to unload the coprocessor, run the following commands in the hbase shell:
disable 'proc1'
alter 'proc1',METHOD=>'table_att_unset',NAME=>'coprocessor$1'
enable 'proc1'
(2) Coprocessors fall into two categories
- observer: intercepts data operations before or after they happen
- endpoint: runs computation on the server side, for example max, min, or average
7. Secondary indexes
The primary index is the rowkey itself; rowkey-based queries come in three forms (see the sketch after this list):
1. get (point lookup by rowkey)
2. scan with startRow and stopRow (range query)
3. scan of the full table
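A minimal sketch of the three rowkey access paths against the myuser table; the connection settings follow the examples above, the rowkeys are illustrative, and withStartRow/withStopRow are the HBase 2.x names (older clients use setStartRow/setStopRow):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class RowkeyQueryDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("myuser"))) {

            // 1. get: point lookup of a single rowkey
            Result one = table.get(new Get(Bytes.toBytes("0001")));
            System.out.println("get 0001 -> " + one);

            // 2. scan with startRow / stopRow: range query on the rowkey
            Scan rangeScan = new Scan();
            rangeScan.withStartRow(Bytes.toBytes("0001"));
            rangeScan.withStopRow(Bytes.toBytes("0003"));
            try (ResultScanner scanner = table.getScanner(rangeScan)) {
                for (Result r : scanner) {
                    System.out.println("range -> " + Bytes.toString(r.getRow()));
                }
            }

            // 3. full-table scan: no start/stop row, touches every region
            try (ResultScanner scanner = table.getScanner(new Scan())) {
                for (Result r : scanner) {
                    System.out.println("full -> " + Bytes.toString(r.getRow()));
                }
            }
        }
    }
}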