package cn.songjq.hbase.mr.emp;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.RetriesExhaustedWithDetailsException;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.junit.Test;
/**
* A MapReduce example that runs against HBase. It does three things:
* 1. Reads the employee data from the HBase table "emp".
* 2. Computes the average salary per department.
* 3. Writes the result to another HBase table.
*
* The Mapper and Reducer are implemented as static nested classes.
* @author songjq
*
*/
public class DeptAvgSalary {
/**
* Data preparation.
* Before running the MapReduce job, the CSV export of the relational Emp table must be loaded into the HBase table "emp",
* and the table "statistics" must be created to hold the processing results.
* @throws IOException
* @throws ZooKeeperConnectionException
* @throws MasterNotRunningException
*/
@Test
public void init() throws MasterNotRunningException, ZooKeeperConnectionException, IOException {
//Get the HBase admin connection
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", "hadoop-server01:2181,hadoop-server02:2181,hadoop-server03:2181");
HBaseAdmin hbaseAdmin = new HBaseAdmin(conf);
//Create the emp table and import the data from emp.csv
if(!hbaseAdmin.tableExists("emp")) {
//Table does not exist: create it and load the initial data
TableName tableName = TableName.valueOf("emp");
//Table descriptor
HTableDescriptor empTable = new HTableDescriptor(tableName);
//Column family descriptor
HColumnDescriptor f1 = new HColumnDescriptor("empinfo");
//Keep up to 3 versions of each cell
f1.setMaxVersions(3);
empTable.addFamily(f1);
hbaseAdmin.createTable(empTable);
//Load the initial data
HTable hTable = new HTable(conf, "emp");
putDataToEmp(hTable);
}
//Create the statistics table
if(!hbaseAdmin.tableExists("statistics")) {
//Table does not exist: create it
TableName tableName = TableName.valueOf("statistics");
//Table descriptor
HTableDescriptor statisticsTable = new HTableDescriptor(tableName);
//Column family descriptor
HColumnDescriptor f1 = new HColumnDescriptor("emp_stat");
//Keep up to 3 versions of each cell
f1.setMaxVersions(3);
statisticsTable.addFamily(f1);
hbaseAdmin.createTable(statisticsTable);
}
hbaseAdmin.close();
}
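/*
* Optional sanity check (not part of the original flow): after init() has run,
* the tables can be spot-checked from the HBase shell, for example:
*
*   hbase shell
*   > list                        # should include 'emp' and 'statistics'
*   > scan 'emp', {LIMIT => 2}    # spot-check a couple of imported rows
*/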
/**
* Load the emp.csv file into the HBase emp table.
* @throws IOException
* @throws RetriesExhaustedWithDetailsException
*/
public static void putDataToEmp(HTable hTable) throws RetriesExhaustedWithDetailsException, IOException {
List<Put> puts = new ArrayList<>();
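//Each CSV line becomes one Put; all Puts are written to HBase in a single batch at the end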
//Open a stream over the emp table's CSV file
FileInputStream filein = new FileInputStream("D:\\test\\hbase\\emp.csv");
InputStreamReader fileinReader = new InputStreamReader(filein);
BufferedReader br = new BufferedReader(fileinReader);
String line = null;
while((line=br.readLine())!=null) {
//Split one CSV line into its fields
String[] split = line.split(",");
//sample line: 7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
int empno = Integer.valueOf(split[0]);
String ename = split[1];
String job = split[2];
int mgr = 0;
try {
mgr = Integer.valueOf(split[3]);
}catch (Exception e) {
mgr = 0;
}
String hiredate = split[4];
float salary = Float.valueOf(split[5]);
float comm = 0f;
try {
comm = Float.valueOf(split[6]);
}catch (Exception e) {
comm = 0f;
}
int deptno = Integer.valueOf(split[7]);
//Rowkey: the employee number
Put put = new Put(Bytes.toBytes(empno));
//Columns in the empinfo column family
put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("ename"), Bytes.toBytes(ename));
put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("job"), Bytes.toBytes(job));
put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("mgr"), Bytes.toBytes(mgr));
put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("hiredate"), Bytes.toBytes(hiredate));
put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("salary"), Bytes.toBytes(salary));
put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("comm"), Bytes.toBytes(comm));
put.add(Bytes.toBytes("empinfo"), Bytes.toBytes("deptno"), Bytes.toBytes(deptno));
puts.add(put);
}
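//Batch-write all collected Puts to the emp table in one call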
hTable.put(puts);
br.close();
fileinReader.close();
filein.close();
}
/**
* Mapper side:
* Extends HBase's TableMapper<KEYOUT, VALUEOUT> instead of the plain Mapper class.
* KEYOUT:   output key k2
* VALUEOUT: output value v2
* Because the input is one HBase row (rowkey plus Result), no <k1,v1> input types need to be declared.
* @author songjq
*
*/
static class DeptAvgSalaryMapper extends TableMapper<IntWritable, FloatWritable> {
/*
* map() is called once for every input row (rowkey record).
* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
protected void map(ImmutableBytesWritable rowkey, Result rs,
Context context)
throws IOException, InterruptedException {
//Read the employee's department number
int deptno = Bytes.toInt(rs.getValue(Bytes.toBytes("empinfo"), Bytes.toBytes("deptno")));
//Read the employee's salary
float salary = Bytes.toFloat(rs.getValue(Bytes.toBytes("empinfo"), Bytes.toBytes("salary")));
//Emit <deptno, salary>
context.write(new IntWritable(deptno), new FloatWritable(salary));
}
}
/**
* Reducer side:
* Extends HBase's TableReducer<KEYIN, VALUEIN, KEYOUT> instead of the plain Reducer class.
* KEYIN:    Mapper output key k2
* VALUEIN:  the collection of Mapper output values v2
* KEYOUT:   Reducer output, written to the HBase table keyed by rowkey
* @author songjq
*
*/
static class DeptAvgSalaryReducer extends TableReducer<IntWritable, FloatWritable, ImmutableBytesWritable> {
/*
* reduce() is called once per distinct key, with all values for that key.
* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
@Override
protected void reduce(IntWritable k3, Iterable<FloatWritable> v3,
Context ctx)
throws IOException, InterruptedException {
//Department average salary, total salary, and employee count
float deptAvgSal = 0f;
float deptTotalSal = 0f;
int count = 0;
Iterator<FloatWritable> iterator = v3.iterator();
while(iterator.hasNext()) {
FloatWritable sal = iterator.next();
deptTotalSal += sal.get();
count++;
}
//Compute the department's average salary
deptAvgSal = deptTotalSal/count;
/*
* Write the result to the HBase statistics table
*/
//Create a Put keyed by the department number (deptno is used as the rowkey)
Put put = new Put(Bytes.toBytes(k3.get()));
//Add the column to insert: family emp_stat, qualifier dept_avg_sal (department average salary)
put.add(Bytes.toBytes("emp_stat"), Bytes.toBytes("dept_avg_sal"), Bytes.toBytes(deptAvgSal));
//Emit to HBase: the output key is the rowkey, the value is the Put
ctx.write(new ImmutableBytesWritable(Bytes.toBytes(k3.get())), put);
}
}
/**
* main() submits the job.
* Submitting to Hadoop may fail with: Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/client/HTable
* If that error occurs, set: export HADOOP_CLASSPATH=$HBASE_HOME/lib/*:$CLASSPATH
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
//HBase client connection settings
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", "hadoop-server01:2181,hadoop-server02:2181,hadoop-server03:2181");
Job job = Job.getInstance(conf);
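//Ship the jar containing this class so the Mapper/Reducer classes can be resolved on the cluster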
job.setJarByClass(DeptAvgSalary.class);
//Configure the job's Mapper
/*
* TableMapReduceUtil.initTableMapperJob(
*     table,            input table
*     scan,             scan used to read the input
*     mapper,           Mapper class
*     outputKeyClass,   Mapper output key type
*     outputValueClass, Mapper output value type
*     job               the job
* );
*/
//Create a scanner
Scan scan = new Scan();
//Only pull the columns the job actually needs; scanning every column is wasteful. Here only deptno and salary are read as Mapper input.
scan.addColumn(Bytes.toBytes("empinfo"), Bytes.toBytes("deptno"));
scan.addColumn(Bytes.toBytes("empinfo"), Bytes.toBytes("salary"));
TableMapReduceUtil.initTableMapperJob(Bytes.toBytes("emp"),
scan,
DeptAvgSalaryMapper.class,
IntWritable.class,
FloatWritable.class,
job);
//Configure the job's Reducer
/*
* TableMapReduceUtil.initTableReducerJob(
*     table,    output table
*     reducer,  Reducer class
*     job)      the job
*/
TableMapReduceUtil.initTableReducerJob(
"statistics",
DeptAvgSalaryReducer.class,
job);
//Submit the job and wait for completion
job.waitForCompletion(true);
}
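/*
* A sketch of how the job might be launched on the cluster (the jar name below is
* an assumption, not part of the project):
*
*   export HADOOP_CLASSPATH=$HBASE_HOME/lib/*:$CLASSPATH
*   hadoop jar hbase-mr-emp.jar cn.songjq.hbase.mr.emp.DeptAvgSalary
*/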
/**
* Scan the result data written to HBase.
* Expected output:
* Dept No    Avg Salary
* 10         2916.6667
* 20         2576.5
* 30         1729.5834
* @throws Exception
*/
@Test
public void scanStatisticsInfo() throws Exception {
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", "hadoop-server01:2181,hadoop-server02:2181,hadoop-server03:2181");
HTable hTable = new HTable(conf, "statistics");
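//Full-table scan of the statistics table; each rowkey is a department number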
Scan scan = new Scan();
ResultScanner scanner = hTable.getScanner(scan);
Iterator<Result> iterator = scanner.iterator();
System.out.println("部门编号\t\t平均工资");
while(iterator.hasNext()) {
Result rs = iterator.next();
System.out.println(Bytes.toInt(rs.getRow())+"\t\t"+
Bytes.toFloat(rs.getValue(Bytes.toBytes("emp_stat"), Bytes.toBytes("dept_avg_sal"))));
}
hTable.close();
}
}