前言:
因业务需要,我对 HBase 的读取性能进行了研究与优化。网上现有的资料良莠不齐,写得不够清晰,初学者很难据此实现 HBase 数据的高性能读取。我整合了相关资料,花了一个多星期,完成了 HBase 高性能读取数据的程序。当然,本程序仍有很大的优化空间,其中 Java 多线程就是一个值得深入思考的经典问题。
hbase读取数据方式:
1、依据rowkey查找数据
/**
 * Looks up one column of one row and prints the result through {@code showCell}.
 *
 * @param tableName name of the table to read from
 * @param rowkey    row key of the row to fetch
 * @param colFamily column family of the requested column
 * @param col       qualifier of the requested column
 * @throws IOException if the table cannot be opened, the read fails, or closing the table fails
 */
public static void getData(String tableName, String rowkey, String colFamily, String col) throws IOException {
    Table table = connection.getTable(TableName.valueOf(tableName));
    try {
        Get get = new Get(Bytes.toBytes(rowkey));
        // Only the requested column is fetched. A preceding addFamily() call for the
        // same family is overridden by addColumn(), so it was dropped as redundant.
        get.addColumn(Bytes.toBytes(colFamily), Bytes.toBytes(col));
        Result result = table.get(get);
        showCell(result);
    } finally {
        // Release the table handle even when get() or showCell() throws.
        table.close();
    }
}
2、给出起始行与结束行批量查询
/**
 * Scans the rows between {@code startRow} and {@code stopRow} and collects every cell
 * of every returned row.
 *
 * <p>All rows are accumulated in memory before returning, so a very large range can
 * exhaust the heap.
 *
 * @param tableName name of the table to scan
 * @param startRow  row key the scan starts at
 * @param stopRow   row key the scan stops at
 * @return one inner list per row returned by the scanner, holding one {@code Data} per cell
 * @throws IOException if the table or scanner cannot be opened, or the scan fails
 */
@SuppressWarnings("deprecation")
public static ArrayList<ArrayList<Data>> scanData(String tableName, String startRow, String stopRow) throws IOException {
    Scan scan = new Scan();
    scan.setStartRow(Bytes.toBytes(startRow));
    scan.setStopRow(Bytes.toBytes(stopRow));

    ArrayList<ArrayList<Data>> sunDatas = new ArrayList<ArrayList<Data>>();

    // Use the shared connection instead of building a throw-away HTablePool per call.
    // Keeping the handle in a variable guarantees that the table actually used for the
    // scan is the one that gets closed.
    Table table = connection.getTable(TableName.valueOf(tableName));
    try {
        ResultScanner resultScanner = table.getScanner(scan);
        try {
            for (Result result : resultScanner) {
                ArrayList<Data> sunData = new ArrayList<Data>();
                for (Cell cell : result.rawCells()) {
                    Data aData = new Data();
                    // NOTE(review): the row key goes into "ColString" and the cell value into
                    // "RowString"; the setter names look swapped - confirm against Data.
                    // Bytes.toString decodes as UTF-8 rather than the platform default charset.
                    aData.setColString(Bytes.toString(CellUtil.cloneRow(cell)));
                    aData.setRowString(Bytes.toString(CellUtil.cloneValue(cell)));
                    sunData.add(aData);
                }
                sunDatas.add(sunData);
            }
        } finally {
            resultScanner.close();
        }
    } finally {
        table.close();
    }
    return sunDatas;
}
3、getlist指定rowkey集合与需要查询的列(推荐,符合业务需要)
/**
 * Fetches the given columns of family {@code f1} for many rows using batched multi-gets.
 *
 * <p>Row keys are sent in batches of 1000. Batches are spread round-robin over
 * {@code linknum} table handles, all of which are closed before the method returns.
 *
 * @param rowkeyList row keys to fetch; results are appended in the same order
 * @param cols       column qualifiers (within family {@code f1}) to request
 * @param tableName  name of the table to read from
 * @param linknum    number of table handles to open; must be at least 1 when
 *                   {@code rowkeyList} is not empty
 * @return one inner list per result returned by the table, containing that row's cell
 *         values decoded as strings
 * @throws IOException              if a table cannot be opened or a read fails
 * @throws IllegalArgumentException if {@code linknum} is less than 1 and there are rows to fetch
 */
public static ArrayList<ArrayList<String>> getlist(ArrayList<String> rowkeyList , String[] cols ,String tableName,int linknum) throws IOException{
    ArrayList<ArrayList<String>> list = new ArrayList<ArrayList<String>>();
    if (rowkeyList.isEmpty()) {
        return list;
    }
    if (linknum < 1) {
        throw new IllegalArgumentException("linknum must be at least 1, got " + linknum);
    }

    final int batchSize = 1000;
    final byte[] family = Bytes.toBytes("f1");
    HTable[] table = new HTable[linknum];
    try {
        for (int i = 0; i < linknum; i++) {
            // No scanner caching is configured: only gets are issued here, no scans.
            table[i] = new HTable(configuration, tableName);
        }

        List<Get> getList = new ArrayList<Get>(batchSize);
        // Counts flushed batches. It lives outside the row loop so the round-robin
        // actually advances and the index always stays within [0, linknum).
        int batchIndex = 0;
        int lastIndex = rowkeyList.size() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            Get get = new Get(Bytes.toBytes(rowkeyList.get(i)));
            for (String col : cols) {
                get.addColumn(family, Bytes.toBytes(col));
            }
            getList.add(get);

            if (getList.size() == batchSize || i == lastIndex) {
                System.out.println(i);
                Result[] results = table[batchIndex % linknum].get(getList);
                batchIndex++;
                for (Result result : results) {
                    ArrayList<String> list2 = new ArrayList<String>();
                    for (Cell kv : result.rawCells()) {
                        list2.add(Bytes.toString(CellUtil.cloneValue(kv)));
                    }
                    list.add(list2);
                }
                getList.clear();
            }
        }
    } finally {
        // Release every handle that was opened, even if a later open or a get() failed.
        for (HTable opened : table) {
            if (opened != null) {
                try {
                    opened.close();
                } catch (IOException closeFailure) {
                    // Keep closing the remaining handles; a failed close must not hide them.
                    closeFailure.printStackTrace();
                }
            }
        }
    }
    return list;
}
多线程实践
1、实现Runnable接口(返回结果集麻烦)
package com;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.shell.Count;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.BufferedMutatorParams;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.coprocessor.AggregationClient;
import org.apache.hadoop.hbase.client.coprocessor.LongColumnInterpreter;
import org.apache.hadoop.hbase.util.Bytes;
import org.jruby.RubyProcess.Sys;
import org.omg.CORBA.ARG_IN;
import org.apache.hadoop.hbase.mapreduce.RowCounter;
import org.apache.hadoop.util.StopWatch;
import com.csvreader.CsvReader;
import com.github.luben.zstd.Zstd;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
 * Worker that reads a slice of row keys from an HBase table in batches and stores the
 * rows it read in {@link #alldata}.
 *
 * <p>Each run opens its own {@code HTable} (the handle passed to the constructor is not
 * used for reading) and closes it when the run ends. Read {@link #alldata} only after
 * the thread running this worker has finished.
 */
public class DataReaderServer implements Runnable{
    /** Column family every requested qualifier is read from. */
    private static final String COLUMN_FAMILY = "f1";
    /** Number of row keys sent to the table per multi-get. */
    private static final int BATCH_SIZE = 1000;

    private List<String> rowkeyList;
    private String tableName;
    private String [] cols;
    private Configuration configuration;
    private HTable table;
    /** Rows collected by {@link #run()}, one entry per result returned by the table. */
    ArrayList<DataStract> alldata = new ArrayList<DataStract>();

    /**
     * @param rowkeyList    row keys this worker is responsible for
     * @param tableName     name of the table to read from
     * @param cols          column qualifiers to request for every row
     * @param configuration HBase configuration used to open the table
     * @param table         stored but replaced by a freshly opened handle in {@link #run()}
     */
    public DataReaderServer(List<String> rowkeyList,String tableName,String [] cols,Configuration configuration,HTable table){
        this.rowkeyList = rowkeyList;
        this.tableName = tableName;
        this.cols = cols;
        this.configuration = configuration;
        this.table = table;
    }

    /**
     * Reads all assigned rows batch by batch and appends them to {@link #alldata}.
     * A batch that fails with an {@code IOException} is reported and skipped; the
     * remaining batches are still read.
     */
    @Override
    public void run() {
        System.out.println("rowkey "+rowkeyList.size());
        ArrayList<DataStract> data = new ArrayList<DataStract>();
        try {
            // Open the table this worker was configured for (previously hard-coded to "TB001").
            table = new HTable(configuration, tableName);
        } catch (IOException e1) {
            // Without a table nothing can be read; stop instead of failing later on a null handle.
            System.err.println("Cannot open table " + tableName + "; no rows were read");
            e1.printStackTrace();
            return;
        }
        try {
            for (int from = 0; from < rowkeyList.size(); from += BATCH_SIZE) {
                int to = Math.min(from + BATCH_SIZE, rowkeyList.size());
                try {
                    data.addAll(readBatch(from, to));
                } catch (IOException e) {
                    System.err.println("Reading rows " + from + ".." + (to - 1) + " of " + tableName + " failed");
                    e.printStackTrace();
                }
            }
        } finally {
            try {
                table.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // Publish everything in one step once the whole slice has been processed.
        alldata.addAll(data);
    }

    /** Fetches rows {@code [from, to)} of {@code rowkeyList} with a single multi-get. */
    private List<DataStract> readBatch(int from, int to) throws IOException {
        List<Get> getList = new ArrayList<Get>(to - from);
        for (int i = from; i < to; i++) {
            Get get = new Get(Bytes.toBytes(rowkeyList.get(i)));
            for (String col : cols) {
                get.addColumn(Bytes.toBytes(COLUMN_FAMILY), Bytes.toBytes(col));
            }
            getList.add(get);
        }
        Result[] results = table.get(getList);
        List<DataStract> rows = new ArrayList<DataStract>(results.length);
        for (int k = 0; k < results.length; k++) {
            ArrayList<String> list2 = new ArrayList<String>();
            for (Cell kv : results[k].rawCells()) {
                list2.add(Bytes.toString(CellUtil.cloneValue(kv)));
            }
            DataStract swap = new DataStract();
            // Tag each result with its own row key. This relies on get(List) returning one
            // Result per Get in request order. Previously every row of a batch was tagged
            // with the batch's last key.
            swap.setTime(rowkeyList.get(from + k));
            swap.setDalist(list2);
            rows.add(swap);
        }
        return rows;
    }
}
主接口调用如下
// Multi-threaded read: split the row keys across DataReaderServer worker threads.
long st1 = System.currentTimeMillis();
// The slots of this array are never filled, so every worker below is handed a null table.
HTable[] table = new HTable[10];
// The keys are cut into five equal chunks; any remainder goes to a sixth worker.
int avg = rowkeyArrayList.size() / 5;   // rows per full chunk
int extra = rowkeyArrayList.size() % 5; // rows left over after the five full chunks
// Guarded division: with fewer than five keys (including an empty list) avg is 0 and
// the unguarded "size / avg" threw ArithmeticException.
int t = (avg == 0) ? 0 : rowkeyArrayList.size() / avg;
List<String> thname = new ArrayList<String>();
for (int i = 0; i < 6; i++) {
    int from = i * avg;
    // Chunks 0-4 are full chunks; chunk 5 takes whatever is left.
    int to = (i < 5) ? (i + 1) * avg : rowkeyArrayList.size();
    if (from >= to) {
        // Empty chunk (avg is 0, or there is no remainder): no worker needed.
        continue;
    }
    //List<String> rowkeyList,String tableName,String [] cols,Configuration configuration,HTable table
    DataReaderServer dataReaderServer = new DataReaderServer(rowkeyArrayList.subList(from, to),"TB001",head,configuration,table[i]);
    Thread thread = new Thread(dataReaderServer);
    thread.start();
    thname.add(thread.getName());
}
// NOTE(review): the workers are neither joined nor kept here, so their collected rows
// cannot be read back from this snippet - confirm how the caller waits for them.
2、实现Callable接口(推荐,有结果集返回)
package com;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
/**
 * Task that reads a slice of row keys from an HBase table in batches and returns the
 * rows it read.
 *
 * <p>The table passed to the constructor is used for all reads. If it is {@code null},
 * the task opens its own table from {@code configuration} and {@code tableName} and
 * closes it when done. HTable is documented as not thread-safe, so a table passed in
 * here should not be used by another task at the same time.
 */
public class CallableGetlist implements Callable <ArrayList<DataStract>>{
    /** Column family every requested qualifier is read from. */
    private static final String COLUMN_FAMILY = "f1";
    /** Number of row keys sent to the table per multi-get. */
    private static final int BATCH_SIZE = 1000;

    private List<String> rowkeyList;
    private String tableName;
    private String [] cols;
    private Configuration configuration;
    private HTable table;
    ArrayList<DataStract> alldata = new ArrayList<DataStract>();

    /**
     * @param rowkeyList    row keys this task is responsible for
     * @param tableName     name of the table; used only when {@code table} is null
     * @param cols          column qualifiers to request for every row
     * @param configuration HBase configuration; used only when {@code table} is null
     * @param table         table handle to read from, or null to let the task open its own
     */
    public CallableGetlist(List<String> rowkeyList,String tableName,String [] cols,Configuration configuration,HTable table)
    {
        this.rowkeyList = rowkeyList;
        this.tableName = tableName;
        this.cols = cols;
        this.configuration = configuration;
        this.table = table;
    }

    /**
     * Reads all assigned rows batch by batch.
     *
     * @return one entry per result returned by the table, in row-key order
     * @throws Exception if a read fails; the failure is propagated (and surfaces through
     *                   {@code Future.get()}) instead of silently returning partial data
     */
    @Override
    public ArrayList<DataStract> call() throws Exception {
        System.out.println("rowkey "+rowkeyList.size());
        ArrayList<DataStract> data = new ArrayList<DataStract>();
        boolean ownsTable = (table == null);
        HTable source = ownsTable ? new HTable(configuration, tableName) : table;
        try {
            for (int from = 0; from < rowkeyList.size(); from += BATCH_SIZE) {
                int to = Math.min(from + BATCH_SIZE, rowkeyList.size());
                data.addAll(readBatch(source, from, to));
            }
        } finally {
            // Only close a handle this task opened itself; a caller-supplied one stays open.
            if (ownsTable) {
                source.close();
            }
        }
        return data;
    }

    /** Fetches rows {@code [from, to)} of {@code rowkeyList} with a single multi-get. */
    private List<DataStract> readBatch(HTable source, int from, int to) throws IOException {
        List<Get> getList = new ArrayList<Get>(to - from);
        for (int i = from; i < to; i++) {
            Get get = new Get(Bytes.toBytes(rowkeyList.get(i)));
            for (String col : cols) {
                get.addColumn(Bytes.toBytes(COLUMN_FAMILY), Bytes.toBytes(col));
            }
            getList.add(get);
        }
        Result[] results = source.get(getList);
        List<DataStract> rows = new ArrayList<DataStract>(results.length);
        for (int k = 0; k < results.length; k++) {
            ArrayList<String> list2 = new ArrayList<String>();
            for (Cell kv : results[k].rawCells()) {
                list2.add(Bytes.toString(CellUtil.cloneValue(kv)));
            }
            DataStract swap = new DataStract();
            // Tag each result with its own row key. This relies on get(List) returning one
            // Result per Get in request order. Previously every row of a batch was tagged
            // with the batch's last key.
            swap.setTime(rowkeyList.get(from + k));
            swap.setDalist(list2);
            rows.add(swap);
        }
        return rows;
    }
}
主接口调用如下
// Callable-based read: one task per chunk of row keys, results collected through futures.
long st1 = System.currentTimeMillis();
final int maxRowKeySize = 1000;
ExecutorService pool = Executors.newFixedThreadPool(5);
// One HTable per task: HTable is documented as not thread-safe, so a single handle
// must not be shared by tasks that run concurrently on the pool.
List<HTable> taskTables = new ArrayList<HTable>();
ArrayList<Future<ArrayList<DataStract>>> results = new ArrayList<Future<ArrayList<DataStract>>>();
ArrayList<ArrayList<DataStract>> datas = new ArrayList<ArrayList<DataStract>>();
ArrayList<DataStract> data = new ArrayList<DataStract>();
long st2 = st1;
try {
    for (int start = 0; start < rowkeyArrayList.size(); start += maxRowKeySize) {
        int end = Math.min(start + maxRowKeySize, rowkeyArrayList.size());
        List<String> partRowKeys = rowkeyArrayList.subList(start, end);
        HTable taskTable = new HTable(configuration, "TB001");
        taskTables.add(taskTable);
        // Only this thread submits, so no locking around the pool is needed.
        results.add(pool.submit(new CallableGetlist(partRowKeys,"TB001",head,configuration,taskTable)));
    }
    pool.shutdown();
    long et1 = System.currentTimeMillis();
    // This interval covers task submission only; the reads themselves finish during the
    // collection phase timed below.
    System.out.println("线程执行时间: "+(et1 - st1));
    st2 = System.currentTimeMillis();
    // Futures are consumed in submission order, so rows keep the order of the row keys.
    for (Future<ArrayList<DataStract>> result : results) {
        data.addAll(result.get());
    }
    datas.add(data);
} catch (IOException e) {
    // Opening a table failed; nothing useful can be collected.
    e.printStackTrace();
} catch (InterruptedException e) {
    // Restore the interrupt flag so callers further up can react to it.
    Thread.currentThread().interrupt();
    e.printStackTrace();
} catch (ExecutionException e) {
    e.printStackTrace();
} finally {
    // Cancels anything still queued or running after a failure; no effect once all tasks finished.
    pool.shutdownNow();
    for (HTable opened : taskTables) {
        try {
            opened.close();
        } catch (IOException closeFailure) {
            closeFailure.printStackTrace();
        }
    }
}
// Guarded: after a failure or with no row keys there is no first row to inspect.
if (!datas.isEmpty() && !datas.get(0).isEmpty()) {
    System.out.println("---"+datas.get(0).get(0).getDalist().size());
}
long et2 = System.currentTimeMillis();
System.out.println("数据连接时间: "+(et2-st2));
总结
依据服务器实际测试:86400行 188列 所有数据获取需要 24秒。
以上的实现代码,有我自己写的实体类没有贴上去,有关list集合容器依据自己的实际情况进行修改。
注意:多线程的性能是受到查询的数据量、CPU性能、网络带宽的影响的,具体应该开多少个线程,怎样优化,欢迎大家讨论。