Maven dependencies:
<properties>
    <hbase.version>1.2.0</hbase.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-common</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>${hbase.version}</version>
    </dependency>
</dependencies>
1. Spark reading from HBase (newAPI approach):
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object TestSparkReadHbase {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local")
    val sc = new SparkContext(sparkConf)

    // HBase connection info: the quorum lists only the ZooKeeper host names,
    // the client port is configured separately
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "hadoop-master,hadoop-slave2,hadoop-slave3")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "ns1:test")

    // Each record comes back as a (row key, Result) pair
    val rdd = sc.newAPIHadoopRDD(hbaseConf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    rdd.foreach {
      case (_, t) =>
        val key = Bytes.toString(t.getRow)
        val name = Bytes.toString(t.getValue("cf1".getBytes(), "name".getBytes()))
        val sex = Bytes.toString(t.getValue("cf1".getBytes(), "sex".getBytes()))
        val age = Bytes.toString(t.getValue("cf1".getBytes(), "age".getBytes()))
        println("Row key:" + key + " name:" + name + " sex:" + sex + " age:" + age)
    }
    sc.stop()
  }
}
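If only part of the table is needed, TableInputFormat also exposes configuration keys for narrowing the scan; they are set on the same hbaseConf before calling newAPIHadoopRDD. A minimal sketch (the family, columns and row-key range below are illustrative values, not from the original example):

// Restrict the scan to one column family and a row-key range (example values)
hbaseConf.set(TableInputFormat.SCAN_COLUMN_FAMILY, "cf1")
hbaseConf.set(TableInputFormat.SCAN_ROW_START, "1")
hbaseConf.set(TableInputFormat.SCAN_ROW_STOP, "9")
// Or select individual columns, space-separated, as "family:qualifier"
hbaseConf.set(TableInputFormat.SCAN_COLUMNS, "cf1:name cf1:age")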
2. Spark writing to HBase (newAPI approach):
Two methods are available on a pair RDD:
saveAsNewAPIHadoopDataset(Configuration)   // new MapReduce API
saveAsHadoopDataset(JobConf)               // old MapReduce API
The new-API version is shown first; a sketch of the old-API variant follows after it.
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

object TestSpark2Hbase {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local")
    val sc = new SparkContext(sparkConf)

    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "hadoop-master,hadoop-slave2,hadoop-slave3")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, "ns1:test")

    // The output value written to HBase is the Put mutation (not a Result)
    val job = Job.getInstance(hbaseConf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    // Parse "rowKey,name,sex,age" strings into (ImmutableBytesWritable, Put) pairs
    val rdd = sc.makeRDD(Array("1,name1,男,23", "2,name2,女,20"))
    val rdd2 = rdd.map(_.split(",")).map(t => {
      val put = new Put(Bytes.toBytes(t(0)))
      put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("name"), Bytes.toBytes(t(1)))
      put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("sex"), Bytes.toBytes(t(2)))
      put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("age"), Bytes.toBytes(t(3)))
      (new ImmutableBytesWritable, put)
    })

    rdd2.saveAsNewAPIHadoopDataset(job.getConfiguration)
    sc.stop()
  }
}
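For completeness, the old-API counterpart mentioned above goes through org.apache.hadoop.hbase.mapred.TableOutputFormat and a JobConf instead of a Job. A minimal sketch, reusing the hbaseConf and the (ImmutableBytesWritable, Put) RDD built above (note the mapred package, which clashes with the mapreduce TableOutputFormat if both are imported in one file):

import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.mapred.JobConf

// Old-API write: configure a JobConf rather than a Job
val jobConf = new JobConf(hbaseConf)
jobConf.setOutputFormat(classOf[TableOutputFormat])
jobConf.set(TableOutputFormat.OUTPUT_TABLE, "ns1:test")

// rdd2 is the same (ImmutableBytesWritable, Put) RDD as in the new-API example
rdd2.saveAsHadoopDataset(jobConf)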
Development version (the same write refactored into reusable helpers):
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Mutation, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.OutputFormat
import org.apache.spark.sql.SparkSession

object SparkRWHbase {
  def main(args: Array[String]): Unit = {
    val spark = initSparkSession
    val tableName = "ns1:test"
    val cf = "cf1"
    // Note: "rowKey" is also written out as an ordinary column of cf1
    val columns = Array("rowKey", "name", "sex", "age")

    val dataRDD = spark.sparkContext.makeRDD(Array("3,name3,男,21", "4,name4,女,20"))
    val resRDD = dataRDD.map(_.split(",")).map(t => {
      val rowKey = t(0)
      val name = t(1)
      val sex = t(2)
      val age = t(3)
      val cols = Array(rowKey, name, sex, age)
      (new ImmutableBytesWritable, getPut(rowKey, cf, columns, cols))
    })

    resRDD.saveAsNewAPIHadoopDataset(getConf(tableName))
    spark.stop()
  }

  // Constants is a project-level object holding shared settings (sketched after this listing)
  def initSparkSession: SparkSession = SparkSession.builder()
    .appName(Constants.SPARK_NAME)
    .master("local")
    .getOrCreate()

  // Build the Hadoop Configuration used by saveAsNewAPIHadoopDataset:
  // the output format class is set directly on the configuration instead of via a Job
  def getConf(tableName: String) = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", Constants.ZOOKEEPER_SERVER_NODE)
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    conf.setClass("mapreduce.job.outputformat.class", classOf[TableOutputFormat[String]],
      classOf[OutputFormat[String, Mutation]])
    conf
  }

  // Turn one record into a Put: each (column(i), value(i)) pair goes into the given family
  def getPut(rowKey: String, familyName: String, column: Array[String], value: Array[String]) = {
    val put = new Put(Bytes.toBytes(rowKey))
    for (i <- 0 until column.length) {
      put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes(column(i)), Bytes.toBytes(value(i)))
    }
    put
  }
}
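The Constants object referenced above is not shown in the original listing; a minimal placeholder, with hypothetical values matching the cluster used in the earlier examples, might look like this:

object Constants {
  // Hypothetical values: adjust to the actual application name and ZooKeeper quorum
  val SPARK_NAME = "SparkRWHbase"
  val ZOOKEEPER_SERVER_NODE = "hadoop-master,hadoop-slave2,hadoop-slave3"
}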