1. Connecting to Kudu from pyspark
pyspark --jars /home/oicq/guomm/kudu-spark2_2.11-1.6.0.jar  # launch the shell
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark.sparkContext)  # create an SQLContext (the shell already provides spark)
df = sqlContext.read.format('org.apache.kudu.spark.kudu') \
    .options(**{"kudu.master": "127.0.0.1:7051", "kudu.table": "python-example"}) \
    .load()  # read a Kudu table
df.write.format('org.apache.kudu.spark.kudu') \
    .option('kudu.master', '127.0.0.1:7051') \
    .option('kudu.table', 'python-example1') \
    .mode('append') \
    .save()  # write to a Kudu table
2. Connecting to Kudu from Scala Spark
pom.xml
<!-- Spark-on-Kudu integration -->
<dependency>
<groupId>org.apache.kudu</groupId>
<artifactId>kudu-spark2_2.11</artifactId>
<version>1.12.0</version>
</dependency>
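The raw-client snippets below (KuduClient, KuduSession, and friends) live in the kudu-client artifact. kudu-spark2 pulls it in transitively, but it can also be declared explicitly:
<dependency>
<groupId>org.apache.kudu</groupId>
<artifactId>kudu-client</artifactId>
<version>1.12.0</version>
</dependency>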
Creating a table
import java.util
import org.apache.kudu.{ColumnSchema, Schema, Type}
import org.apache.kudu.client._

/**
 * main method: create a table, then close the client
 */
def main(args: Array[String]): Unit = {
  // Kudu master address (comma-separated list for a multi-master cluster)
  val KUDU_MASTERS = "localhost"
  // build a KuduClient via KuduClient.KuduClientBuilder, passing in the master address
  val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()
  // table name
  val tableName = "ods"
  // create the table, passing in the KuduClient and the table name
  createTable(client, tableName)
  // release the client
  client.close()
}
def createTable(client: KuduClient, tableName: String): Unit = {
  // JavaConverters provides .asJava for Scala-to-Java collection conversion
  import scala.collection.JavaConverters._
  // build a Java list of ColumnSchema objects holding the column definitions
  // ColumnSchemaBuilder takes two arguments: the column name and its Type
  // chaining .key(true) marks the column as part of the primary key
  val columns = List(
    new ColumnSchema.ColumnSchemaBuilder("word", Type.STRING).key(true).build(),
    new ColumnSchema.ColumnSchemaBuilder("cnt", Type.INT32).build()
  ).asJava
  // wrap the column list in a Schema
  val schema = new Schema(columns)
  // CreateTableOptions holds the table-creation options
  val options: CreateTableOptions = new CreateTableOptions()
  // replication factor of 1
  options.setNumReplicas(1)
  // the columns to hash-partition on; here rows are distributed by "word"
  val parcols: util.LinkedList[String] = new util.LinkedList[String]()
  parcols.add("word")
  // a partitioning scheme must be set; addHashPartitions takes two arguments:
  // 1. the columns to hash on (several are allowed, hence a list)
  // 2. the number of buckets
  options.addHashPartitions(parcols, 3)
  // client.createTable takes three arguments:
  // 1. the table name
  // 2. the schema
  // 3. the table-creation options
  client.createTable(tableName, schema, options)
}
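Hash partitioning is not the only scheme: setRangePartitionColumns plus addRangePartition on CreateTableOptions produce a range-partitioned table instead. A minimal sketch (the table name and the ["a", "n") bound are made up for illustration):
def createRangePartitionedTable(client: KuduClient, tableName: String, schema: Schema): Unit = {
  val options = new CreateTableOptions()
  options.setNumReplicas(1)
  // range-partition on the key column instead of hashing it
  options.setRangePartitionColumns(util.Arrays.asList("word"))
  // one partition covering ["a", "n"); rows outside every range are rejected
  val lower = schema.newPartialRow()
  lower.addString("word", "a")
  val upper = schema.newPartialRow()
  upper.addString("word", "n")
  options.addRangePartition(lower, upper)
  client.createTable(tableName, schema, options)
}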
Renaming a table
def renameTable(client: KuduClient, tableName: String, newTableName: String) = {
  // build an AlterTableOptions
  val options: AlterTableOptions = new AlterTableOptions()
  // register the new name via options.renameTable
  options.renameTable(newTableName)
  // client.alterTable applies the change; the arguments are the old name and the options built above
  client.alterTable(tableName, options)
}
Dropping a table
def deleteTable(client: KuduClient, tableName: String) = {
  // client.deleteTable is all that is needed
  client.deleteTable(tableName)
}
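deleteTable throws if the table does not exist; the client's tableExists check can guard the drop. A minimal sketch:
def deleteTableIfExists(client: KuduClient, tableName: String): Unit = {
  // tableExists costs a round trip to the master
  if (client.tableExists(tableName)) {
    client.deleteTable(tableName)
  }
}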
Insert
def insertRows(client: KuduClient, tableName: String) = {
  // open the table via client.openTable
  val table: KuduTable = client.openTable(tableName)
  // every write goes through a session
  val session: KuduSession = client.newSession()
  // insert ten rows in a loop
  for (i <- 1 to 10) {
    // table.newInsert creates an insert operation
    val insert: Insert = table.newInsert()
    // insert.getRow exposes the row to fill in
    val row: PartialRow = insert.getRow
    // set the column values
    row.addString("word", s"dam-$i")
    row.addInt("cnt", 100 + i)
    // session.apply submits the operation
    session.apply(insert)
  }
  // close the session to flush anything still buffered
  session.close()
}
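An insert fails with a "row already present" error when the key exists; table.newUpsert (same kudu-client API) writes idempotently instead. A minimal sketch:
def upsertRow(client: KuduClient, tableName: String, word: String, cnt: Int): Unit = {
  val table = client.openTable(tableName)
  val session = client.newSession()
  // an upsert inserts the row, or overwrites it if the key already exists
  val upsert = table.newUpsert()
  val row = upsert.getRow
  row.addString("word", word)
  row.addInt("cnt", cnt)
  session.apply(upsert)
  session.close()
}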
Insert (from spark-shell)
cd $SPARK_HOME
bin/spark-shell --packages org.apache.kudu:kudu-spark2_2.11:1.9.0
import org.apache.kudu.client.{KuduPredicate, RowResult}
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.kudu.client.SessionConfiguration
val kuduMaster = "172.26.192.219:7051"
val kuduContext = new KuduContext(kuduMaster, sc)
val kuduClient = kuduContext.syncClient
val kuduTable = kuduClient.openTable("impala::dataone_xishaoye.tbl_order_union")
val kuduSession = kuduClient.newSession()
// flush modes: AUTO_FLUSH_BACKGROUND, AUTO_FLUSH_SYNC, MANUAL_FLUSH
kuduSession.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_SYNC)
kuduSession.setMutationBufferSpace(1000)
val insert = kuduTable.newInsert()
val row = insert.getRow()
// columns can be addressed by index as well as by name
row.addString(0, "hello")
kuduSession.apply(insert)
// under MANUAL_FLUSH or AUTO_FLUSH_BACKGROUND, finish with kuduSession.flush()
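Under MANUAL_FLUSH the session only buffers operations (up to the mutation buffer size set above), and nothing reaches the tablet servers until flush() is called; that is how the raw client batches writes. A minimal sketch reusing kuduTable and kuduSession from above:
kuduSession.setFlushMode(SessionConfiguration.FlushMode.MANUAL_FLUSH)
for (i <- 1 to 100) {
  val ins = kuduTable.newInsert()
  ins.getRow.addString(0, s"hello-$i")
  // apply only buffers the operation in MANUAL_FLUSH mode
  kuduSession.apply(ins)
}
// a single round trip sends the whole batch; inspect the responses for row errors
val responses = kuduSession.flush()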
Select
def query(client: KuduClient, tableName: String) = {
  // open the table
  val table: KuduTable = client.openTable(tableName)
  // client.newScannerBuilder(table).build() yields the KuduScanner used for reads
  val scanner: KuduScanner = client.newScannerBuilder(table).build()
  // loop over the result batches
  while (scanner.hasMoreRows) {
    // scanner.nextRows() returns an iterator over one batch of rows
    val iterator: RowResultIterator = scanner.nextRows()
    // iterate over the rows in the batch
    while (iterator.hasNext) {
      val result: RowResult = iterator.next()
      println(result.getString("word") + " => " + result.getInt("cnt"))
    }
  }
}
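The scanner builder also takes a column projection via setProjectedColumnNames, so only the needed columns travel over the wire. A minimal sketch:
def queryWords(client: KuduClient, tableName: String): Unit = {
  val table = client.openTable(tableName)
  // fetch only the "word" column instead of whole rows
  val scanner = client.newScannerBuilder(table)
    .setProjectedColumnNames(util.Arrays.asList("word"))
    .build()
  while (scanner.hasMoreRows) {
    val batch = scanner.nextRows()
    while (batch.hasNext) {
      println(batch.next().getString("word"))
    }
  }
  scanner.close()
}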
Update
def alterRow(client: KuduClient, tableName: String) = {
  // open the table
  val table: KuduTable = client.openTable(tableName)
  // create a session
  val session: KuduSession = client.newSession()
  // create an update operation
  val update: Update = table.newUpdate()
  // get the row to fill in
  val row: PartialRow = update.getRow
  // set cnt to 8888 for the row whose key "word" is "dam-10"
  row.addString("word", "dam-10")
  row.addInt("cnt", 8888)
  session.apply(update)
  session.close()
}
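Deletes follow the same pattern; table.newDelete only needs the primary key columns filled in. A minimal sketch:
def deleteRow(client: KuduClient, tableName: String, word: String): Unit = {
  val table = client.openTable(tableName)
  val session = client.newSession()
  // a delete is keyed by the primary key columns only
  val delete = table.newDelete()
  delete.getRow.addString("word", word)
  session.apply(delete)
  session.close()
}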
Batch Read
// the short "kudu" format name requires kudu-spark 1.9.0 or later
val df = spark.read.format("kudu")
  .options(Map("kudu.master" -> "master:7051", "kudu.table" -> "impala::test_db.test_table"))
  .load
df.createOrReplaceTempView("tmp_table")
spark.sql("select * from tmp_table limit 10").show()
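KuduContext can also expose the table as an RDD with a column projection via kuduRDD (part of the kudu-spark API); the column names here are placeholders. A minimal sketch:
import org.apache.kudu.spark.kudu.KuduContext
val kuduContext = new KuduContext("master:7051", sc)
// project just two columns; Nil would fetch them all
val rdd = kuduContext.kuduRDD(sc, "impala::test_db.test_table", Seq("id", "name"))
rdd.take(10).foreach(println)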
Batch Write
import org.apache.kudu.spark.kudu.{KuduContext, KuduWriteOptions}
val kuduMaster = "master:7051"
val table = "impala::test_db.test_table"
val kuduContext = new KuduContext(kuduMaster, sc)
// KuduWriteOptions(ignoreDuplicateRowErrors = false, ignoreNull = true)
kuduContext.upsertRows(df, table, new KuduWriteOptions(false, true))
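upsertRows is one of several DataFrame write paths on KuduContext; insertRows, updateRows, and deleteRows take the same (DataFrame, table name) arguments. A short sketch of the insert/delete pair, assuming "id" is the primary key:
// strict insert: errors on duplicate keys instead of overwriting
kuduContext.insertRows(df, table)
// deleteRows matches rows by the primary key columns of the DataFrame
kuduContext.deleteRows(df.select("id"), table)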
Point Read / Predicate Read
cd $SPARK_HOME
bin/spark-shell --packages org.apache.kudu:kudu-spark2_2.11:1.9.0
import org.apache.kudu.client.{KuduPredicate, RowResult}
import org.apache.kudu.spark.kudu.KuduContext
val kuduMaster = "master:7051"
val tableName = "impala::test_db.test_table"
val kuduContext = new KuduContext(kuduMaster, sc)
val table = kuduContext.syncClient.openTable(tableName)
// equality predicate on the "id" column
val predicate = KuduPredicate.newComparisonPredicate(table.getSchema().getColumn("id"), KuduPredicate.ComparisonOp.EQUAL, "testid")
val scanner = kuduContext.syncClient.newScannerBuilder(table).addPredicate(predicate).build()
scanner.hasMoreRows
val rows = scanner.nextRows
rows.hasNext
val row = rows.next
println(row.getString(0))
Complete Example
package com.is

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.kudu.spark.kudu._

object SparkKuduWrite {
  def main(args: Array[String]): Unit = {
    // three arguments are required
    if (args.length < 3) {
      println("Usage: SparkKuduWrite <data_path> <kudu_table_name> <kudu_master_hosts>")
      System.exit(1)
    }
    val data_path = args(0)
    val kudu_table_name = args(1)
    val kudu_master_hosts = args(2)
    println(data_path)
    println(kudu_table_name)
    println(kudu_master_hosts)

    val conf = new SparkConf().setAppName("stra_platform_test")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    val kuduContext = new KuduContext(kudu_master_hosts, sc)
    val df = spark.read.load(data_path)
    // the KuduContext exposes all Kudu operations
    kuduContext.upsertRows(df, kudu_table_name)
  }
}
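To run it, submit with the kudu-spark2 package on the classpath; the jar name and arguments below are placeholders:
bin/spark-submit \
  --packages org.apache.kudu:kudu-spark2_2.11:1.12.0 \
  --class com.is.SparkKuduWrite \
  spark-kudu-example.jar \
  /path/to/input.parquet impala::test_db.test_table master:7051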