1. Connecting to Kudu from PySpark

pyspark --jars /home/oicq/guomm/kudu-spark2_2.11-1.6.0.jar  # launch the PySpark shell with the Kudu connector jar

from pyspark.sql import SQLContext
sqlContext = SQLContext(spark)  # create an SQL context from the shell's SparkSession

# read a Kudu table
df = sqlContext.read.format('org.apache.kudu.spark.kudu') \
               .options(**{"kudu.master": "127.0.0.1:7051", "kudu.table": "python-example"}) \
               .load()

# write to a Kudu table
df.write.format('org.apache.kudu.spark.kudu') \
        .option('kudu.master', '127.0.0.1:7051') \
        .option('kudu.table', 'python-example1') \
        .mode('append') \
        .save()

2. Connecting to Kudu from Scala Spark

pom.xml

<!-- Spark / Kudu integration -->
<dependency>
  <groupId>org.apache.kudu</groupId>
  <artifactId>kudu-spark2_2.11</artifactId>
  <version>1.12.0</version>
</dependency>
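
The table-management snippets below call the Kudu Java client API directly (kudu-client is normally pulled in transitively by the connector). A minimal sketch of the imports they assume:

import java.util

import org.apache.kudu.{ColumnSchema, Schema, Type}
import org.apache.kudu.client._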

Creating a table

/**
 * main method
 */
def main(args: Array[String]): Unit = {
  // Kudu master address
  val KUDU_MASTERS = "localhost"

  // build a KuduClient via KuduClient.KuduClientBuilder, passing in the master address
  val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()

  // table name
  val tableName = "ods"

  // call createTable with the client and the table name
  createTable(client, tableName)

  // close the client
  client.close()
}


def createTable(client: KuduClient, tableName: String): Unit = {
  // needed to convert the Scala List into a java.util.List
  import scala.collection.JavaConverters._

  // build a Java list of ColumnSchema objects, one per column
  // ColumnSchemaBuilder takes the column name and its Type;
  // .key(true) marks the column as part of the primary key
  val columns = List(
    new ColumnSchema.ColumnSchemaBuilder("word", Type.STRING).key(true).build(),
    new ColumnSchema.ColumnSchemaBuilder("cnt", Type.INT32).build()
  ).asJava

  // wrap the column definitions in a Schema
  val schema = new Schema(columns)

  // CreateTableOptions holds the table-creation options
  val options: CreateTableOptions = new CreateTableOptions()

  // replication factor of 1
  options.setNumReplicas(1)

  // the columns used for hash partitioning go into a Java list
  val parcols: util.LinkedList[String] = new util.LinkedList[String]()

  // partition on the "word" column
  parcols.add("word")

  // addHashPartitions is required; it takes
  // 1. the list of columns to hash on (there can be several, hence a list)
  // 2. the number of buckets
  options.addHashPartitions(parcols, 3)

  // client.createTable takes three arguments:
  // 1. table name
  // 2. schema
  // 3. table-creation options
  client.createTable(tableName, schema, options)
}

Renaming a table

def renameTable(client: KuduClient, tableName: String, newTableName: String) = {
  // AlterTableOptions holds the alteration to apply
  val options: AlterTableOptions = new AlterTableOptions()

  // set the new table name via options.renameTable
  options.renameTable(newTableName)

  // client.alterTable applies it; the arguments are the old table name and the AlterTableOptions above
  client.alterTable(tableName, options)
}

Dropping a table

def deleteTable(client: KuduClient, tableName: String) = {
  // client.deleteTable is all that is needed
  client.deleteTable(tableName)
}
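
Dropping a table that does not exist throws an exception; a minimal sketch that guards the call with KuduClient.tableExists (assuming the same client as above):

def deleteTableIfExists(client: KuduClient, tableName: String): Unit = {
  // only drop the table when it actually exists
  if (client.tableExists(tableName)) {
    client.deleteTable(tableName)
  }
}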

Insert

def insertRows(client: KuduClient, tableName: String) = {
  // open the table via client.openTable
  val table: KuduTable = client.openTable(tableName)

  // a KuduSession is required for write operations
  val session: KuduSession = client.newSession()

  // insert rows in a loop
  for (i <- 1 to 10) {
    // table.newInsert creates a new insert operation
    val insert: Insert = table.newInsert()

    // insert.getRow exposes the row to fill in
    val row: PartialRow = insert.getRow

    // set the column values
    row.addString("word", s"dam-$i")
    row.addInt("cnt", 100 + i)

    // session.apply submits the operation
    session.apply(insert)
  }

  // close the session so any buffered operations are flushed
  session.close()
}
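
By default the session flushes each operation synchronously; for larger batches the flush mode can be changed (the same modes listed in the spark-shell example below). A sketch, assuming the same session as above:

// buffer operations and let a background thread flush them
session.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_BACKGROUND)
session.setMutationBufferSpace(1000)
// ... apply the inserts ...
session.flush()  // force any remaining buffered operations out
session.close()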

Insert (2): via spark-shell

cd $SPARK_HOME
bin/spark-shell --packages org.apache.kudu:kudu-spark2_2.11:1.9.0

import org.apache.kudu.client.{KuduPredicate, RowResult}
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.kudu.client.SessionConfiguration

val kuduMaster = "172.26.192.219:7051"

val kuduContext = new KuduContext(kuduMaster, sc)
val kuduClient = kuduContext.syncClient
val kuduTable = kuduClient.openTable("impala::dataone_xishaoye.tbl_order_union")
val kuduSession = kuduClient.newSession()

// available flush modes: AUTO_FLUSH_BACKGROUND, AUTO_FLUSH_SYNC, MANUAL_FLUSH
kuduSession.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_SYNC)
kuduSession.setMutationBufferSpace(1000)

val insert = kuduTable.newInsert()
val row = insert.getRow()
row.addString(0, "hello")
kuduSession.apply(insert)
//kuduSession.flush

Select

def query(client: KuduClient, tableName: String) = {
  // open the table
  val table: KuduTable = client.openTable(tableName)

  // build a KuduScanner via client.newScannerBuilder(table).build()
  val scanner: KuduScanner = client.newScannerBuilder(table).build()

  // loop while the scanner still has rows
  while (scanner.hasMoreRows) {
    // scanner.nextRows() returns an iterator over the next batch of rows
    val iterator: RowResultIterator = scanner.nextRows()

    // iterate over the batch
    while (iterator.hasNext) {
      val result: RowResult = iterator.next()
      println(result.getString("word") + " => " + result.getInt("cnt"))
    }
  }
}

Update

def alterRow(client: KuduClient, tableName: String) = {
  // open the table
  val table: KuduTable = client.openTable(tableName)

  // create a session
  val session: KuduSession = client.newSession()

  // create an update operation
  val update: Update = table.newUpdate()

  // get the row to fill in
  val row: PartialRow = update.getRow

  // set cnt to 8888 for the row whose primary key word is "dam-10"
  row.addString("word", "dam-10")
  row.addInt("cnt", 8888)
  session.apply(update)
}
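
When it is not known whether the key already exists, Kudu also supports upsert (insert-or-update). A minimal sketch mirroring the update above (the row values are just placeholders):

def upsertRow(client: KuduClient, tableName: String) = {
  val table: KuduTable = client.openTable(tableName)
  val session: KuduSession = client.newSession()

  // newUpsert inserts the row if the key is absent, otherwise updates it
  val upsert: Upsert = table.newUpsert()
  val row: PartialRow = upsert.getRow
  row.addString("word", "dam-11")
  row.addInt("cnt", 9999)
  session.apply(upsert)
}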

Batch Read

val df = spark.read.format("kudu")
      .options(Map("kudu.master" -> "master:7051", "kudu.table" -> "impala::test_db.test_table"))
      .load
df.createOrReplaceTempView("tmp_table")
spark.sql("select * from tmp_table limit 10").show()
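
Instead of SQL on a temp view, the same filter can be expressed through the DataFrame API; the connector pushes compatible comparison predicates down to Kudu. A small sketch (the column name id is only an assumption here):

import org.apache.spark.sql.functions.col

// the equality predicate on "id" can be pushed down to Kudu
df.filter(col("id") === "testid").show()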

Batch Write

import org.apache.kudu.spark.kudu.{KuduContext, KuduWriteOptions}

val kuduMaster = "master:7051"
val table = "impala::test_db.test_table"

val kuduContext = new KuduContext(kuduMaster, sc)

kuduContext.upsertRows(df, table, new KuduWriteOptions(false, true))
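
The two booleans passed to KuduWriteOptions above are, assuming the 1.9.x constructor, ignoreDuplicateRowErrors and ignoreNull. Besides upsertRows, the KuduContext exposes the other write paths as well; a sketch (the key column id used for deleteRows is an assumption):

// insertRows fails on duplicate primary keys, upsertRows overwrites them
kuduContext.insertRows(df, table)
// updateRows modifies existing rows only
kuduContext.updateRows(df, table)
// deleteRows expects a DataFrame holding just the primary key columns (here assumed to be "id")
kuduContext.deleteRows(df.select("id"), table)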

Single-row / predicate read

cd $SPARK_HOME
bin/spark-shell --packages org.apache.kudu:kudu-spark2_2.11:1.9.0

import org.apache.kudu.client.{KuduPredicate, RowResult}
import org.apache.kudu.spark.kudu.KuduContext

val kuduMaster = "master:7051"
val tableName = "impala::test_db.test_table"

val kuduContext = new KuduContext(kuduMaster, sc)
val table = kuduContext.syncClient.openTable(tableName)
val predicate = KuduPredicate.newComparisonPredicate(table.getSchema().getColumn("id"), KuduPredicate.ComparisonOp.EQUAL, "testid")
val scanner = kuduContext.syncClient.newScannerBuilder(table).addPredicate(predicate).build()

scanner.hasMoreRows
val rows = scanner.nextRows
rows.hasNext
val row = rows.next

println(row.getString(0))

Complete example

package com.is

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

import org.apache.kudu.spark.kudu._

object SparkKuduWrite {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("Usage: SparkKuduWrite <data_path> <kudu_table_name> <kudu_master_hosts>")
      System.exit(1)
    }
    val data_path = args(0)
    val kudu_table_name = args(1)
    val kudu_master_hosts = args(2)

    println(data_path)
    println(kudu_table_name)
    println(kudu_master_hosts)

    val conf = new SparkConf().setAppName("stra_platform_test")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext

    import spark.implicits._

    val kuduContext = new KuduContext(kudu_master_hosts, sc)
    val df = spark.read.load(data_path)
    // the KuduContext gives access to all Kudu operations
    kuduContext.upsertRows(df, kudu_table_name)
  }
}
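
One possible way to submit the job (the jar name and paths below are placeholders):

spark-submit --class com.is.SparkKuduWrite \
  --packages org.apache.kudu:kudu-spark2_2.11:1.9.0 \
  spark-kudu-example.jar \
  /path/to/input_data impala::test_db.test_table master:7051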