package com.xcu.bigdata.spark.core.pg02_broadcast
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @Desc : Declaration and use of a Spark broadcast variable
 */
object Spark01_Broadcast {
  /**
   * Demonstrates a map-side join of an RDD against a small local list,
   * first by capturing the list in the task closure, then via a broadcast
   * variable (shipped once per executor instead of once per task).
   */
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration.
    val conf: SparkConf = new SparkConf().setAppName("Spark01_Broadcast").setMaster("local[*]")
    // Create the SparkContext — the entry point for submitting the job.
    val sc = new SparkContext(conf)

    // Create the RDD and the small local list to join against.
    val rdd: RDD[(String, Int)] = sc.makeRDD(List(("a", 1), ("b", 2), ("c", 2)))
    val list: List[(String, Int)] = List(("a", 4), ("b", 5), ("c", 6))

    // Without a broadcast variable: `list` is captured by the closure, so a
    // copy is serialized and shipped with EVERY task — very inefficient.
    // Join-like result: (a,(1,4)), (b,(2,5)), (c,(2,6))
    val resRDD: RDD[(String, (Int, Int))] = rdd.map { case (k1, v1) =>
      // Look up k1 in the local list; default to 0 when the key is absent.
      val v3 = list.collectFirst { case (k2, v2) if k1 == k2 => v2 }.getOrElse(0)
      (k1, (v1, v3))
    }
    // Print the result.
    resRDD.collect().foreach(println)

    //********************Using a broadcast variable********************
    println("**************************************")

    // Declare the broadcast variable: the list is shipped once per executor.
    val broadcastList: Broadcast[List[(String, Int)]] = sc.broadcast(list)
    // Fixed: the result type was widened to (Int, Any) and the tuple was
    // built as (v3, v1), inconsistent with the non-broadcast version above.
    val resRDD1: RDD[(String, (Int, Int))] = rdd.map { case (k1, v1) =>
      // Read the broadcast value inside the task.
      val v3 = broadcastList.value.collectFirst { case (k2, v2) if k1 == k2 => v2 }.getOrElse(0)
      (k1, (v1, v3))
    }
    // Print the result.
    resRDD1.collect().foreach(println)

    // Release resources.
    sc.stop()
  }
}