package cn.edu360.day8

import org.apache.spark.sql.{DataFrame, SparkSession}

/**
* Created by zx on 2017/10/16.
*/
object JoinTest {

  def main(args: Array[String]): Unit = {

    // Local SparkSession using all available cores; app name kept as in the
    // original demo.
    val spark = SparkSession.builder().appName("CsvDataSource")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._
    //import org.apache.spark.sql.functions._

    // Disable broadcast joins: the threshold defaults to 10MB and a value of
    // -1 turns the optimization off entirely, forcing the planner to pick a
    // shuffle-based join strategy so we can observe it in explain().
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
    //spark.conf.set("spark.sql.join.preferSortMergeJoin", true)

    //println(spark.conf.get("spark.sql.autoBroadcastJoinThreshold"))

    val df1 = Seq(
      (0, "playing"),
      (1, "with"),
      (2, "join")
    ).toDF("id", "token")

    val df2 = Seq(
      (0, "P"),
      (1, "W"),
      (2, "S")
    ).toDF("aid", "atoken")

    // BUG FIX: the original code called `df2.repartition()` and discarded the
    // result. DataFrames are immutable — repartition returns a NEW DataFrame —
    // so that call had no effect and has been removed.

    //df1.cache().count()

    // Equi-join the two frames on id == aid. With broadcast joins disabled
    // above, the physical plan falls back to a sort-merge join.
    val result: DataFrame = df1.join(df2, $"id" === $"aid")

    // Print the physical execution plan to stdout.
    result.explain()

    result.show()

    spark.stop()
  }

}

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

Broadcast Join 的阈值默认是 10MB;设为 -1 代表禁用 Broadcast Join,也可以将其修改为其他大小。

Sort Merge Join:先对每个表按 join key 排序,然后进行 join(Spark 2.0 起为默认的 shuffle join 策略)。

Shuffle Hash Join:对 join key 取 hash,将数据分发到不同分区后进行 join 操作(并非默认策略;默认为 Sort Merge Join)。