Union
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demonstrates `union` followed by `distinct` on two RDDs of integers.
  */
object Demo8Union {

  /**
    * Builds two small in-memory RDDs, unions them, removes the duplicate
    * elements and prints every remaining element.
    *
    * @param args command-line arguments; not used
    */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the app name "map" looks carried over from another demo; kept unchanged.
    val conf: SparkConf = new SparkConf()
      .setAppName("map")
      .setMaster("local")

    // Spark context object
    val sc = new SparkContext(conf)

    // Stop the context even when a job fails, so its resources are always released.
    try {
      val rdd1: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6))
      val rdd2: RDD[Int] = sc.parallelize(List(4, 5, 6, 7, 8, 9))

      // union: merges two RDDs; both must have the same element type. It does not deduplicate.
      val unionRDD: RDD[Int] = rdd1.union(rdd2)

      // distinct: removes the duplicates left behind by union
      val distinctRDD: RDD[Int] = unionRDD.distinct()

      distinctRDD.foreach(println)
    } finally {
      sc.stop()
    }
  }
}
Join
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demonstrates inner, left outer and full outer joins between two key-value RDDs
  * built from comma-separated text files.
  */
object Demo9Join {

  /** Placeholder emitted for the side of an outer join that has no matching record. */
  private val Missing: String = "默认"

  /**
    * Turns each comma-separated line into a (key, line) pair, where the key is the
    * first field of the line and the value is the whole, unmodified line.
    */
  private def keyByFirstField(lines: RDD[String]): RDD[(String, String)] =
    lines.map(line => (line.split(",")(0), line))

  /**
    * Reads `data/students.txt` and `data/score.txt`, joins them on their first field
    * in three different ways and prints the result of each join.
    *
    * @param args command-line arguments; not used
    */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the app name "map" looks carried over from another demo; kept unchanged.
    val conf: SparkConf = new SparkConf()
      .setAppName("map")
      .setMaster("local")

    // Spark context object
    val sc = new SparkContext(conf)

    // Stop the context even when a job fails, so its resources are always released.
    try {
      // Read the student table
      val students: RDD[String] = sc.textFile("data/students.txt")
      // Read the score table
      val scores: RDD[String] = sc.textFile("data/score.txt")

      // Convert both RDDs to key-value form: first field (student id) as key, whole line as value
      val studentkvRDD: RDD[(String, String)] = keyByFirstField(students)
      val scoreKVRDD: RDD[(String, String)] = keyByFirstField(scores)

      /**
        * join: an inner join by default; records are matched by key.
        */
      val innerJoinRDD: RDD[(String, (String, String))] = studentkvRDD.join(scoreKVRDD)

      // Tidy up the joined data: field 1 of the student line paired with field 2 of the score line
      val resultRDD: RDD[(String, String)] = innerJoinRDD.map {
        case (_, (studentInfo, scoreInfo)) =>
          val name: String = studentInfo.split(",")(1)
          val score: String = scoreInfo.split(",")(2)
          (name, score)
      }
      // Transformations are lazy; without an action the inner join above would never run.
      resultRDD.foreach(println)

      /**
        * leftOuterJoin: the left side is the base; when the right side has no match
        * its value is None (an Option, not null).
        */
      val leftOuterJoinRDD: RDD[(String, (String, Option[String]))] =
        studentkvRDD.leftOuterJoin(scoreKVRDD)

      val leftOuterResultRDD: RDD[String] = leftOuterJoinRDD.map {
        // getOrElse covers both the matched (Some) and the unmatched (None) case
        case (_, (studentInfo, scoreInfo)) =>
          studentInfo + "\t" + scoreInfo.getOrElse(Missing)
      }
      leftOuterResultRDD.foreach(println)

      /**
        * fullOuterJoin: either side may be missing, so both values are Options.
        */
      val fullOuterJoinRDD: RDD[(String, (Option[String], Option[String]))] =
        studentkvRDD.fullOuterJoin(scoreKVRDD)

      val fullOuterResultRDD: RDD[String] = fullOuterJoinRDD.map {
        // Total over every Option combination, so no MatchError is possible
        case (_, (studentInfo, scoreInfo)) =>
          studentInfo.getOrElse(Missing) + "\t" + scoreInfo.getOrElse(Missing)
      }
      // The mapped result was previously discarded; an action is needed to execute and show it.
      fullOuterResultRDD.foreach(println)
    } finally {
      sc.stop()
    }
  }
}