文章目录


原始数据集

使用 parallelizePairs 将 List 转换成 JavaPairRDD

SparkConf conf = new SparkConf();
conf.setMaster("local").setAppName("join");
JavaSparkContext sc = new JavaSparkContext(conf);

JavaPairRDD<Integer, String> nameRDD = sc.parallelizePairs(Arrays.asList(
new Tuple2<Integer, String>(0, "aa"),
new Tuple2<Integer, String>(1, "a"),
new Tuple2<Integer, String>(2, "b"),
new Tuple2<Integer, String>(3, "c")
));

JavaPairRDD<Integer, Integer> scoreRDD = sc.parallelizePairs(Arrays.asList(
new Tuple2<Integer, Integer>(1, 100),
new Tuple2<Integer, Integer>(2, 200),
new Tuple2<Integer, Integer>(3, 300),
new Tuple2<Integer, Integer>(4, 400)
));

join

// Inner join on the Integer key; the second argument (3) is the number of
// partitions of the resulting RDD.
JavaPairRDD<Integer, Tuple2<String, Integer>> join = nameRDD.join(scoreRDD, 3);

int joinPartitionCount = join.partitions().size();
System.out.println("join.partitions().size()--------" + joinPartitionCount);

// Callback that prints each joined (key, (name, score)) record as-is.
VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>> printJoinedRecord =
        new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<Integer, Tuple2<String, Integer>> joinedRecord) throws Exception {
                System.out.println(joinedRecord);
            }
        };
join.foreach(printJoinedRecord);
}

结果

(1,(a,100))
(3,(c,300))
(2,(b,200))

注意:join 的第二个参数(上例中的 3)指定的是结果 RDD 的分区数。

leftOuterJoin

// Left outer join: the score side of each result is wrapped in an Optional,
// which is checked with isPresent() before its value is read.
JavaPairRDD<Integer, Tuple2<String, Optional<Integer>>> leftOuterJoin = nameRDD.leftOuterJoin(scoreRDD);
System.out.println("leftOuterJoin.partitions().size()--------" + leftOuterJoin.partitions().size());
leftOuterJoin.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Optional<Integer>>>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public void call(Tuple2<Integer, Tuple2<String, Optional<Integer>>> t) throws Exception {
        // Tuple elements are read through the _1()/_2() accessor methods.
        Optional<Integer> score = t._2()._2();
        // Print the bare score first, but only when the Optional holds a value.
        if (score.isPresent()) {
            System.out.println(score.get());
        }
        // Always print the full (key, (name, Optional<score>)) record.
        System.out.println(t);
    }
});

结果

(0,(aa,Optional.absent()))
100
(1,(a,Optional.of(100)))
300
(3,(c,Optional.of(300)))
200
(2,(b,Optional.of(200)))

rightOuterJoin

代码

// Right outer join: the name side of each result is wrapped in an Optional,
// which is checked with isPresent() before its value is read.
JavaPairRDD<Integer, Tuple2<Optional<String>, Integer>> rightOuterJoin = nameRDD.rightOuterJoin(scoreRDD);
// The label names the RDD whose partition count is actually printed.
System.out.println("rightOuterJoin.partitions().size()--------" + rightOuterJoin.partitions().size());
rightOuterJoin.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Optional<String>, Integer>>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public void call(Tuple2<Integer, Tuple2<Optional<String>, Integer>> t) throws Exception {
        // Tuple elements are read through the _1()/_2() accessor methods.
        Tuple2<Optional<String>, Integer> nameAndScore = t._2();
        // ① the (Optional<name>, score) value of the record.
        System.out.println(nameAndScore + "①");
        // ② the bare name, only when the Optional holds a value.
        if (nameAndScore._1().isPresent()) {
            System.out.println(nameAndScore._1().get() + "②");
        }
        // ③ the full (key, (Optional<name>, score)) record.
        System.out.println(t + "③");
    }
});

结果

(Optional.absent(),400)①
(4,(Optional.absent(),400))③
(Optional.of(a),100)①
a②
(1,(Optional.of(a),100))③
(Optional.of(c),300)①
c②
(3,(Optional.of(c),300))③
(Optional.of(b),200)①
b②
(2,(Optional.of(b),200))③

fullOuterJoin

// Full outer join: both sides of each result are wrapped in an Optional,
// and each one is checked with isPresent() before its value is read.
JavaPairRDD<Integer, Tuple2<Optional<String>, Optional<Integer>>> fullOuterJoin = nameRDD.fullOuterJoin(scoreRDD);
// The label names the RDD whose partition count is actually printed.
System.out.println("fullOuterJoin.partitions().size()--------" + fullOuterJoin.partitions().size());
fullOuterJoin.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Optional<String>, Optional<Integer>>>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public void call(Tuple2<Integer, Tuple2<Optional<String>, Optional<Integer>>> t) throws Exception {
        // Tuple elements are read through the _1()/_2() accessor methods.
        Tuple2<Optional<String>, Optional<Integer>> nameAndScore = t._2();
        // ① the name, only when the name-side Optional holds a value.
        if (nameAndScore._1().isPresent()) {
            System.out.println(nameAndScore._1().get() + "①");
        }
        // ② the score, only when the score-side Optional holds a value.
        if (nameAndScore._2().isPresent()) {
            System.out.println(nameAndScore._2().get() + "②");
        }
    }
});

结果

400②
19/04/23 01:20:32 INFO TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool
aa①
a①
19/04/23 01:20:32 INFO DAGScheduler: ResultStage 2 (foreach at OperatorJion.java:35) finished in 0.094 s
100②
c①
300②
b①
200②