union 并集
// union: concatenates two RDDs of the same element type into one RDD.
SparkConf conf = new SparkConf();
conf.setMaster("local").setAppName("union");
JavaSparkContext sc = new JavaSparkContext(conf);
try {
    // rdd1 is created with 3 slices and rdd2 with 2 slices.
    JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1,2,3),3);
    JavaRDD<Integer> rdd2 = sc.parallelize(Arrays.asList(4,5,6),2);
    JavaRDD<Integer> union = rdd1.union(rdd2);
    // Print how many partitions the unioned RDD ends up with.
    System.out.println("union.partitions().size()---"+union.partitions().size());
    // Print every element of the unioned RDD, one per line.
    union.foreach(new VoidFunction<Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(Integer t) throws Exception {
            System.out.println(t);
        }
    });
} finally {
    // Stop the context even if one of the steps above throws,
    // so a failed run does not leave the SparkContext running.
    sc.stop();
}
结果
union.partitions().size()---5
1
2
3
4
5
6
intersection 交集
注意:两个RDD的元素类型要一致
// intersection: keeps only the elements that appear in both RDDs.
// Both RDDs must have the same element type (String here).
SparkConf conf = new SparkConf();
conf.setMaster("local").setAppName("intersection");
JavaSparkContext sc = new JavaSparkContext(conf);
try {
    JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("a","b","c"));
    JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("a","e","f"));
    JavaRDD<String> intersection = rdd1.intersection(rdd2);
    // Print the partition count of the resulting RDD.
    System.out.println(intersection.partitions().size());
    // Print every element common to rdd1 and rdd2, one per line.
    intersection.foreach(new VoidFunction<String>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });
} finally {
    // Stop the context even if one of the steps above throws,
    // so a failed run does not leave the SparkContext running.
    sc.stop();
}
subtract 差集
// subtract: set difference -- keeps the elements of the receiver RDD
// that do not appear in the argument RDD.
SparkConf conf = new SparkConf();
conf.setMaster("local").setAppName("subtract");
JavaSparkContext sc = new JavaSparkContext(conf);
try {
    JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("a","b","c"));
    JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("a","e","f"));
    // subtract takes the difference; both RDDs must have the same element type.
    // Alternative direction (elements of rdd1 that are not in rdd2):
    // JavaRDD<String> subtract = rdd1.subtract(rdd2);
    // Active direction: elements of rdd2 that are not in rdd1.
    JavaRDD<String> subtract = rdd2.subtract(rdd1);
    // Print every remaining element, one per line.
    subtract.foreach(new VoidFunction<String>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });
} finally {
    // Stop the context even if one of the steps above throws,
    // so a failed run does not leave the SparkContext running.
    sc.stop();
}
结果
第一个结果(rdd1.subtract(rdd2))
b
c
第二个结果(rdd2.subtract(rdd1))
e
f