import org.apache.spark.rdd.RDD
import org.apache.spark.{HashPartitioner, Partitioner, SparkConf, SparkContext}
object TransformationOperator {
/**
* map operator
* Applies a function to every element of the RDD and returns a new RDD of the results.
*/
def Map: Unit = {
val conf = new SparkConf().setAppName("map").setMaster("local")
val sc = new SparkContext(conf)
val list = List("张无忌","赵敏","周芷若")
val rdd: RDD[String] = sc.parallelize(list)
val helloRdd = rdd.map(name => "hello" + name)
helloRdd.foreach(nn => println(nn))
}
/**
* Result:
* hello张无忌
* hello赵敏
* hello周芷若
*/
/**
* flatMap operator
* Maps each element to zero or more elements and flattens the results into a single RDD.
*/
def FlatMap: Unit ={
val conf = new SparkConf().setAppName("flatMap").setMaster("local")
val sc = new SparkContext(conf)
val list: List[String] = List("张无忌 赵敏","宋青书 周芷若")
val rdd: RDD[String] = sc.parallelize(list)
val line: RDD[String] = rdd.flatMap(line => line.split(" ").map(name =>"hello" + name))
line.foreach(nn => println(nn))
}
/**
* Result:
* hello张无忌
* hello赵敏
* hello宋青书
* hello周芷若
*/
/**
* filter operator
* Keeps only the elements of the RDD that satisfy the given predicate.
*/
def Filter: Unit ={
val conf = new SparkConf().setAppName("filter").setMaster("local")
val sc = new SparkContext(conf)
val list: List[Int] = List(1,2,3,4,5,6,7,8)
val rdd: RDD[Int] = sc.parallelize(list)
rdd.filter(num => num % 2 == 0).foreach(nn => println(nn))
}
/**
* Result:
* 2
* 4
* 6
* 8
*/
/**
* groupByKey operator
* Groups the values for each key; an optional numPartitions argument sets the number of tasks.
*/
def GroupBykey: Unit ={
val conf = new SparkConf().setAppName("groupBykey").setMaster("local")
val sc = new SparkContext(conf)
val list: List[(String, String)] = List(
new Tuple2("峨眉", "周芷若"),
new Tuple2("武当", "宋青书"),
new Tuple2("峨眉", "灭绝师太"),
new Tuple2("武当", "张三丰")
)
val rdd: RDD[(String, String)] = sc.parallelize(list)
rdd.groupByKey().foreach(menpai => {
print("menpai" + menpai._1 + " ")
menpai._2.foreach(nn => print(nn))
println()
})
}
/**
* Result:
* menpai峨眉 周芷若灭绝师太
* menpai武当 宋青书张三丰
*/
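/**
* A minimal extra sketch (not part of the original listing) of the optional numPartitions
* argument mentioned above: groupByKey(2) groups the same data but spreads the result
* across two partitions, i.e. two tasks.
*/
def GroupBykeyWithPartitions: Unit = {
val conf = new SparkConf().setAppName("groupBykeyWithPartitions").setMaster("local")
val sc = new SparkContext(conf)
val list = List(("峨眉", "周芷若"), ("武当", "宋青书"), ("峨眉", "灭绝师太"), ("武当", "张三丰"))
val rdd: RDD[(String, String)] = sc.parallelize(list)
// the numeric argument sets the number of partitions (and therefore tasks) of the result
rdd.groupByKey(2).foreach(tuple => println(tuple._1 + " -> " + tuple._2.mkString(",")))
}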
/**
* reduceByKey operator
* Merges the values for each key with the given function (sum, max/min, etc.).
* The number of partitions can be specified as an optional argument.
*/
def ReduceBykey: Unit ={
val conf = new SparkConf().setAppName("reduceBykey").setMaster("local")
val sc: SparkContext = new SparkContext(conf)
val list: List[(String, Int)] = List(
new Tuple2("峨眉", 40),
new Tuple2("武当", 30),
new Tuple2("峨眉", 60),
new Tuple2("武当", 99)
)
val rdd: RDD[(String, Int)] = sc.parallelize(list)
rdd.reduceByKey((v1,v2) => v1 + v2).foreach(tuple => println(tuple._1 + tuple._2))
}
/**
* Result:
* 峨眉100
* 武当129
*/
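/**
* A minimal extra sketch (not part of the original listing) of the "max" case mentioned above:
* the same pair RDD reduced with math.max keeps only the largest value per key
* (expected: 峨眉 -> 60 and 武当 -> 99).
*/
def ReduceBykeyMax: Unit = {
val conf = new SparkConf().setAppName("reduceBykeyMax").setMaster("local")
val sc = new SparkContext(conf)
val list = List(("峨眉", 40), ("武当", 30), ("峨眉", 60), ("武当", 99))
val rdd: RDD[(String, Int)] = sc.parallelize(list)
// keep the maximum value observed for each key
rdd.reduceByKey((v1, v2) => math.max(v1, v2)).foreach(tuple => println(tuple._1 + " -> " + tuple._2))
}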
/**
* sortByKey operator
* Sorts the RDD by key.
* Argument 1: true for ascending, false for descending.
* Argument 2 (optional): number of partitions.
*/
def SortBykey: Unit ={
val conf = new SparkConf().setAppName("sortBykey").setMaster("local")
val sc = new SparkContext(conf)
val list: List[(Int, String)] = List((98,"东方不败"),(80,"岳不群"),(85,"令狐冲"),(83,"任我行"))
val rdd: RDD[(Int, String)] = sc.parallelize(list)
rdd.sortByKey(false).foreach(tuple => println(tuple._1 + "->" + tuple._2))
}
/**
* Result:
* 98->东方不败
* 85->令狐冲
* 83->任我行
* 80->岳不群
*/
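/**
* A minimal extra sketch (not part of the original listing) of the two arguments described
* above: ascending order and an explicit partition count.
*/
def SortBykeyAscending: Unit = {
val conf = new SparkConf().setAppName("sortBykeyAscending").setMaster("local")
val sc = new SparkContext(conf)
val list = List((98, "东方不败"), (80, "岳不群"), (85, "令狐冲"), (83, "任我行"))
val rdd: RDD[(Int, String)] = sc.parallelize(list)
// true -> ascending; 1 -> write the sorted result into a single partition
rdd.sortByKey(true, 1).foreach(tuple => println(tuple._1 + "->" + tuple._2))
}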
/**
* join operator
* Joins two RDDs by key; the number of partitions can be specified (optional).
* Only keys present in both RDDs are kept; each result element has the shape (key, (value1, value2)).
*/
def Join: Unit ={
val conf = new SparkConf().setAppName("join").setMaster("local")
val sc = new SparkContext(conf)
val list1: List[(Int, String)] = List((1, "东方不败"),(2, "令狐冲"),(3, "林平之"))
val list2: List[(Int, Int)] = List((1, 99),(2, 98),(3, 97))
val rdd1: RDD[(Int, String)] = sc.parallelize(list1)
val rdd2: RDD[(Int, Int)] = sc.parallelize(list2)
rdd1.join(rdd2)
.foreach(tuple => println("学号:" + tuple._1 + " 名字:" + tuple._2._1 + " 分数:" + tuple._2._2))
}
/**
* Result:
* 学号:1 名字:东方不败 分数:99
* 学号:3 名字:林平之 分数:97
* 学号:2 名字:令狐冲 分数:98
*/
/**
* union operator
* Concatenates two RDDs of the same element type (duplicates are kept).
*/
def Union: Unit ={
val conf = new SparkConf().setAppName("union").setMaster("local")
val sc = new SparkContext(conf)
val list1: List[Int] = List(1, 2, 3, 4)
val list2: List[Int] = List(3, 4, 5, 6)
val rdd1: RDD[Int] = sc.parallelize(list1)
val rdd2: RDD[Int] = sc.parallelize(list2)
rdd1.union(rdd2).foreach(nn => println(nn))
}
/**
* Result (the INFO lines mark where the second partition's task starts):
* 1
* 2
* 3
* 4
* 18/04/23 20:54:32 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 751 bytes result sent to driver
* 18/04/23 20:54:32 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, localhost, executor driver, partition 1, PROCESS_LOCAL, 7905 bytes)
* 18/04/23 20:54:32 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)
* 3
* 4
* 5
* 6
*
*/
/**
* intersection operator
* A.intersection(B) returns the elements that appear in both A and B.
*/
def Intersection: Unit ={
val conf: SparkConf = new SparkConf().setAppName("intersection").setMaster("local")
val sc: SparkContext = new SparkContext(conf)
val list1: List[Int] = List(1, 2, 3, 4)
val list2: List[Int] = List(3, 4, 5, 6)
val rdd1: RDD[Int] = sc.parallelize(list1)
val rdd2: RDD[Int] = sc.parallelize(list2)
rdd1.intersection(rdd2).foreach(nn => println(nn))
}
/**
* Result:
* 4
* 3
*/
/**
* distinct operator
* Returns a new dataset containing the distinct elements of the source dataset.
* Deduplication only removes elements that are entirely equal.
*/
def Distinct: Unit ={
val conf = new SparkConf().setAppName("distinct").setMaster("local")
val sc = new SparkContext(conf)
val list = List(1, 2, 3,3,4,4)
val rdd = sc.parallelize(list)
rdd.distinct().foreach(nn => println(nn))
}
/**
* Result:
* 4
* 1
* 3
* 2
*/
/**
* cartesian operator
* Cartesian product: every pairing of an element from the first RDD with one from the second.
*/
def Cartesian: Unit ={
val conf = new SparkConf().setAppName("cartesian").setMaster("local")
val sc = new SparkContext(conf)
val list1 = List("a", "b")
val list2 = List(0, 1, 2)
val rdd1 = sc.parallelize(list1)
val rdd2 = sc.parallelize(list2)
rdd1.cartesian(rdd2).foreach(tuple => println(tuple._1 + "->" + tuple._2))
}
/**
* Result:
* a->0
* a->1
* a->2
* b->0
* b->1
* b->2
*/
/**
* mapPartitions operator
* Fetches one whole partition at a time (e.g. one HDFS block).
* Similar to map, but map reads one RDD element per call while mapPartitions reads a whole partition per call.
*/
def MapPartitions: Unit ={
val conf = new SparkConf().setAppName("mapPartitions").setMaster("local")
val sc = new SparkContext(conf)
val list = List(1, 2, 3, 4, 5, 6)
val rdd = sc.parallelize(list,2)
// identity transformation here; the function is called once per partition and receives that partition's iterator
rdd.mapPartitions(iter => iter).foreach(nb => println(nb))
}
/**
* Result (the INFO lines mark where the second partition's task starts):
* 1
* 2
* 3
* 18/04/23 21:17:49 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, localhost, executor driver, partition 1, PROCESS_LOCAL, 7792 bytes)
* 18/04/23 21:17:49 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)
* 18/04/23 21:17:49 INFO Executor: Finished task 1.0 in stage 0.0 (TID 1). 622 bytes result sent to driver
* 4
* 5
* 6
*/
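/**
* A minimal extra sketch (not part of the original listing) where mapPartitions does real
* per-partition work: each partition's iterator is reduced to a single sum, so the result
* has one element per partition (expected: 6 and 15 for the data above).
*/
def MapPartitionsSum: Unit = {
val conf = new SparkConf().setAppName("mapPartitionsSum").setMaster("local")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
// the function is invoked once per partition and returns an iterator over the results
rdd.mapPartitions(iter => Iterator(iter.sum)).foreach(sum => println(sum))
}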
/**
* repartition operator
* Repartitions the RDD.
* Equivalent to coalesce with shuffle = true: a shuffle stage is performed and the result
* has the requested number of partitions, even when that is more than the current count.
*/
def Repartition: Unit ={
val conf = new SparkConf().setAppName("repartition").setMaster("local")
val sc = new SparkContext(conf)
val list = List(1, 2, 3, 4, 5, 6)
val rdd = sc.makeRDD(list,2)
rdd.repartition(3).foreach(nn => println(nn))
}
/**
* Result (interleaved with the INFO lines of the three output tasks):
* 3
* 5
* 18/04/24 18:43:16 INFO Executor: Finished task 0.0 in stage 1.0 (TID 2). 1052 bytes result sent to driver
* 18/04/24 18:43:16 INFO TaskSetManager: Starting task 1.0 in stage 1.0 (TID 3, localhost, executor driver, partition 1, ANY, 7925 bytes)
* 18/04/24 18:43:16 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 2) in 431 ms on localhost (executor driver) (1/3)
* 18/04/24 18:43:16 INFO Executor: Running task 1.0 in stage 1.0 (TID 3)
* 18/04/24 18:43:16 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
* 18/04/24 18:43:16 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 9 ms
* 18/04/24 18:43:16 INFO Executor: Finished task 1.0 in stage 1.0 (TID 3). 966 bytes result sent to driver
* 1
* 6
* 18/04/24 18:43:16 INFO TaskSetManager: Starting task 2.0 in stage 1.0 (TID 4, localhost, executor driver, partition 2, ANY, 7925 bytes)
* 18/04/24 18:43:16 INFO Executor: Running task 2.0 in stage 1.0 (TID 4)
* 18/04/24 18:43:16 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 2 blocks
* 18/04/24 18:43:16 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
* 18/04/24 18:43:16 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 3) in 332 ms on localhost (executor driver) (2/3)
* 2
* 4
*/
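/**
* A minimal extra sketch (not part of the original listing) that checks the partition count
* directly with getNumPartitions instead of reading it off the task logs.
*/
def RepartitionCount: Unit = {
val conf = new SparkConf().setAppName("repartitionCount").setMaster("local")
val sc = new SparkContext(conf)
val rdd = sc.makeRDD(List(1, 2, 3, 4, 5, 6), 2)
println(rdd.getNumPartitions)                // 2
println(rdd.repartition(3).getNumPartitions) // 3
}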
/**
* aggregateByKey operator
* Used here to implement word count; it combines values inside each partition first,
* which makes the shuffle cheaper.
* (zeroValue: U)        initial value of the accumulator
* (seqOp: (U, V) => U,  merges a value into the accumulator within a partition
*  combOp: (U, U) => U) merges accumulators from different partitions
*/
def AggregateByKey: Unit ={
val conf = new SparkConf().setAppName("aggregateByKey").setMaster("local")
val sc = new SparkContext(conf)
val list = List("you,jump", "i,jump")
val rdd = sc.parallelize(list)
rdd.flatMap(_.split(","))
.map((_,1))
.aggregateByKey(0)(_ + _ , _ + _)
.foreach(tuple => println(tuple._1 + " -> " + tuple._2))
}
/**
* Result:
* you -> 1
* jump -> 2
* i -> 1
*/
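/**
* A minimal extra sketch (not part of the original listing) where seqOp and combOp differ,
* to make the two roles described above visible: within each partition the maximum value
* per key is kept, and across partitions those per-partition maxima are summed.
*/
def AggregateByKeyMaxThenSum: Unit = {
val conf = new SparkConf().setAppName("aggregateByKeyMaxThenSum").setMaster("local")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List(("a", 1), ("a", 3), ("b", 2), ("a", 5), ("b", 4)), 2)
rdd.aggregateByKey(0)(
(acc, v) => math.max(acc, v), // seqOp: maximum within each partition
(u1, u2) => u1 + u2           // combOp: sum of the partition maxima
).foreach(tuple => println(tuple._1 + " -> " + tuple._2))
}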
/**
* coalesce operator
* Merges partitions, typically reducing a large number of input partitions (and thus tasks)
* as an optimization.
* 1. If the requested partition count P > current partition count S:
*    with shuffle = false, coalesce has no effect and the RDD keeps S partitions;
*    with shuffle = true, a shuffle is performed and the result has P partitions (this is repartition).
* 2. If the requested partition count P < current partition count S:
*    when P and S are close, set shuffle = false and partitions are simply merged proportionally;
*    when P is much smaller than S, set shuffle = true to preserve parallelism in the parent stages.
*/
def Coalesce: Unit ={
val conf = new SparkConf().setAppName("coalesce").setMaster("local")
val sc = new SparkContext(conf)
val list = List(1, 2, 3, 4, 5, 6)
sc.makeRDD(list)
.coalesce(1)
.foreach(println(_))
}
/** Result:
* 1
* 2
* 3
* 4
* 5
* 6
*/
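/**
* A minimal extra sketch (not part of the original listing) of the two rules above, checked
* via getNumPartitions: without a shuffle, coalesce cannot increase the partition count.
*/
def CoalesceCount: Unit = {
val conf = new SparkConf().setAppName("coalesceCount").setMaster("local")
val sc = new SparkContext(conf)
val rdd = sc.makeRDD(List(1, 2, 3, 4, 5, 6), 4)
println(rdd.coalesce(2).getNumPartitions)                  // 2: fewer partitions, no shuffle needed
println(rdd.coalesce(6).getNumPartitions)                  // 4: cannot grow without shuffle
println(rdd.coalesce(6, shuffle = true).getNumPartitions)  // 6: with shuffle, same as repartition(6)
}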
/**
* mapPartitionsWithIndex operator
* Like mapPartitions, with an extra index parameter giving the partition number.
* Here the elements are read partition by partition and each is tagged with its partition number.
*/
def MapPartitionsWithIndex: Unit ={
val conf = new SparkConf().setAppName("mapPartitionsWithIndex").setMaster("local")
val sc = new SparkContext(conf)
val list = List(1, 2, 3, 4, 5, 6, 7, 8)
val rdd = sc.makeRDD(list,2)
rdd.mapPartitionsWithIndex((index, iter) => iter.map(value => (index, value)))
.foreach(println(_))
}
/**
* Result (the INFO line marks where the second partition's task starts):
* (0,1)
* (0,2)
* (0,3)
* (0,4)
* 18/04/23 22:41:29 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 751 bytes result sent to driver
* (1,5)
* (1,6)
* (1,7)
* (1,8)
*/
/**
* cogroup operator
* "Co-group": groups two (or three) RDDs by key, pairing the collection of values
* from each RDD for every key.
* The number of partitions can be specified as an optional argument.
*/
def Cogroup: Unit ={
val conf = new SparkConf().setAppName("cogroup").setMaster("local")
val sc = new SparkContext(conf)
val list1 = List((1, "东方不败"),(2, "林平之"),(3, "岳不群"),(1, "东方不败"),(2, "林平之"),(3, "岳不群"))
val list2 = List((1, 90),(2, 91),(3, 89),(1, 98),(2, 78),(3, 67))
val rdd1 = sc.makeRDD(list1)
val rdd2 = sc.makeRDD(list2)
val rdd3 = rdd1.cogroup(rdd2)
rdd3.foreach(tuple =>println("ID:" + tuple._1 + " Name: "+ tuple._2._1 + " Scores: "+ tuple._2._2))
}
/**
* Result:
* ID:1 Name: CompactBuffer(东方不败, 东方不败) Scores: CompactBuffer(90, 98)
* ID:3 Name: CompactBuffer(岳不群, 岳不群) Scores: CompactBuffer(89, 67)
* ID:2 Name: CompactBuffer(林平之, 林平之) Scores: CompactBuffer(91, 78)
*/
/**
* repartitionAndSortWithinPartitions operator (a tuning alternative to repartition + sortByKey)
* Repartitions the RDD and sorts records by key within each partition.
* Usage: a Partitioner must be supplied; implement its two methods for a custom partitioner,
* or use an existing one such as new HashPartitioner(2)
* or new RangePartitioner(a, b), where a is the partition count and b is the RDD whose keys define the ranges.
*/
def RepartitionAndSortWithinPartitions: Unit ={
val conf = new SparkConf().setAppName("repartitionAndSortWithinPartitions").setMaster("local")
val sc = new SparkContext(conf)
val list = List(1, 2, 3, 4, 5, 6, 7, 8)
val rdd = sc.parallelize(list)
.map(a=>(a,a)).repartitionAndSortWithinPartitions(new Partitioner() {
override def numPartitions: Int = 2
override def getPartition(key: Any): Int = {
val number = Integer.valueOf(key.toString)
if (number % 2 == 0) 0 else 1
}
}).mapPartitionsWithIndex((index, iter) => iter.map(tuple => (index, tuple._1)))
.foreach(println(_))
}
/**
* Result (the INFO lines mark where the second partition's task starts):
* (0,2)
* (0,4)
* (0,6)
* (0,8)
* 18/04/24 08:33:00 INFO Executor: Finished task 0.0 in stage 1.0 (TID 1). 1138 bytes result sent to driver
* 18/04/24 08:33:00 INFO TaskSetManager: Starting task 1.0 in stage 1.0 (TID 2, localhost, executor driver, partition 1, ANY, 7649 bytes)
* 18/04/24 08:33:00 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 107 ms on localhost (executor driver) (1/2)
* 18/04/24 08:33:00 INFO Executor: Running task 1.0 in stage 1.0 (TID 2)
* 18/04/24 08:33:00 INFO ShuffleBlockFetcherIterator: Getting 1 non-empty blocks out of 1 blocks
* 18/04/24 08:33:00 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
* (1,1)
* (1,3)
* (1,5)
* (1,7)
*/
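/**
* A minimal extra sketch (not part of the original listing) of the built-in partitioner
* mentioned above: new HashPartitioner(2) distributes keys by hashCode modulo 2 and each
* partition is sorted by key, so no custom Partitioner needs to be written.
*/
def RepartitionAndSortWithHashPartitioner: Unit = {
val conf = new SparkConf().setAppName("repartitionAndSortWithHashPartitioner").setMaster("local")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List(3, 1, 8, 2, 5, 4, 7, 6)).map(a => (a, a))
rdd.repartitionAndSortWithinPartitions(new HashPartitioner(2))
.mapPartitionsWithIndex((index, iter) => iter.map(tuple => (index, tuple._1)))
.foreach(println(_))
}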
/**
* sample operator
* Draws a sample from the RDD; three parameters:
* withReplacement: Boolean - whether to sample with replacement
* fraction: Double         - expected fraction of the data to sample
* seed: Long               - random seed (optional)
*/
def Sample: Unit ={
val conf = new SparkConf().setAppName("sample").setMaster("local")
val sc = new SparkContext(conf)
val list = List(1, 2, 3, 4, 5, 6, 7,9,10)
sc.makeRDD(list).sample(false,0.5)
.foreach(println(_))
}
/**
* Result (random):
* 1
* 2
* 3
* 4
* 6
* 7
*/
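/**
* A minimal extra sketch (not part of the original listing) of the other two parameters
* described above: sampling with replacement and with a fixed seed, so the same sample is
* drawn on every run.
*/
def SampleWithSeed: Unit = {
val conf = new SparkConf().setAppName("sampleWithSeed").setMaster("local")
val sc = new SparkContext(conf)
val rdd = sc.makeRDD(List(1, 2, 3, 4, 5, 6, 7, 9, 10))
// withReplacement = true, fraction = 0.5, seed = 42: elements may appear more than once
rdd.sample(true, 0.5, 42).foreach(println(_))
}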
def main(args: Array[String]): Unit = {
// Map
// FlatMap
// Filter
// GroupBykey
// ReduceBykey
// SortBykey
// Join
// Union
// Intersection
// Distinct
// Cartesian
// MapPartitions
Repartition
// AggregateByKey
// Coalesce
// MapPartitionsWithIndex
// Cogroup
// RepartitionAndSortWithinPartitions
// Sample
}
}
package sparkcore.day2.lesson01;
import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.RangePartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* Java version of the transformation operators demonstrated in the Scala object above.
*/
public class TransformationOperator {
public static SparkConf conf = new SparkConf().setMaster("local").setAppName("test");
public static JavaSparkContext sc = new JavaSparkContext(conf);
public static void map(){
final List<String> list = Arrays.asList("张无忌", "赵敏", "周芷若");
final JavaRDD<String> rdd = sc.parallelize(list);
final JavaRDD<String> nameRDD = rdd.map(new Function<String, String>() {
@Override
public String call(String name) throws Exception {
return "Hello " + name;
}
});
nameRDD.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
println(s);
}
});
}
public static void flatMap(){
final List<String> list = Arrays.asList("张无忌 赵敏", "宋青书 周芷若");
final JavaRDD<String> rdd = sc.parallelize(list);
rdd.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String names) throws Exception {
return Arrays.asList(names.split(" ")).iterator();
}
}).map(new Function<String, String>() {
@Override
public String call(String name) throws Exception {
return "Hello "+ name;
}
}).foreach(new VoidFunction<String>() {
@Override
public void call(String line) throws Exception {
println(line);
}
});
}
/**
* Filter the even numbers out of the RDD.
*/
public static void filter(){
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7);
final JavaRDD<Integer> rdd = sc.parallelize(list);
final JavaRDD<Integer> filterRDD = rdd.filter(new Function<Integer, Boolean>() {
// returning true means the value is kept
@Override
public Boolean call(Integer number) throws Exception {
return number % 2 == 0;
}
});
filterRDD.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
println(integer + "");
}
});
}
/**
* groupByKey: group the values of a pair RDD by key.
*/
public static void groupBykey(){
final List<Tuple2<String, String>> list = Arrays.asList(
new Tuple2<String, String>("峨眉", "周芷若"),
new Tuple2<String, String>("武当", "宋青书"),
new Tuple2<String, String>("峨眉", "灭绝师太"),
new Tuple2<String, String>("武当", "张三丰")
);
final JavaPairRDD<String, String> rdd = sc.parallelizePairs(list);
final JavaPairRDD<String, Iterable<String>> groupBykeyRDD = rdd.groupByKey();
groupBykeyRDD.foreach(new VoidFunction<Tuple2<String, Iterable<String>>>() {
@Override
public void call(Tuple2<String, Iterable<String>> tuple) throws Exception {
final String menpai = tuple._1;
final Iterator<String> iterator = tuple._2.iterator();
println(menpai+ " ");
while (iterator.hasNext()){
final String name = iterator.next();
System.out.print(name);
}
println("");
}
});
}
/**
* reduceByKey: merge the values for each key with the given function (here, a sum).
*/
public static void reduceBykey(){
final List<Tuple2<String, Integer>> list = Arrays.asList(
new Tuple2<String, Integer>("峨眉", 40),
new Tuple2<String, Integer>("武当", 30),
new Tuple2<String, Integer>("峨眉",60),
new Tuple2<String, Integer>("武当",99)
);
//reduceBykey
final JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(list);
rdd.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1+v2;
}
}).foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> tuple) throws Exception {
println( tuple._1 + " "+ tuple._2);
}
});
}
public static void sortBykey(){
final List<Tuple2<Integer, String>> list = Arrays.asList(
new Tuple2<Integer, String>(98,"东方不败"),
new Tuple2<Integer, String>(80,"岳不群"),
new Tuple2<Integer, String>(85,"令狐冲"),
new Tuple2<Integer, String>(83,"任我行")
);
final JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(list);
rdd.sortByKey(false)
.foreach(new VoidFunction<Tuple2<Integer, String>>() {
@Override
public void call(Tuple2<Integer, String> tuple) throws Exception {
println(tuple._1 + " -> "+ tuple._2);
}
});
}
public static void join(){
final List<Tuple2<Integer, String>> names = Arrays.asList(
new Tuple2<Integer, String>(1, "东方不败"),
new Tuple2<Integer, String>(2, "令狐冲"),
new Tuple2<Integer, String>(3, "林平之")
);
final List<Tuple2<Integer, Integer>> scores = Arrays.asList(
new Tuple2<Integer, Integer>(1, 99),
new Tuple2<Integer, Integer>(2, 98),
new Tuple2<Integer, Integer>(3, 97)
);
final JavaPairRDD<Integer, String> nemesrdd = sc.parallelizePairs(names);
final JavaPairRDD<Integer, Integer> scoresrdd = sc.parallelizePairs(scores);
/**
* <Integer,            student ID
*  Tuple2<String,      name
*         Integer>>    score
*/
final JavaPairRDD<Integer, Tuple2<String, Integer>> joinRDD = nemesrdd.join(scoresrdd);
// final JavaPairRDD<Integer, Tuple2<Integer, String>> join = scoresrdd.join(nemesrdd);
joinRDD.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
@Override
public void call(Tuple2<Integer, Tuple2<String, Integer>> tuple) throws Exception {
println("学号:" + tuple._1 + " 名字:"+tuple._2._1 + " 分数:"+tuple._2._2);
}
});
}
public static void union(){
final List<Integer> list1 = Arrays.asList(1, 2, 3, 4);
final List<Integer> list2 = Arrays.asList(3, 4, 5, 6);
final JavaRDD<Integer> rdd1 = sc.parallelize(list1);
final JavaRDD<Integer> rdd2 = sc.parallelize(list2);
rdd1.union(rdd2)
.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer number) throws Exception {
println(number + "");
}
});
}
/**
* Intersection of two RDDs.
*/
public static void intersection(){
final List<Integer> list1 = Arrays.asList(1, 2, 3, 4);
final List<Integer> list2 = Arrays.asList(3, 4, 5, 6);
final JavaRDD<Integer> rdd1 = sc.parallelize(list1);
final JavaRDD<Integer> rdd2 = sc.parallelize(list2);
rdd1.intersection(rdd2)
.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer number) throws Exception {
println(number + "");
}
});
}
public static void distinct(){
final List<Integer> list1 = Arrays.asList(1, 2, 3,3,4,4);
final JavaRDD<Integer> rdd1 = sc.parallelize(list1);
rdd1.distinct()
.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer number) throws Exception {
println(number + " ");
}
});
}
/**
* Cartesian product
* A = {a, b}
* B = {0, 1, 2}
* A x B:
* a0, a1, a2
* b0, b1, b2
*/
public static void cartesian(){
final List<String> A = Arrays.asList("a", "b");
final List<Integer> B = Arrays.asList(0, 1, 2);
final JavaRDD<String> rddA = sc.parallelize(A);
final JavaRDD<Integer> rddB = sc.parallelize(B);
rddA.cartesian(rddB)
.foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> tuple) throws Exception {
println(tuple._1 + "->"+ tuple._2);
}
});
}
/**
* map:
* processes one record at a time (file systems, databases, etc.)
* mapPartitions:
* fetches and processes one partition at a time (e.g. an HDFS block)
* Normally mapPartitions is the higher-performance operator,
* because handling a whole partition per call reduces the number of trips to the data source.
*
* However, if the partitioning is unreasonable, a single partition may end up holding too much data.
*/
public static void mapPartitions(){
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6);
//the second argument gives this rdd two partitions
final JavaRDD<Integer> rdd = sc.parallelize(list, 2);
rdd.mapPartitions(new FlatMapFunction<Iterator<Integer>, String>() {
//each call processes the data of one partition
@Override
public Iterator<String> call(Iterator<Integer> iterator) throws Exception {
List<String> list=new ArrayList<String> ();
while(iterator.hasNext()){
list.add("hello-" + iterator.next());
}
return list.iterator();
}
}).foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
println(s);
}
});
}
/**
* Repartition the RDD.
* On HDFS, hello.txt with 2 blocks (not counting replicas)
* -> 2 blocks -> 2 partitions -> when the Spark job runs, one task is launched per partition.
*
* Problem solved: too few partitions to begin with -> increase the partition count.
*/
public static void repartition(){
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6);
final JavaRDD<Integer> rdd = sc.parallelize(list, 1);
// repartition(numPartitions) is equivalent to coalesce(numPartitions, shuffle = true)
rdd.repartition(2)
.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer number) throws Exception {
println(number+ "");
}
});
}
/**
* Word count implemented with aggregateByKey.
*/
public static void aggregateByKey(){
final List<String> list = Arrays.asList("you,jump", "i,jump");
final JavaRDD<String> rdd = sc.parallelize(list);
rdd.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String line) throws Exception {
return Arrays.asList(line.split(",")).iterator();
}
}).mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String word) throws Exception {
return new Tuple2<String,Integer>(word,1);
}
}).aggregateByKey(0, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;//partial (within-partition) aggregation
}
}, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;//global (cross-partition) aggregation
}
}
).foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> tuple) throws Exception {
println(tuple._1 + " ->"+ tuple._2);
}
});
}
/**
* Reduce the number of partitions (many -> few).
*/
public static void coalesce(){
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6);
final JavaRDD<Integer> rdd = sc.parallelize(list, 3);
rdd.coalesce(1)
.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
println(integer + "");
}
});
}
/**
* map: fetches and processes one record at a time
* mapPartitions: fetches and processes one partition at a time
* mapPartitionsWithIndex: fetches and processes one partition at a time, and also exposes the index of that partition
*/
public static void mapPartitionsWithIndex(){
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8);
final JavaRDD<Integer> rdd = sc.parallelize(list, 2);//2 partitions; partitioning can also be controlled with HashPartitioner, RangePartitioner, or a custom Partitioner
rdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
@Override
public Iterator<String> call(Integer index, Iterator<Integer> iterator) throws Exception {
final ArrayList<String> list = new ArrayList<>();
while (iterator.hasNext()){
list.add(index+"_"+ iterator.next());
}
return list.iterator();
}
},true)
.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
println(s);
}
});
}
/**
* When called on datasets of type (K, V) and (K, W),
* returns a dataset of (K, (Iterable<V>, Iterable<W>)) tuples.
*/
public static void cogroup(){
final List<Tuple2<Integer, String>> list1 = Arrays.asList(
new Tuple2<Integer, String>(1, "东方不败"),
new Tuple2<Integer, String>(2, "林平之"),
new Tuple2<Integer, String>(3, "岳不群"),
new Tuple2<Integer, String>(1, "东方不败"),
new Tuple2<Integer, String>(2, "林平之"),
new Tuple2<Integer, String>(3, "岳不群")
);
final List<Tuple2<Integer, Integer>> list2 = Arrays.asList(
new Tuple2<Integer, Integer>(1, 90),
new Tuple2<Integer, Integer>(2, 91),
new Tuple2<Integer, Integer>(3, 89),
new Tuple2<Integer, Integer>(1, 98),
new Tuple2<Integer, Integer>(2, 78),
new Tuple2<Integer, Integer>(3, 67)
);
final JavaPairRDD<Integer, String> rdd1 = sc.parallelizePairs(list1);
final JavaPairRDD<Integer, Integer> rdd2 = sc.parallelizePairs(list2);
final JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> rdd3 = rdd1.cogroup(rdd2);
rdd3.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {
@Override
public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> tuple) throws Exception {
final Integer id = tuple._1;
final Iterable<String> names = tuple._2._1;
final Iterable<Integer> scores = tuple._2._2;
println("ID:"+id + " Name: "+names+ " Scores: "+ scores);
}
});
}
/**
* repartitionAndSortWithinPartitions (a tuning alternative to repartition + sortByKey):
* repartition the RDD and sort by key within each partition.
* Here the single input partition is redistributed into two.
*/
public static void repartitionAndSortWithinPartitions(){
final List<Integer> list = Arrays.asList(1, 2, 11, 3, 12, 4, 5);
final JavaRDD<Integer> rdd = sc.parallelize(list, 1);
final JavaPairRDD<Integer, Integer> pairRDD = rdd.mapToPair(new PairFunction<Integer, Integer, Integer>() {
@Override
public Tuple2<Integer, Integer> call(Integer number) throws Exception {
return new Tuple2<>(number, number);
}
});
//instead of the custom Partitioner below, new HashPartitioner(2) or a RangePartitioner could be used
pairRDD.repartitionAndSortWithinPartitions(new Partitioner() {
@Override
public int numPartitions() {
return 2;
}
@Override
public int getPartition(Object key) {
final Integer number = Integer.valueOf(key.toString());
if(number % 2 == 0){
return 0;
}else{
return 1;
}
}
}).mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<Integer, Integer>>,
Iterator<String>>() {
@Override
public Iterator<String> call(Integer index, Iterator<Tuple2<Integer, Integer>> iterator) throws Exception {
final ArrayList<String> list = new ArrayList<>();
while(iterator.hasNext()){
list.add(index + "_"+ iterator.next());
}
return list.iterator();
}
},false)
.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
println(s);
}
});
}
/**
* sample: sampling with or without replacement.
*/
public static void sample(){
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7,9,10);
final JavaRDD<Integer> rdd = sc.parallelize(list);
/**
* withReplacement: Boolean
*   true: sample with replacement
*   false: sample without replacement
* fraction: Double
*   the probability that each element of the RDD is selected
* seed: Long
*   random seed
*/
final JavaRDD<Integer> rdd2 = rdd.sample(false, 0.5);
rdd2.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
println(integer + "");
}
});
}
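/**
* pipe: streams the elements of each partition through an external command
* (each element is written to the process's stdin as one line, and each line of its
* stdout becomes an element of the resulting RDD). The call below is left commented
* out because it requires the referenced shell script to exist on every worker.
*/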
public static void pipe(){
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7,9,10);
final JavaRDD<Integer> rdd = sc.parallelize(list);
// final JavaRDD<String> pipe = rdd.pipe("sh wordcouont.sh");
}
public static void println(String str){
System.out.println(str);
}
public static void main(String[] args) {
//map();
// filter();
// flatMap();
// groupBykey();
// reduceBykey();
// sortBykey();
// join();
// union();
// intersection();
// cartesian();
// mapPartitions();
// repartition();
//coalesce();
// aggregateByKey();
// mapPartitionsWithIndex();
// cogroup();
// repartitionAndSortWithinPartitions();
// sample();
}
}