coalesce is commonly used to reduce the number of partitions. Its second parameter controls whether a shuffle occurs while repartitioning: true produces a shuffle, false does not, and the default is false. If coalesce is given a partition count higher than the source RDD's, setting the second parameter to false has no effect, while setting it to true gives the same result as repartition. That is, repartition(numPartitions) = coalesce(numPartitions, true).
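A quick way to verify this rule is to compare partition counts directly. Below is a minimal sketch, assuming a SparkContext named context constructed as in the examples that follow:

val rdd = context.parallelize(1 to 12, 3)
println(rdd.coalesce(4).getNumPartitions)       // 3: shuffle=false cannot grow the partition count
println(rdd.coalesce(4, true).getNumPartitions) // 4: shuffle=true behaves like repartition
println(rdd.repartition(4).getNumPartitions)    // 4: repartition(n) = coalesce(n, true)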

  1. Java
package transformations;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * @Author yqq
 * @Date 2021/12/09 23:16
 * @Version 1.0
 */
public class CoalesceTest {
    public static void main(String[] args) {
        JavaSparkContext context = new JavaSparkContext(
                new SparkConf()
                        .setMaster("local")
                        .setAppName("coalesce")
        );
        context.setLogLevel("Error");
        context.parallelize(Arrays.asList(
                "yqq1", "yqq2", "yqq3", "yqq4",
                "yqq5", "yqq6", "yqq7", "yqq8",
                "yqq9", "yqq10", "yqq11", "yqq12"
        ), 3).mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer v1, Iterator<String> v2) throws Exception {
                // Buffer records in a list created per partition rather than
                // one shared through the closure.
                List<String> list = new ArrayList<>();
                while (v2.hasNext())
                    list.add("original rdd partition index = 【" + v1 + "】,value = 【" + v2.next() + "】");
                return list.iterator();
            }
        }, false).coalesce(4, true).mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer v1, Iterator<String> v2) throws Exception {
                List<String> list1 = new ArrayList<>();
                while (v2.hasNext())
                    list1.add("coalesce rdd partition index = 【" + v1 + "】,value = 【" + v2.next() + "】");
                return list1.iterator();
            }
        }, false).collect().forEach(System.out::println);
    }
}

[Output screenshot: Spark Transformation operator -> coalesce_java]
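In the output, each line carries two labels: the outer index shows where the record landed after coalesce(4, true) (partitions 0-3), while the nested value still records which of the 3 original partitions it came from, so the shuffle's redistribution is directly visible.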
  2. Scala

package transformation

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ListBuffer

/**
 * @Author yqq
 * @Date 2021/12/09 23:40
 * @Version 1.0
 */
object CoalesceTest {
  def main(args: Array[String]): Unit = {
    val context = new SparkContext(
      new SparkConf()
        .setMaster("local")
        .setAppName("coalesce")
    )
    context.setLogLevel("Error")
    context.parallelize(Array[String](
      "yqq1", "yqq2", "yqq3", "yqq4",
      "yqq5", "yqq6", "yqq7", "yqq8",
      "yqq9", "yqq10", "yqq11", "yqq12"
    ), 3).coalesce(5).mapPartitionsWithIndex((index, iter) => {
      // coalesce(5) asks for more partitions than the source RDD has, and the
      // shuffle flag defaults to false, so the RDD keeps its 3 partitions.
      val buffer = new ListBuffer[String]()
      while (iter.hasNext)
        buffer.append(s"partition index = $index,value = ${iter.next()}")
      buffer.iterator
    }).collect().foreach(println)
  }
}

[Output screenshot: Spark Transformation operator -> coalesce_java_02 (Scala example)]
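Both examples above ask for more partitions than the source RDD has. The more common direction, shrinking the partition count, is the default use of coalesce and introduces no shuffle stage at all. A minimal sketch, reusing the SparkContext from the Scala example:

// Merging 4 partitions down to 2 happens through narrow dependencies;
// no shuffle stage is introduced because the shuffle flag defaults to false.
val shrunk = context.parallelize(1 to 12, 4).coalesce(2)
println(shrunk.getNumPartitions) // 2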