Spark Custom Accumulators Explained

  1. Implementing a custom accumulator in Scala
package examples

import org.apache.spark.util.AccumulatorV2
import org.apache.spark.{SparkConf, SparkContext}

/**
* @Author yqq
* @Date 2021/12/12 14:36
* @Version 1.0
*/
case class Info(var personCount: Int, var ageCount: Int)

class MyAccumulator extends AccumulatorV2[Info, Info] {
  // Start at the zero value; must be consistent with isZero and reset
  private var info = Info(0, 0)
  // Whether the accumulator in a partition is still at its zero value; must match what reset() sets
  override def isZero: Boolean = info.personCount == 0 && info.ageCount == 0
  // Create a copy of this accumulator
  override def copy(): AccumulatorV2[Info, Info] = {
    val myAccumulator = new MyAccumulator
    // Copy the values rather than the reference, because Info is mutable
    myAccumulator.info = Info(this.info.personCount, this.info.ageCount)
    myAccumulator
  }
  // Reset the accumulator in each RDD partition back to its zero value
  override def reset(): Unit = info = Info(0, 0)
  // Accumulate one input record within an RDD partition
  override def add(v: Info): Unit = {
    info.personCount += v.personCount
    info.ageCount += v.ageCount
  }
  // Merge a partition's result into the accumulator on the driver side
  override def merge(other: AccumulatorV2[Info, Info]): Unit = {
    val ac = other.asInstanceOf[MyAccumulator]
    info.personCount += ac.info.personCount
    info.ageCount += ac.info.ageCount
  }
  // The final value the accumulator reports back
  override def value: Info = info
}
object SelfDefinedAccumulatorTest1 {
  def main(args: Array[String]): Unit = {
    val context = new SparkContext(
      new SparkConf()
        .setMaster("local")
        .setAppName("test111")
    )
    context.setLogLevel("ERROR")
    val acc = new MyAccumulator
    context.register(acc)
    context.parallelize(Array[String]("A 1", "B 2", "C 3", "D 4", "E 5", "F 6", "H 7"))
      .map(e => {
        val age = e.split(" ")(1).toInt
        acc.add(Info(1, age))
        e
      }).collect()
    println(s"acc value = ${acc.value}")
  }
}
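With the seven sample records above, a single local run should print acc value = Info(7,28): 7 people with a total age of 28. Note that the accumulator here is updated inside map, which is a transformation; Spark only guarantees exactly-once accumulator updates for actions, so updates made in transformations may be applied more than once if a task or stage is re-executed. Below is a minimal sketch of the safer pattern, assuming the MyAccumulator/Info definitions and the SparkContext context from the listing above (the accumulator name personAgeAcc is illustrative): register the accumulator under a name so it is visible in the Spark UI, and update it from foreach, an action.

// Sketch: update the accumulator from an action (foreach) rather than a transformation
val acc2 = new MyAccumulator
context.register(acc2, "personAgeAcc") // named accumulators show up in the Spark UI
context.parallelize(Array[String]("A 1", "B 2", "C 3", "D 4", "E 5", "F 6", "H 7"))
  .foreach(e => {
    val age = e.split(" ")(1).toInt
    acc2.add(Info(1, age))
  })
println(s"acc2 value = ${acc2.value}")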

2. Implementing a custom accumulator in Java

package examples;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.spark.util.AccumulatorV2;

import java.io.Serializable;

/**
* @Author yqq
* @Date 2021/12/12 13:44
* @Version 1.0
*/
@Data
@AllArgsConstructor
@NoArgsConstructor
class Person implements Serializable {
    private int personCount;
    private int ageCount;
}
public class SelfAccumulator extends AccumulatorV2<Person, Person> {
    private Person p1 = new Person(0, 0);

    /**
     * Whether the accumulator in each RDD partition is still at its zero value;
     * must be consistent with the value set in reset().
     */
    @Override
    public boolean isZero() {
        return p1.getAgeCount() == 0 && p1.getPersonCount() == 0;
    }

    /**
     * Create and return a copy of this accumulator.
     */
    @Override
    public AccumulatorV2<Person, Person> copy() {
        SelfAccumulator accumulatorV2 = new SelfAccumulator();
        // Copy the values rather than the reference, because Person is mutable
        accumulatorV2.p1 = new Person(this.p1.getPersonCount(), this.p1.getAgeCount());
        return accumulatorV2;
    }

    /**
     * Reset the accumulator in each RDD partition back to its zero value.
     */
    @Override
    public void reset() {
        p1 = new Person(0, 0);
    }

    /**
     * Accumulate one input record within an RDD partition.
     */
    @Override
    public void add(Person v) {
        p1.setPersonCount(p1.getPersonCount() + v.getPersonCount());
        p1.setAgeCount(p1.getAgeCount() + v.getAgeCount());
    }

    /**
     * Merge a partition's result into this accumulator.
     */
    @Override
    public void merge(AccumulatorV2<Person, Person> other) {
        SelfAccumulator sa = (SelfAccumulator) other;
        p1.setPersonCount(p1.getPersonCount() + sa.p1.getPersonCount());
        p1.setAgeCount(p1.getAgeCount() + sa.p1.getAgeCount());
    }

    /**
     * The final value the accumulator reports back.
     */
    @Override
    public Person value() {
        return this.p1;
    }
}

// Test
package examples;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;

/**
 * @Author yqq
 * @Date 2021/12/12 13:34
 * @Version 1.0
 */
public class SelfDefinedAccumulatorTest {
    public static void main(String[] args) {
        JavaSparkContext context = new JavaSparkContext(
                new SparkConf()
                        .setMaster("local")
                        .setAppName("test")
        );
        context.setLogLevel("ERROR");
        SelfAccumulator sa = new SelfAccumulator();
        context.sc().register(sa);
        // Use the custom accumulator to count the number of people and the total age
        context.parallelize(Arrays.asList(
                "A 1", "B 2", "C 3", "D 4", "E 5", "F 6", "H 7"
        ), 3).map(e -> {
            Integer ageValue = Integer.valueOf(e.split(" ")[1]);
            sa.add(new Person(1, ageValue));
            return e;
        }).count();
        System.out.println("sa value = " + sa.value());
    }
}
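Run locally, this job should likewise report 7 people and a total age of 28; with Lombok's @Data toString the last line prints roughly sa value = Person(personCount=7, ageCount=28). Note that count() is the action that actually triggers the lazy map and, with it, the accumulator updates; without an action the accumulator stays at its zero value. Also, for plain sums like these two counters a custom AccumulatorV2 is not strictly necessary: Spark's built-in long accumulators cover the same need. A brief sketch (in Scala, matching section 1, and assuming the SparkContext context from that section; the names personCount and ageCount are illustrative):

// Sketch: the same two totals with built-in long accumulators instead of a custom class
val personCount = context.longAccumulator("personCount")
val ageCount = context.longAccumulator("ageCount")
context.parallelize(Seq("A 1", "B 2", "C 3", "D 4", "E 5", "F 6", "H 7"))
  .foreach(e => {
    personCount.add(1)
    ageCount.add(e.split(" ")(1).toLong)
  })
println(s"personCount = ${personCount.value}, ageCount = ${ageCount.value}")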
