This class has three important methods:
1: getDependencies: computes the dependency on each parent RDD. It walks over rdds and checks each parent's partitioner: if rdd.partitioner == Some(part), i.e. the parent is already partitioned by this RDD's partitioner, it returns a new OneToOneDependency (narrow dependency); otherwise it returns a new ShuffleDependency (wide dependency).
2: getPartitions: builds the partition array. For every partition it records one entry per parent dependency: None for a ShuffleDependency, or Some(new NarrowCoGroupSplitDep(...)) pointing at the parent's partition for a narrow dependency.
3: compute: this method ultimately returns Iterator[(K, Array[Iterable[_]])], i.e. for every key K all of its values. K is the record key, and the value is an array with one Iterable per parent RDD; when cogrouping two RDDs the array holds two Iterables, the first with the values from the first RDD and the second with the values from the second RDD (see the usage sketch right after this list).
3.1 For each dependency, compute checks whether it is narrow or wide: for a narrow dependency it reads directly from the parent RDD's iterator; for a ShuffleDependency it reads the data through a ShuffleReader obtained from the ShuffleManager.
3.2 To aggregate the records, compute calls createExternalMap, which returns an ExternalAppendOnlyMap. The createExternalMap method works as follows:
3.2.1: It defines three functions that perform the per-key aggregation. The first, createCombiner, runs when the first record for a key arrives: it creates an array with Array.fill(numRdds)(new CoGroup), where every element is a CoGroup, which is really a CompactBuffer.
3.2.2 The second function, mergeValue, runs for each later record with the same key (createCombiner is no longer called); it appends the value to the CoGroup at the index of the parent RDD the value came from.
3.2.3 The third function, mergeCombiners, merges combiners built separately for the same key (e.g. data coming from different partitions) into one.
3.2.4 Finally these three functions are passed as arguments to instantiate the ExternalAppendOnlyMap (a stand-alone sketch of the three functions follows the class listing below).
3.2.5 Now the CompactBuffer class: it has four fields, element0, element1, otherElements and curSize. The first two elements are not stored in an array; they go into element0 and element1. Once the element count reaches three or more, a backing array is allocated (growToSize). growToSize compares the existing capacity with the required capacity; if the array has to grow, it starts from an initial capacity of 8 and keeps doubling until the required capacity is reached, then copies the old array's contents into the new array (a stripped-down sketch appears at the end of this section).
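To make the result shape concrete, here is a minimal usage sketch (the names CoGroupDemo, sc, scores and labels are assumptions for the illustration, not part of the Spark source that follows). Pre-partitioning both parents with the same HashPartitioner is exactly the case where rdd.partitioner == Some(part) holds in getDependencies, so both dependencies become one-to-one and cogroup needs no extra shuffle:
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
object CoGroupDemo {
  def main(args: Array[String]): Unit = {
    // Local context just for the illustration
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("cogroup-demo"))
    val part = new HashPartitioner(4)
    // Both parents use the same partitioner, so CoGroupedRDD adds two OneToOneDependency instances
    val scores = sc.parallelize(Seq("a" -> 1, "a" -> 2, "b" -> 3)).partitionBy(part)
    val labels = sc.parallelize(Seq("a" -> "x", "c" -> "y")).partitionBy(part)
    // cogroup builds a CoGroupedRDD internally; every key maps to one Iterable per parent RDD
    scores.cogroup(labels).collect().foreach { case (k, (vs1, vs2)) =>
      println(s"$k -> (${vs1.mkString(",")}) (${vs2.mkString(",")})")
    }
    // Output (key order may vary):
    // a -> (1,2) (x)
    // b -> (3) ()
    // c -> () (y)
    sc.stop()
  }
}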
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.rdd
import java.io.{IOException, ObjectOutputStream}
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag
import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.{CompactBuffer, ExternalAppendOnlyMap}
/**
* The references to rdd and splitIndex are transient because redundant information is stored
* in the CoGroupedRDD object. Because CoGroupedRDD is serialized separately from
* CoGroupPartition, if rdd and splitIndex aren't transient, they'll be included twice in the
* task closure.
*/
private[spark] case class NarrowCoGroupSplitDep(
@transient rdd: RDD[_],
@transient splitIndex: Int,
var split: Partition
) extends Serializable {
@throws(classOf[IOException])
private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
// Update the reference to parent split at the time of task serialization
split = rdd.partitions(splitIndex)
oos.defaultWriteObject()
}
}
/**
* Stores information about the narrow dependencies used by a CoGroupedRdd.
*
* @param narrowDeps maps to the dependencies variable in the parent RDD: for each one to one
* dependency in dependencies, narrowDeps has a NarrowCoGroupSplitDep (describing
* the partition for that dependency) at the corresponding index. The size of
* narrowDeps should always be equal to the number of parents.
*/
private[spark] class CoGroupPartition(
override val index: Int, val narrowDeps: Array[Option[NarrowCoGroupSplitDep]])
extends Partition with Serializable {
override def hashCode(): Int = index
override def equals(other: Any): Boolean = super.equals(other)
}
/**
* :: DeveloperApi ::
* An RDD that cogroups its parents. For each key k in parent RDDs, the resulting RDD contains a
* tuple with the list of values for that key.
*
* @param rdds parent RDDs.
* @param part partitioner used to partition the shuffle output
*
* @note This is an internal API. We recommend users use RDD.cogroup(...) instead of
* instantiating this directly.
*/
@DeveloperApi
class CoGroupedRDD[K: ClassTag](
@transient var rdds: Seq[RDD[_ <: Product2[K, _]]],
part: Partitioner)
extends RDD[(K, Array[Iterable[_]])](rdds.head.context, Nil) {
// For example, `(k, a) cogroup (k, b)` produces k -> Array(ArrayBuffer as, ArrayBuffer bs).
// Each ArrayBuffer is represented as a CoGroup, and the resulting Array as a CoGroupCombiner.
// CoGroupValue is the intermediate state of each value before being merged in compute.
private type CoGroup = CompactBuffer[Any]
private type CoGroupValue = (Any, Int) // Int is dependency number
private type CoGroupCombiner = Array[CoGroup]
private var serializer: Serializer = SparkEnv.get.serializer
/** Set a serializer for this RDD's shuffle, or null to use the default (spark.serializer) */
def setSerializer(serializer: Serializer): CoGroupedRDD[K] = {
this.serializer = serializer
this
}
override def getDependencies: Seq[Dependency[_]] = {
rdds.map { rdd: RDD[_] =>
// If the parent RDD uses the same partitioner, no shuffle is needed: one-to-one dependency
if (rdd.partitioner == Some(part)) {
logDebug("Adding one-to-one dependency with " + rdd)
new OneToOneDependency(rdd)
} else {
logDebug("Adding shuffle dependency with " + rdd)
new ShuffleDependency[K, Any, CoGroupCombiner](
rdd.asInstanceOf[RDD[_ <: Product2[K, _]]], part, serializer)
}
}
}
// Build the partitions of this CoGroupedRDD
override def getPartitions: Array[Partition] = {
val array = new Array[Partition](part.numPartitions)
for (i <- 0 until array.length) {
// Each CoGroupPartition will have a dependency per contributing RDD
// j is the index of the parent RDD in rdds
array(i) = new CoGroupPartition(i, rdds.zipWithIndex.map { case (rdd, j) =>
// Assume each RDD contributed a single dependency, and get it
dependencies(j) match {
// For a wide dependency (ShuffleDependency) there is no parent partition to record: use None
case s: ShuffleDependency[_, _, _] =>
None
case _ =>
// For a narrow dependency, record the parent's partition in a NarrowCoGroupSplitDep
Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)))
}
}.toArray)
}
array
}
override val partitioner: Some[Partitioner] = Some(part)
/**
 * Returns, for every key in this partition, its values wrapped in an Array with one
 * Iterable per parent RDD (two Iterables when cogrouping two RDDs), each Iterable
 * holding the values contributed by the corresponding parent.
 *
 * @param s
 * @param context
 * @return
 */
override def compute(s: Partition, context: TaskContext): Iterator[(K, Array[Iterable[_]])] = {
val split = s.asInstanceOf[CoGroupPartition]
val numRdds = dependencies.length
// A list of (rdd iterator, dependency number) pairs
// Each entry is (rdd iterator, dependency number): the iterator over a parent's data and that parent's index
val rddIterators = new ArrayBuffer[(Iterator[Product2[K, Any]], Int)]
// (dep, depNum): dep is the dependency on a parent RDD, depNum is that parent's index
for ((dep, depNum) <- dependencies.zipWithIndex) dep match {
// Narrow (one-to-one) dependency: read directly from the parent RDD
case oneToOneDependency: OneToOneDependency[Product2[K, Any]] @unchecked =>
val dependencyPartition = split.narrowDeps(depNum).get.split
// Read them from the parent RDD's iterator, no shuffle involved
val it = oneToOneDependency.rdd.iterator(dependencyPartition, context)
rddIterators += ((it, depNum))
// Wide dependency (ShuffleDependency): read the shuffled map outputs
case shuffleDependency: ShuffleDependency[_, _, _] =>
// Read map outputs of the shuffle
val metrics = context.taskMetrics().createTempShuffleReadMetrics()
// For a ShuffleDependency, fetch the data with a ShuffleReader obtained from the ShuffleManager
val it = SparkEnv.get.shuffleManager
.getReader(
shuffleDependency.shuffleHandle, split.index, split.index + 1, context, metrics)
.read()
// Store the iterator over this parent's data together with the parent's index
rddIterators += ((it, depNum))
}
// Merge the data coming from all parent RDDs; numRdds is the number of parents
val map: ExternalAppendOnlyMap[K, (Any, Int), CoGroupCombiner] = createExternalMap(numRdds)
// (it, depNum): it iterates over one parent's data, depNum is that parent's index
for ((it, depNum) <- rddIterators) {
// Wrap each value as a CoGroupValue that records which parent RDD (depNum) it came from,
// then insert all records into the external map
map.insertAll(it.map(pair => (pair._1, new CoGroupValue(pair._2, depNum))))
}
context.taskMetrics().incMemoryBytesSpilled(map.memoryBytesSpilled)
context.taskMetrics().incDiskBytesSpilled(map.diskBytesSpilled)
context.taskMetrics().incPeakExecutionMemory(map.peakMemoryUsedBytes)
new InterruptibleIterator(context,
map.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]])
}
private def createExternalMap(numRdds: Int)
: ExternalAppendOnlyMap[K, CoGroupValue, CoGroupCombiner] = {
// The input value is (data, dependencyNum): the record's value and the index of the parent RDD it came from
val createCombiner: (CoGroupValue => CoGroupCombiner) = value => {
// Create an array of length numRdds, one CoGroup per parent RDD
// new CoGroup actually instantiates a CompactBuffer
val newCombiner = Array.fill(numRdds)(new CoGroup)
// value._2 is the parent RDD's index, so newCombiner(value._2) is that parent's CoGroup
newCombiner(value._2) += value._1 // value._1 is the record's value
newCombiner
}
// Called for each later value of the same key: append it to the existing combiner
val mergeValue: (CoGroupCombiner, CoGroupValue) => CoGroupCombiner =
(combiner, value) => {
combiner(value._2) += value._1
combiner
}
// Merge two combiners for the same key, one parent RDD index at a time
val mergeCombiners: (CoGroupCombiner, CoGroupCombiner) => CoGroupCombiner =
(combiner1, combiner2) => {
var depNum = 0
while (depNum < numRdds) {
combiner1(depNum) ++= combiner2(depNum)
depNum += 1
}
combiner1
}
// K is the record key, CoGroupValue is (value, depNum), CoGroupCombiner is Array[CoGroup]
new ExternalAppendOnlyMap[K, CoGroupValue, CoGroupCombiner](
createCombiner, mergeValue, mergeCombiners)
}
override def clearDependencies(): Unit = {
super.clearDependencies()
rdds = null
}
}
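The three functions built by createExternalMap can be replayed outside Spark with plain collections. Below is a rough, self-contained sketch (CombinerSketch is a made-up name, numRdds = 2 is assumed, and ArrayBuffer stands in for the private CompactBuffer); it mirrors what ExternalAppendOnlyMap does for a single key:
import scala.collection.mutable.ArrayBuffer
object CombinerSketch {
  // Stand-ins for the private types: CoGroup is really CompactBuffer[Any]
  type CoGroup = ArrayBuffer[Any]
  type CoGroupValue = (Any, Int)      // (value, index of the parent RDD it came from)
  type CoGroupCombiner = Array[CoGroup]

  def main(args: Array[String]): Unit = {
    val numRdds = 2
    // The same three functions as in createExternalMap, written against the stand-in types
    val createCombiner: CoGroupValue => CoGroupCombiner = value => {
      val combiner = Array.fill(numRdds)(new CoGroup)   // one buffer per parent RDD
      combiner(value._2) += value._1
      combiner
    }
    val mergeValue: (CoGroupCombiner, CoGroupValue) => CoGroupCombiner = (combiner, value) => {
      combiner(value._2) += value._1
      combiner
    }
    val mergeCombiners: (CoGroupCombiner, CoGroupCombiner) => CoGroupCombiner = (c1, c2) => {
      for (i <- 0 until numRdds) c1(i) ++= c2(i)
      c1
    }

    // Values observed for one key, tagged with the parent RDD they came from
    val fromParent0: Seq[CoGroupValue] = Seq((1, 0), (2, 0))
    val fromParent1: Seq[CoGroupValue] = Seq(("x", 1))

    // The first value creates the combiner, later values of the same key are merged in
    var combiner = createCombiner(fromParent0.head)
    fromParent0.tail.foreach(v => combiner = mergeValue(combiner, v))
    // A combiner built elsewhere (e.g. read back from a spill) is merged index by index
    combiner = mergeCombiners(combiner, createCombiner(fromParent1.head))

    println(combiner.map(_.mkString("[", ",", "]")).mkString(" "))   // prints: [1,2] [x]
  }
}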
CompactBuffer
/**
 * CompactBuffer is essentially an append-only array with a small-size optimization.
 * += appends a value: the first element goes into element0 and the second into
 * element1; only when more than two elements are added is a backing array allocated.
 * @param value
 * @return
 */
def += (value: T): CompactBuffer[T] = {
val newIndex = curSize
if (newIndex == 0) {
element0 = value
curSize = 1
} else if (newIndex == 1) {
element1 = value
curSize = 2
} else {
// Grow the backing array if needed
growToSize(curSize + 1)
// Store the element in the backing array (element0 and element1 hold the first two)
otherElements(newIndex - 2) = value
}
this
}
growToSize
/** Increase our size to newSize and grow the backing array if needed. */
private def growToSize(newSize: Int): Unit = {
// since two elements are held in element0 and element1, the array only needs newSize - 2 slots
val newArraySize = newSize - 2
// Maximum array length: ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH (Integer.MAX_VALUE - 15)
val arrayMax = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH
if (newSize < 0 || newArraySize > arrayMax) {
throw new UnsupportedOperationException(s"Can't grow buffer past $arrayMax elements")
}
// Capacity of the existing backing array, or 0 if it has not been allocated yet
val capacity = if (otherElements != null) otherElements.length else 0
// If the required size exceeds the capacity, start from 8 and keep doubling until it fits
if (newArraySize > capacity) {
var newArrayLen = 8L
while (newArraySize > newArrayLen) {
newArrayLen *= 2
}
if (newArrayLen > arrayMax) {
newArrayLen = arrayMax
}
// Allocate the new backing array
val newArray = new Array[T](newArrayLen.toInt)
if (otherElements != null) {
// Copy the existing elements into the new array
System.arraycopy(otherElements, 0, newArray, 0, otherElements.length)
}
otherElements = newArray
}
curSize = newSize
}
}
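The two-slot optimization of += and the doubling rule in growToSize can be condensed into a stripped-down sketch. MiniCompactBuffer below is a made-up illustration, not the real class; it omits apply/update, the ++= path and the overflow checks:
import scala.reflect.ClassTag

// Minimal illustration of CompactBuffer's storage strategy: the first two elements
// live in plain fields, everything else goes into a backing array that starts at
// capacity 8 and doubles whenever it is too small.
class MiniCompactBuffer[T: ClassTag] {
  private var element0: T = _
  private var element1: T = _
  private var otherElements: Array[T] = null
  private var curSize = 0

  def +=(value: T): this.type = {
    if (curSize == 0) element0 = value
    else if (curSize == 1) element1 = value
    else {
      growToSize(curSize + 1)
      otherElements(curSize - 2) = value   // slot 0 of the array is the 3rd element overall
    }
    curSize += 1
    this
  }

  def toSeq: Seq[T] = (0 until curSize).map {
    case 0 => element0
    case 1 => element1
    case i => otherElements(i - 2)
  }

  private def growToSize(newSize: Int): Unit = {
    val needed = newSize - 2                 // element0/element1 live outside the array
    val capacity = if (otherElements != null) otherElements.length else 0
    if (needed > capacity) {
      var newLen = 8                         // start at 8 and double until it fits
      while (needed > newLen) newLen *= 2
      val newArray = new Array[T](newLen)
      if (otherElements != null) System.arraycopy(otherElements, 0, newArray, 0, capacity)
      otherElements = newArray
    }
  }
}

object MiniCompactBufferDemo {
  def main(args: Array[String]): Unit = {
    val buf = new MiniCompactBuffer[Int]
    (1 to 12).foreach(buf += _)           // 2 elements in the fields, 10 in the backing array
    println(buf.toSeq.mkString(","))      // 1,2,3,...,12
    // The backing array was first allocated with capacity 8 (when the 3rd element arrived)
    // and doubled to 16 when the 11th element arrived.
  }
}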