

// Predefined storage levels.
// NOTE(review): judging from the constant names, the positional constructor
// arguments appear to be (useDisk, useMemory, useOffHeap, deserialized) plus an
// optional replication factor -- confirm against the StorageLevel constructor.

/** No persistence: every flag is false. */
val NONE = new StorageLevel(false, false, false, false)

/** Disk only. */
val DISK_ONLY = new StorageLevel(true, false, false, false)

/** Disk only; the `_2` suffix corresponds to the extra trailing argument `2`. */
val DISK_ONLY_2 = new StorageLevel(true, false, false, false, 2)

/** Memory only, with the fourth flag set (presumably: keep deserialized objects). */
val MEMORY_ONLY = new StorageLevel(false, true, false, true)

val MEMORY_ONLY_2 = new StorageLevel(false, true, false, true, 2)

/** Memory only, with the fourth flag cleared (the `_SER` variants). */
val MEMORY_ONLY_SER = new StorageLevel(false, true, false, false)

val MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, false, 2)

/** Both the disk flag and the memory flag are set. */
val MEMORY_AND_DISK = new StorageLevel(true, true, false, true)

val MEMORY_AND_DISK_2 = new StorageLevel(true, true, false, true, 2)

val MEMORY_AND_DISK_SER = new StorageLevel(true, true, false, false)

val MEMORY_AND_DISK_SER_2 = new StorageLevel(true, true, false, false, 2)

/** Only the third flag is set. */
val OFF_HEAP = new StorageLevel(false, false, true, false)





/**
 * Marks this RDD to be persisted with the given storage level.
 *
 * @param newLevel      the storage level to assign
 * @param allowOverride when true, an already assigned (non-NONE) level may be replaced
 * @return this RDD
 * @throws UnsupportedOperationException if a different level was already assigned and
 *                                       `allowOverride` is false
 */
private def persist(newLevel: StorageLevel, allowOverride: Boolean): this.type = {
  // TODO: Handle changes of StorageLevel
  if (storageLevel != StorageLevel.NONE && newLevel != storageLevel && !allowOverride) {
    throw new UnsupportedOperationException(
      "Cannot change storage level of an RDD after it was already assigned a level")
  }

  // If this is the first time this RDD is marked for persisting, register it
  // with the SparkContext for cleanups and accounting. Do this only once.
  if (storageLevel == StorageLevel.NONE) {
    sc.cleaner.foreach(_.registerRDDForCleanup(this))
    sc.persistRDD(this)
  }

  storageLevel = newLevel
  this
}




/**
 * Returns an iterator over the given partition of this RDD.
 *
 * When a storage level other than NONE is set, the request is delegated to the
 * cache manager's `getOrCompute`; otherwise the partition is obtained through
 * `computeOrReadCheckpoint`.
 *
 * @param split   the partition to iterate over
 * @param context the context of the running task
 */
final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
  if (storageLevel != StorageLevel.NONE) {
    SparkEnv.get.cacheManager.getOrCompute(this, split, context, storageLevel)
  } else {
    computeOrReadCheckpoint(split, context)
  }
}




/**
 * Gets the values of an RDD partition from the block manager, computing and
 * storing them first if they are not already there.
 *
 * @param rdd          the RDD the partition belongs to
 * @param partition    the partition to fetch or compute
 * @param context      the context of the running task
 * @param storageLevel the storage level used when the computed values are stored
 * @return an iterator over the partition's values
 */
def getOrCompute[T](
    rdd: RDD[T],
    partition: Partition,
    context: TaskContext,
    storageLevel: StorageLevel): Iterator[T] = {

  val key = RDDBlockId(rdd.id, partition.index)
  logDebug(s"Looking for partition $key")
  blockManager.get(key) match {
    case Some(blockResult) =>
      // Partition is already materialized, so just return its values
      // NOTE(review): the metrics lookup below was lost in the damaged listing and
      // has been restored from the upstream Spark CacheManager -- verify it against
      // the Spark version this code targets.
      val existingMetrics = context.taskMetrics
        .getInputMetricsForReadMethod(blockResult.readMethod)
      existingMetrics.incBytesRead(blockResult.bytes)

      val iter = blockResult.data.asInstanceOf[Iterator[T]]
      new InterruptibleIterator[T](context, iter) {
        // Count every record handed out from the cached block.
        override def next(): T = {
          existingMetrics.incRecordsRead(1)
          delegate.next()
        }
      }

    case None =>
      // Acquire a lock for loading this partition
      // If another thread already holds the lock, wait for it to finish return its results
      val storedValues = acquireLockForPartition[T](key)
      if (storedValues.isDefined) {
        return new InterruptibleIterator[T](context, storedValues.get)
      }

      // Otherwise, we have to load the partition ourselves
      try {
        logInfo(s"Partition $key not found, computing it")
        val computedValues = rdd.computeOrReadCheckpoint(partition, context)

        // If the task is running locally, do not persist the result
        if (context.isRunningLocally) {
          return computedValues
        }

        // Otherwise, cache the values and keep track of any updates in block statuses
        val updatedBlocks = new ArrayBuffer[(BlockId, BlockStatus)]
        val cachedValues = putInBlockManager(key, computedValues, storageLevel, updatedBlocks)
        val metrics = context.taskMetrics
        val lastUpdatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]())
        metrics.updatedBlocks = Some(lastUpdatedBlocks ++ updatedBlocks.toSeq)
        new InterruptibleIterator(context, cachedValues)

      } finally {
        // Release the per-partition lock and wake up any threads waiting on it,
        // whether the computation succeeded or threw.
        loading.synchronized {
          loading.remove(key)
          loading.notifyAll()
        }
      }
  }
}







rdd.computeOrReadCheckpoint(partition, context) 计算当前分区的数据,并把计算完的数据放到 BlockManager 中;如果有其他线程在等待该分区的计算结果,那么在计算完数据之后还需要通知它们(loading.notifyAll())。



/**
 * Acquires the loading lock for the partition identified by `id`.
 *
 * If no other thread is loading the partition, it is marked as loading and None is
 * returned, so the caller computes it. Otherwise this call waits until the other
 * thread is done and returns the values found in the block manager, if any.
 *
 * @param id the block id of the partition
 * @return the values loaded by another thread, or None when the caller has to
 *         compute the partition itself (in which case the caller holds the lock)
 */
private def acquireLockForPartition[T](id: RDDBlockId): Option[Iterator[T]] = {
  loading.synchronized {
    if (!loading.contains(id)) {
      // If the partition is free, acquire its lock to compute its value
      loading.add(id)
      None
    } else {
      // Otherwise, wait for another thread to finish and return its result
      logInfo(s"Another thread is loading $id, waiting for it to finish...")
      while (loading.contains(id)) {
        try {
          loading.wait()
        } catch {
          case e: Exception =>
            logWarning(s"Exception while waiting for another thread to load $id", e)
        }
      }
      logInfo(s"Finished waiting for $id")
      val values = blockManager.get(id)
      if (!values.isDefined) {
        /* The block is not guaranteed to exist even after the other thread has finished.
         * For instance, the block could be evicted after it was put, but before our get.
         * In this case, we still need to load the partition ourselves. */
        logInfo(s"Whoever was loading $id failed; we'll try it ourselves")
        loading.add(id)
      }
      values.map(_.data.asInstanceOf[Iterator[T]])
    }
  }
}




new InterruptibleIterator[T](context, storedValues.get) 用于获取已经缓存的数据。后续的 RDD 如果需要这个 RDD 的数据,就可以直接从缓存中获取,而不需要重新计算。后面我会对 checkpoint 相关代码进行分析。
