Overview:
1. The initialization of SparkContext matters because it is the prerequisite for the Driver to submit and execute an application; only after SparkContext has been initialized can the Driver submit the user application. In other words, initializing the Spark driver revolves around initializing SparkContext, so SparkContext can be thought of as the engine of a Spark application.
2. This source-code study focuses on local mode.
3. Spark has many components; functionally they cover network communication, distribution, messaging, storage, computation, caching, metrics, cleanup, file serving, and the Web UI.
4. SparkContext initialization proceeds through the following steps (a minimal usage sketch follows this list):
0) SparkConf: the configuration of SparkContext, effectively its control panel;
1) create the Spark execution environment SparkEnv;
2) create the RDD cleaner metadataCleaner;
3) create and initialize the Spark UI;
4) set up Hadoop-related configuration and Executor environment variables;
5) create the task scheduler TaskScheduler;
6) create and start the DAGScheduler;
7) start the TaskScheduler;
8) initialize the block manager BlockManager (one of the main components of the storage subsystem, covered in Chapter 4);
9) start the metrics system MetricsSystem;
10) create and start the executor allocation manager ExecutorAllocationManager;
11) create and start the ContextCleaner;
12) update the Spark environment;
13) create DAGSchedulerSource and BlockManagerSource;
14) mark the SparkContext as active.
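To make these steps concrete, here is a minimal sketch of constructing a SparkContext in local mode (purely illustrative, not from the source; the object name LocalDemo, the master URL "local[*]" and the application name "demo" are arbitrary):

import org.apache.spark.{SparkConf, SparkContext}

object LocalDemo {
  def main(args: Array[String]): Unit = {
    // spark.master and spark.app.name are the two settings SparkContext requires
    val conf = new SparkConf().setMaster("local[*]").setAppName("demo")
    val sc = new SparkContext(conf)   // runs all the initialization steps listed above
    println(sc.parallelize(1 to 10).sum())
    sc.stop()
  }
}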
Main text
I. The control panel: SparkConf
1. Code structure
class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
import SparkConf._
def this() = this(true)
private val settings = new ConcurrentHashMap[String, String]()
if (loadDefaults) {
// Load any system properties whose keys start with "spark."
for ((key, value) <- Utils.getSystemProperties if key.startsWith("spark.")) {
set(key, value)
}
}
//remaining code omitted
}
Utils.getSystemProperties returns the JVM system properties, so SparkConf loads exactly those system properties whose keys start with "spark.".
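A quick sketch of this behavior (the property keys spark.demo.answer and demo.ignored are made up for illustration):

// Any JVM system property whose key starts with "spark." is picked up
// by the no-arg constructor (loadDefaults = true)
System.setProperty("spark.demo.answer", "42")
val conf = new org.apache.spark.SparkConf()
assert(conf.get("spark.demo.answer") == "42")

// Keys without the "spark." prefix are ignored
System.setProperty("demo.ignored", "x")
assert(!new org.apache.spark.SparkConf().contains("demo.ignored"))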
The primary constructor of SparkContext takes a SparkConf; its implementation begins as follows:
class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationClient {
//Capture the current call stack of this SparkContext: the user class closest to the top of the stack and the Spark or Scala core class closest to the bottom
private val creationSite: CallSite = Utils.getCallSite()
private val allowMultipleContexts: Boolean =
config.getBoolean("spark.driver.allowMultipleContexts", false)
SparkContext.markPartiallyConstructed(this, allowMultipleContexts)
Utils.getCallSite()
//skipClass: String => Boolean = sparkInternalExclusionFunction is a default parameter whose value is a function, not currying (see the toy comparison after this listing)
def getCallSite(skipClass: String => Boolean = sparkInternalExclusionFunction): CallSite = {
var lastSparkMethod = "<unknown>"
var firstUserFile = "<unknown>"
var firstUserLine = 0
var insideSpark = true
val callStack = new ArrayBuffer[String]() :+ "<unknown>"
Thread.currentThread.getStackTrace().foreach { ste: StackTraceElement =>
if (ste != null && ste.getMethodName != null
&& !ste.getMethodName.contains("getStackTrace")) {
if (insideSpark) {
if (skipClass(ste.getClassName)) {
lastSparkMethod = if (ste.getMethodName == "<init>") {
// Spark method is a constructor; get its class name
ste.getClassName.substring(ste.getClassName.lastIndexOf('.') + 1)
} else {
ste.getMethodName
}
callStack(0) = ste.toString // Put last Spark method on top of the stack trace.
} else {
if (ste.getFileName != null) {
//Parameterless Java getters can be called without parentheses, e.g. ste.getFileName, ste.getLineNumber
firstUserFile = ste.getFileName
if (ste.getLineNumber >= 0) {
firstUserLine = ste.getLineNumber
}
}
callStack += ste.toString
insideSpark = false
}
} else {
callStack += ste.toString
}
}
}
val callStackDepth = System.getProperty("spark.callstack.depth", "20").toInt
val shortForm =
if (firstUserFile == "HiveSessionImpl.java") {
"Spark JDBC Server Query"
} else {
s"$lastSparkMethod at $firstUserFile:$firstUserLine"
}
//take returns the first n elements of the collection; mkString joins them into one string
val longForm = callStack.take(callStackDepth).mkString("\n")
CallSite(shortForm, longForm)
}
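A note on the skipClass comment above: a default parameter whose value happens to be a function is not the same thing as currying (which would be a second parameter list). A toy comparison, with made-up function names:

// Default parameter whose value is a function -- the style getCallSite uses
def walk(skip: String => Boolean = _.startsWith("scala")): Boolean = skip("scala.Option")

// Currying would instead use a second parameter list
def walkCurried(depth: Int)(skip: String => Boolean): Int = depth

walk()                         // the default predicate is used
walk(_.startsWith("org."))     // the caller supplies its own predicate
walkCurried(20)(_.isEmpty)     // a curried call looks like this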
sparkInternalExclusionFunction
private def sparkInternalExclusionFunction(className: String): Boolean = {
// A regular expression to match classes of the internal Spark API's
// that we want to skip when finding the call site of a method.
val SPARK_CORE_CLASS_REGEX =
"""^org\.apache\.spark(\.api\.java)?(\.util)?(\.rdd)?(\.broadcast)?\.[A-Z]""".r
val SPARK_SQL_CLASS_REGEX = """^org\.apache\.spark\.sql.*""".r
val SCALA_CORE_CLASS_PREFIX = "scala"
val isSparkClass = SPARK_CORE_CLASS_REGEX.findFirstIn(className).isDefined ||
SPARK_SQL_CLASS_REGEX.findFirstIn(className).isDefined
val isScalaClass = className.startsWith(SCALA_CORE_CLASS_PREFIX)
// If the class is a Spark internal class or a Scala class, then exclude.
isSparkClass || isScalaClass
}
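To see what the exclusion function filters out, a few sample class names (the user class com.example.WordCount is made up; the function is private to Utils, so this is conceptual):

sparkInternalExclusionFunction("org.apache.spark.SparkContext")   // true  -- Spark core class
sparkInternalExclusionFunction("org.apache.spark.rdd.RDD")        // true  -- Spark core, rdd package
sparkInternalExclusionFunction("org.apache.spark.sql.DataFrame")  // true  -- Spark SQL class
sparkInternalExclusionFunction("scala.collection.immutable.List") // true  -- Scala class
sparkInternalExclusionFunction("com.example.WordCount")           // false -- user class, eligible as the call site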
The CallSite class
private[spark] case class CallSite(shortForm: String, longForm: String)
private[spark] object CallSite {
val SHORT_FORM = "callSite.short"
val LONG_FORM = "callSite.long"
}
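SHORT_FORM and LONG_FORM ("callSite.short" / "callSite.long") are used as thread-local property keys. As a hedged sketch, user code can set the short form through SparkContext.setCallSite (sc below is assumed to be an already constructed context; the label text is arbitrary):

// The label is stored under the "callSite.short" local property and appears where Spark reports call sites
sc.setCallSite("loading user data")
sc.parallelize(1 to 100).count()   // this job is attributed to "loading user data"
sc.clearCallSite()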
getCallSite
Functional description: getCallSite walks the current call stack of the SparkContext. The Spark or Scala core frame that sits lowest in the stack (closest to the user code) is recorded: its method name is kept in lastSparkMethod and its frame is placed at the top (index 0) of callStack. The user frame closest to the top of the stack is then appended to callStack, its line number stored in firstUserLine and its file name in firstUserFile. The returned case class CallSite holds the short form built from these values and a long form whose depth defaults to 20 frames (spark.callstack.depth).
That said, parts of this logic did not read entirely consistently to me (which may simply be a limit of my own understanding).
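For intuition, this is roughly the CallSite that would be produced for a hypothetical user class WordCount.scala whose line 15 calls sc.textFile (all names and line numbers are made up; CallSite itself is private[spark], so this is conceptual):

val example = CallSite(
  shortForm = "textFile at WordCount.scala:15",   // s"$lastSparkMethod at $firstUserFile:$firstUserLine"
  longForm  = Seq(
    "org.apache.spark.SparkContext.textFile(SparkContext.scala:...)",  // last Spark frame, kept at index 0
    "com.example.WordCount$.main(WordCount.scala:15)"                  // first user frame
  ).mkString("\n"))   // at most spark.callstack.depth (default 20) frames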
Analysis of SparkContext.markPartiallyConstructed(this, allowMultipleContexts)
Purpose: mark the SparkContext as being under construction (a usage sketch follows the two listings below).
Code:
private[spark] def markPartiallyConstructed(
sc: SparkContext,
allowMultipleContexts: Boolean): Unit = {
SPARK_CONTEXT_CONSTRUCTOR_LOCK.synchronized {
//Verify before marking: is another context already running?
assertNoOtherContextIsRunning(sc, allowMultipleContexts)
//contextBeingConstructed is an Option, so it must be assigned Some(...) or None
//assigning Some(sc) marks the current SparkContext as being under construction
contextBeingConstructed = Some(sc)
}
}
private def assertNoOtherContextIsRunning(
sc: SparkContext,
allowMultipleContexts: Boolean): Unit = {
SPARK_CONTEXT_CONSTRUCTOR_LOCK.synchronized {
//Check whether another SparkContext is marked as under construction; if so, log a warning
contextBeingConstructed.foreach { otherContext =>
if (otherContext ne sc) { // checks for reference equality
// Since otherContext might point to a partially-constructed context, guard against
// its creationSite field being null:
val otherContextCreationSite =
Option(otherContext.creationSite).map(_.longForm).getOrElse("unknown location")
val warnMsg = "Another SparkContext is being constructed (or threw an exception in its" +
" constructor). This may indicate an error, since only one SparkContext may be" +
" running in this JVM (see SPARK-2243)." +
s" The other SparkContext was created at:\n$otherContextCreationSite"
logWarning(warnMsg)
}
//Check whether a SparkContext is currently active; if one is, check allowMultipleContexts, and throw an exception if multiple contexts are not allowed
if (activeContext.get() != null) {
val ctx = activeContext.get()
val errMsg = "Only one SparkContext may be running in this JVM (see SPARK-2243)." +
" To ignore this error, set spark.driver.allowMultipleContexts = true. " +
s"The currently running SparkContext was created at:\n${ctx.creationSite.longForm}"
val exception = new SparkException(errMsg)
if (allowMultipleContexts) {
logWarning("Multiple running SparkContexts detected in the same JVM!", exception)
} else {
throw exception
}
}
}
}
}
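Putting the two checks together, a sketch of what happens when a second SparkContext is created in the same JVM (local mode; the app names are arbitrary):

val conf = new SparkConf().setMaster("local").setAppName("first")
val sc1 = new SparkContext(conf)

// Default (spark.driver.allowMultipleContexts = false): the next line throws
// SparkException("Only one SparkContext may be running in this JVM (see SPARK-2243). ...")
// val sc2 = new SparkContext(conf.clone().setAppName("second"))

// With the flag set, the check above only logs
// "Multiple running SparkContexts detected in the same JVM!" and construction continues
val sc2 = new SparkContext(conf.clone()
  .setAppName("second")
  .set("spark.driver.allowMultipleContexts", "true"))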
Next the SparkConf is cloned and the various configuration settings are validated, as shown below.
private[spark] val conf = config.clone()
//Check for illegal or deprecated configuration settings
conf.validateSettings()
if (!conf.contains("spark.master")) {
throw new SparkException("A master URL must be set in your configuration")
}
if (!conf.contains("spark.app.name")) {
throw new SparkException("An application name must be set in your configuration")
从上面校验的代码看到必须指定属性spark.master和spark.app.name,否则会抛出异常,结束初始化过程。spark.master用于设置部署模式,spark.app.name用于指定应用程序名称。
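For example (a sketch; the app name is arbitrary), omitting the master URL fails fast:

// Throws SparkException("A master URL must be set in your configuration")
// because spark.master was never set
val sc = new SparkContext(new SparkConf().setAppName("no-master-demo"))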
The code of conf.validateSettings() is as follows (an example of the memory-fraction check follows the listing):
private[spark] def validateSettings() {
if (contains("spark.local.dir")) {
val msg = "In Spark 1.0 and later spark.local.dir will be overridden by the value set by " +
"the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone and LOCAL_DIRS in YARN)."
logWarning(msg)
}
val executorOptsKey = "spark.executor.extraJavaOptions"
val executorClasspathKey = "spark.executor.extraClassPath"
val driverOptsKey = "spark.driver.extraJavaOptions"
val driverClassPathKey = "spark.driver.extraClassPath"
val driverLibraryPathKey = "spark.driver.extraLibraryPath"
val sparkExecutorInstances = "spark.executor.instances"
// Used by Yarn in 1.1 and before
sys.props.get("spark.driver.libraryPath").foreach { value =>
val warning =
s"""
|spark.driver.libraryPath was detected (set to '$value').
|This is deprecated in Spark 1.2+.
|
|Please instead use: $driverLibraryPathKey
""".stripMargin
logWarning(warning)
}
// Validate spark.executor.extraJavaOptions
getOption(executorOptsKey).map { javaOpts =>
if (javaOpts.contains("-Dspark")) {
val msg = s"$executorOptsKey is not allowed to set Spark options (was '$javaOpts'). " +
"Set them directly on a SparkConf or in a properties file when using ./bin/spark-submit."
throw new Exception(msg)
}
if (javaOpts.contains("-Xmx") || javaOpts.contains("-Xms")) {
val msg = s"$executorOptsKey is not allowed to alter memory settings (was '$javaOpts'). " +
"Use spark.executor.memory instead."
throw new Exception(msg)
}
}
// Validate memory fractions
val memoryKeys = Seq(
"spark.storage.memoryFraction",
"spark.shuffle.memoryFraction",
"spark.shuffle.safetyFraction",
"spark.storage.unrollFraction",
"spark.storage.safetyFraction")
for (key <- memoryKeys) {
val value = getDouble(key, 0.5)
if (value > 1 || value < 0) {
throw new IllegalArgumentException("$key should be between 0 and 1 (was '$value').")
}
}
// Check for legacy configs
sys.env.get("SPARK_JAVA_OPTS").foreach { value =>
val warning =
s"""
|SPARK_JAVA_OPTS was detected (set to '$value').
|This is deprecated in Spark 1.0+.
|
|Please instead use:
| - ./spark-submit with conf/spark-defaults.conf to set defaults for an application
| - ./spark-submit with --driver-java-options to set -X options for a driver
| - spark.executor.extraJavaOptions to set -X options for executors
| - SPARK_DAEMON_JAVA_OPTS to set java options for standalone daemons (master or worker)
""".stripMargin
logWarning(warning)
for (key <- Seq(executorOptsKey, driverOptsKey)) {
if (getOption(key).isDefined) {
throw new SparkException(s"Found both $key and SPARK_JAVA_OPTS. Use only the former.")
} else {
logWarning(s"Setting '$key' to '$value' as a work-around.")
set(key, value)
}
}
}
sys.env.get("SPARK_CLASSPATH").foreach { value =>
val warning =
s"""
|SPARK_CLASSPATH was detected (set to '$value').
|This is deprecated in Spark 1.0+.
|
|Please instead use:
| - ./spark-submit with --driver-class-path to augment the driver classpath
| - spark.executor.extraClassPath to augment the executor classpath
""".stripMargin
logWarning(warning)
for (key <- Seq(executorClasspathKey, driverClassPathKey)) {
if (getOption(key).isDefined) {
throw new SparkException(s"Found both $key and SPARK_CLASSPATH. Use only the former.")
} else {
logWarning(s"Setting '$key' to '$value' as a work-around.")
set(key, value)
}
}
}
if (!contains(sparkExecutorInstances)) {
sys.env.get("SPARK_WORKER_INSTANCES").foreach { value =>
val warning =
s"""
|SPARK_WORKER_INSTANCES was detected (set to '$value').
|This is deprecated in Spark 1.0+.
|
|Please instead use:
| - ./spark-submit with --num-executors to specify the number of executors
| - Or set SPARK_EXECUTOR_INSTANCES
| - spark.executor.instances to configure the number of instances in the spark config.
""".stripMargin
logWarning(warning)
set("spark.executor.instances", value)
}
}
}
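As a concrete illustration of the memory-fraction check (the value 1.5 is just an out-of-range example):

val conf = new SparkConf()
  .setMaster("local")
  .setAppName("validate-demo")
  .set("spark.storage.memoryFraction", "1.5")   // outside the allowed [0, 1] range

// validateSettings() runs during SparkContext construction (conf.validateSettings() above),
// so this throws IllegalArgumentException:
//   "spark.storage.memoryFraction should be between 0 and 1 (was '1.5')."
val sc = new SparkContext(conf)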