/**
 * Submits a job to this tracker and returns its status.
 *
 * As visible here: if {@code jobs} already contains {@code jobId}, the status
 * of that existing job is returned. Otherwise a {@code JobInfo} and a
 * {@code JobInProgress} are created, the job's queue is checked for being in
 * the running state, SUBMIT_JOB access is checked for the current user, and
 * the job is handed to {@code addJob}.
 *
 * NOTE(review): this excerpt appears to have lost most closing braces and
 * possibly some statements, so the exact nesting of the blocks below cannot be
 * determined from this text alone -- confirm against the full source.
 *
 * @param jobId        id of the job being submitted
 * @param jobSubmitDir submit directory, wrapped in a {@code Path} for the JobInfo
 * @param ts           credentials passed through to the JobInProgress constructor
 * @return the status of the already-known job, or the status returned by addJob
 * @throws IOException if creating the JobInProgress fails (the cause is wrapped),
 *                     if the queue is not running, if the access check fails,
 *                     or if addJob fails
 */
public JobStatus submitJob(JobID jobId, String jobSubmitDir, Credentials ts)
throws IOException {
JobInfo jobInfo = null;
// The caller's identity; its short user name becomes the job's user.
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
synchronized (this) {
if (jobs.containsKey(jobId)) {
// job already running, don't start twice
return jobs.get(jobId).getStatus();
jobInfo = new JobInfo(jobId, new Text(ugi.getShortUserName()),
new Path(jobSubmitDir));
// Create the JobInProgress, do not lock the JobTracker since
// we are about to copy job.xml from HDFS
JobInProgress job = null;
try {
job = new JobInProgress(this, this.conf, jobInfo, 0, ts);
} catch (Exception e) {
// Any failure during construction is surfaced to the caller as an IOException.
throw new IOException(e);
synchronized (this) {
// Check whether the queue is running; the default queue name is "default"
String queue = job.getProfile().getQueueName();
if (!queueManager.isRunning(queue)) {
throw new IOException("Queue \"" + queue + "\" is not running");
try {
aclsManager.checkAccess(job, ugi, Operation.SUBMIT_JOB);
} catch (IOException ioe) {
LOG.warn("Access denied for user " + job.getJobConf().getUser()
+ ". Ignoring job " + jobId, ioe);
throw ioe;
// Check the job's memory configuration
// NOTE(review): the try body below is empty in this excerpt -- the actual
// check appears to have been elided; confirm against the full source.
try {
} catch (IOException ioe) {
throw ioe;
boolean recovered = true; // TODO: Once the Job recovery code is there,
// (MAPREDUCE-873) we
// must pass the "recovered" flag accurately.
// This is handled in the trunk/0.22
// With recovered hard-coded to true above, the condition below is false as written.
if (!recovered) {
// Store the information in a file so that the job can be recovered
// later (if at all)
Path jobDir = getSystemDirectoryForJob(jobId);
FileSystem.mkdirs(fs, jobDir, new FsPermission(SYSTEM_DIR_PERMISSION));
// NOTE(review): nothing visible here writes to or closes this stream --
// presumably elided from the excerpt; verify.
FSDataOutputStream out = fs.create(getSystemFileForJob(jobId));
// Submit the job and return its status
JobStatus status;
try {
status = addJob(jobId, job);
} catch (IOException ioe) {
LOG.info("Job " + jobId + " submission failed!", ioe);
status = job.getStatus();
throw ioe;
return status;
/**
 * Constructs the in-progress representation of a submitted job.
 *
 * As visible here, the constructor records the restart count and job id, builds
 * the jobdetails.jsp URL, sets the initial status to PREP, obtains the
 * FileSystem of the submit directory while acting as the submitting user,
 * rejects a job conf file larger than MAX_JOBCONF_SIZE, copies the job conf to
 * a local file and loads it, verifies the user name in the conf against the
 * authenticated user, resolves the queue, and initialises task counts, memory
 * settings, failure limits and the task bookkeeping collections.
 *
 * NOTE(review): this excerpt appears to have lost closing braces and several
 * statements (see the notes inline), so block nesting cannot be determined
 * from this text alone -- confirm against the full source.
 *
 * @param jobtracker   the owning tracker; supplies clock, local FS, queues, metrics
 * @param default_conf configuration used to resolve file systems and local paths
 * @param jobInfo      supplies the job id, the user and the submit directory
 * @param rCount       restart count stored on this job
 * @param ts           credentials stored as the job's token storage; may be null
 * @throws IOException if the job conf exceeds the size limit, the user names do
 *                     not match, or the queue does not exist
 * @throws InterruptedException declared by the signature; presumably from the
 *                     doAs call -- verify
 */
JobInProgress(JobTracker jobtracker, final JobConf default_conf,
JobInfo jobInfo, int rCount, Credentials ts)
throws IOException, InterruptedException {
try {
this.restartCount = rCount;// restart count
this.jobId = JobID.downgrade(jobInfo.getJobID());// obtain the job id
// Web UI link for this job, stored later in the JobProfile.
String url = "http://" + jobtracker.getJobTrackerMachine() + ":"
+ jobtracker.getInfoPort() + "/jobdetails.jsp?jobid=" + jobId;
this.jobtracker = jobtracker;
this.status = new JobStatus(jobId, 0.0f, 0.0f, JobStatus.PREP);
// NOTE(review): conf is only assigned further down in the visible code --
// confirm what value is passed here.
this.jobtracker.getInstrumentation().addPrepJob(conf, jobId);
// Set the start time
this.startTime = jobtracker.getClock().getTime();
this.localFs = jobtracker.getLocalFileSystem();
this.tokenStorage = ts;
// Get the submit directory
jobSubmitDir = jobInfo.getJobSubmitDir();
user = jobInfo.getUser().toString();
userUGI = UserGroupInformation.createRemoteUser(user);
if (ts != null) {
// NOTE(review): the body of this loop is not visible in the excerpt.
for (Token<? extends TokenIdentifier> token : ts.getAllTokens()) {
// Resolve the submit directory's FileSystem while acting as userUGI.
fs = userUGI.doAs(new PrivilegedExceptionAction<FileSystem>() {
public FileSystem run() throws IOException {
return jobSubmitDir.getFileSystem(default_conf);
// Refuse job conf files larger than the tracker's configured maximum.
Path submitJobFile = JobSubmissionFiles.getJobConfPath(jobSubmitDir);
FileStatus fstatus = fs.getFileStatus(submitJobFile);
if (fstatus.getLen() > jobtracker.MAX_JOBCONF_SIZE) {
throw new IOException("Exceeded max jobconf size: "
+ fstatus.getLen() + " limit: " + jobtracker.MAX_JOBCONF_SIZE);
// Copy the job conf to a local file and load it from there.
this.localJobFile = default_conf.getLocalPath(JobTracker.SUBDIR
+"/"+jobId + ".xml");
Path jobFilePath = JobSubmissionFiles.getJobConfPath(jobSubmitDir);
jobFile = jobFilePath.toString();
fs.copyToLocalFile(jobFilePath, localJobFile);
conf = new JobConf(localJobFile);
// NOTE(review): the body of this null check is not visible in the excerpt;
// as printed, the next line would dereference a null user -- confirm
// against the full source.
if (conf.getUser() == null) {
if (!conf.getUser().equals(user)) {
String desc = "The username " + conf.getUser() + " obtained from the " +
"conf doesn't match the username " + user + " the user " +
"authenticated as";
AuditLogger.logFailure(user, Operation.SUBMIT_JOB.name(), conf.getUser(),
jobId.toString(), desc);
throw new IOException(desc);
this.priority = conf.getJobPriority();
String queueName = conf.getQueueName();
this.profile = new JobProfile(user, jobId,
jobFile, url, conf.getJobName(), queueName);
Queue queue = this.jobtracker.getQueueManager().getQueue(queueName);
if (queue == null) {
throw new IOException("Queue \"" + queueName + "\" does not exist");
this.queueMetrics = queue.getMetrics();
this.queueMetrics.addPrepJob(conf, jobId);
this.submitHostName = conf.getJobSubmitHostName();
this.submitHostAddress = conf.getJobSubmitHostAddress();
this.numMapTasks = conf.getNumMapTasks();
this.numReduceTasks = conf.getNumReduceTasks();
this.memoryPerMap = conf.getMemoryForMapTask();
this.memoryPerReduce = conf.getMemoryForReduceTask();
// Initial capacity: one slot per task plus 10 spare.
this.taskCompletionEvents = new ArrayList<TaskCompletionEvent>
(numMapTasks + numReduceTasks + 10);
// Construct the jobACLs
// NOTE(review): no ACL construction is visible after the comment above --
// presumably elided from the excerpt.
this.mapFailuresPercent = conf.getMaxMapTaskFailuresPercent();
this.reduceFailuresPercent = conf.getMaxReduceTaskFailuresPercent();
this.maxTaskFailuresPerTracker = conf.getMaxTaskFailuresPerTracker();
hasSpeculativeMaps = conf.getMapSpeculativeExecution();
hasSpeculativeReduces = conf.getReduceSpeculativeExecution();
// a limit on the input size of the reduce.
// we check to see if the estimated input size of
// of each reduce is less than this value. If not
// we fail the job. A value of -1 just means there is no
// limit set.
reduce_input_limit = -1L;
this.maxLevel = jobtracker.getNumTaskCacheLevels();
this.anyCacheLevel = this.maxLevel+1;
this.nonLocalMaps = new LinkedList<TaskInProgress>();
this.failedMaps = new TreeSet<TaskInProgress>(failComparator);
this.nonLocalRunningMaps = new LinkedHashSet<TaskInProgress>();
this.runningMapCache = new IdentityHashMap<Node, Set<TaskInProgress>>();
this.nonRunningReduces = new TreeSet<TaskInProgress>(failComparator);
this.runningReduces = new LinkedHashSet<TaskInProgress>();
this.resourceEstimator = new ResourceEstimator(this);
// NOTE(review): the getLong call below is never closed, and the line after
// the next comment looks like the argument tail of a different call --
// statements appear to be missing here; confirm against the full source.
this.reduce_input_limit = conf.getLong("mapreduce.reduce.input.limit",
// register job's tokens for renewal
jobInfo.getJobID(), ts, jobtracker.getConf());
// Validate the maximum number of tasks
// NOTE(review): no validation code is visible after the comment above.
} finally {
//close all FileSystems that was created above for the current user
//At this point, this constructor is called in the context of an RPC, and
//hence the "current user" is actually referring to the kerberos
//authenticated user (if security is ON).
下面介绍一个初始化的 job 是如何加入到监控队列和初始化队列中的。在 submitJob 函数中,创建 JobInProgress 之后会通过 `status = addJob(jobId, job);` 提交该 job,队列的添加就在这行代码中完成。
/**
 * Registers a job with this tracker and returns the job's status.
 *
 * As visible here: while holding this object's monitor plus the monitors of
 * {@code jobs} and {@code taskScheduler}, the job is stored in {@code jobs}
 * under the id taken from its profile, the registered JobInProgressListeners
 * are iterated, submission is recorded on the tracker instrumentation and on
 * the job's queue metrics, and a success message is logged.
 *
 * NOTE(review): this excerpt appears to have lost closing braces and some
 * statements (see the notes inline) -- confirm against the full source.
 *
 * @param jobId id used for metrics and logging
 * @param job   the job to register
 * @return {@code job.getStatus()}
 * @throws IOException declared by the signature; the visible code does not
 *                     show where it originates
 */
private synchronized JobStatus addJob(JobID jobId, JobInProgress job)
throws IOException {
// Lock order as written: this (method monitor), then jobs, then taskScheduler.
synchronized (jobs) {
synchronized (taskScheduler) {
jobs.put(job.getProfile().getJobID(), job);
// NOTE(review): the loop body is not visible in the excerpt -- presumably
// each listener is notified of the new job; verify.
for (JobInProgressListener listener : jobInProgressListeners) {
myInstrumentation.submitJob(job.getJobConf(), jobId);
job.getQueueMetrics().submitJob(job.getJobConf(), jobId);
LOG.info("Job " + jobId + " added successfully for user '"
+ job.getJobConf().getUser() + "' to queue '"
+ job.getJobConf().getQueueName() + "'");
// NOTE(review): the next line is the argument tail of a call whose start
// is missing from the excerpt -- confirm against the full source.
Operation.SUBMIT_JOB.name(), jobId.toString());
return job.getStatus();