HDFS Source Code: The NameNode Startup Flow

1. start-dfs.sh

"$HADOOP_PREFIX/sbin/hadoop-daemons.sh" 
  --config "$HADOOP_CONF_DIR" \
  --hostnames "$NAMENODES" \
  --script "$bin/hdfs" start namenode $nameStartOpt

2. hadoop-daemons.sh

"$bin/slaves.sh" --config $HADOOP_CONF_DIR cd "$HADOOP_PREFIX" \; "$bin/hadoop-daemon.sh" --config $HADOOP_CONF_DIR "$@"

3. hadoop-daemon.sh

case $command in
      namenode|secondarynamenode|datanode|journalnode|dfs|dfsadmin|fsck|balancer|zkfc)
        if [ -z "$HADOOP_HDFS_HOME" ]; then
          hdfsScript="$HADOOP_PREFIX"/bin/hdfs
        else
          hdfsScript="$HADOOP_HDFS_HOME"/bin/hdfs
        fi
        nohup nice -n $HADOOP_NICENESS $hdfsScript --config $HADOOP_CONF_DIR $command "$@" > "$log" 2>&1 < /dev/null &
      ;;
      (*)
        nohup nice -n $HADOOP_NICENESS $hadoopScript --config $HADOOP_CONF_DIR $command "$@" > "$log" 2>&1 < /dev/null &
      ;;
esac

4. hdfs

CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"

5. NameNode

//Entering NameNode: execution starts from main()
public static void main(String argv[]) throws Exception {
    //.........
    try {
      StringUtils.startupShutdownMessage(NameNode.class, argv, LOG);
      NameNode namenode = createNameNode(argv, null);
      if (namenode != null) {
        //join() blocks here until the NameNode shuts down; nothing else runs in main afterwards
        namenode.join();
      }
    } catch (Throwable e) {
      //.........
    }
  }
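In 2.x, NameNode#join essentially just delegates to rpcServer.join(), blocking until the RPC server stops. That is what keeps the NameNode process alive once startup completes.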


//org.apache.hadoop.hdfs.server.namenode.NameNode#createNameNode
//Only two cases are covered here. FORMAT: format the NameNode; default: normal NameNode startup.
switch (startOpt) {
  case FORMAT: {
    boolean aborted = format(conf, startOpt.getForceFormat(),
        startOpt.getInteractiveFormat());
    terminate(aborted ? 1 : 0);
    return null; // avoid javac warning
  }
  //.......
  default: {
    DefaultMetricsSystem.initialize("NameNode");
    return new NameNode(conf);
  }
}
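Before this switch runs, createNameNode parses argv into a StartupOption via parseArguments() (for example, -format yields StartupOption.FORMAT, while no flag yields REGULAR and falls through to the default branch) and records it in the Configuration through setStartupOption().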
5.1 format
//org.apache.hadoop.hdfs.server.namenode.NameNode#format
private static boolean format(Configuration conf, boolean force,
      boolean isInteractive) throws IOException {
    String nsId = DFSUtil.getNamenodeNameServiceId(conf);
    String namenodeId = HAUtil.getNameNodeId(conf, nsId);
    initializeGenericKeys(conf, nsId, namenodeId);
    checkAllowFormat(conf);
	
    //Kerberos login when security is enabled
    if (UserGroupInformation.isSecurityEnabled()) {
      InetSocketAddress socAddr = getAddress(conf);
      SecurityUtil.login(conf, DFS_NAMENODE_KEYTAB_FILE_KEY,
          DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY, socAddr.getHostName());
    }
    // Collect the metadata directories: fsimage dirs, edit-log dirs, and shared edits dirs
    Collection<URI> nameDirsToFormat = FSNamesystem.getNamespaceDirs(conf);
    List<URI> sharedDirs = FSNamesystem.getSharedEditsDirs(conf);
    List<URI> dirsToPrompt = new ArrayList<URI>();
    dirsToPrompt.addAll(nameDirsToFormat);
    dirsToPrompt.addAll(sharedDirs);
    List<URI> editDirsToFormat = 
                 FSNamesystem.getNamespaceEditsDirs(conf);

    // if clusterID is not provided - see if you can find the current one
    String clusterId = StartupOption.FORMAT.getClusterId();
    if(clusterId == null || clusterId.equals("")) {
      //Generate a new cluster id
      clusterId = NNStorage.newClusterID();
    }
    System.out.println("Formatting using clusterid: " + clusterId);
    //Instantiate FSImage. Its javadoc: "FSImage handles checkpointing and logging of the namespace edits."
    FSImage fsImage = new FSImage(conf, nameDirsToFormat, editDirsToFormat);
    try {
      //Instantiate FSNamesystem. Its javadoc: "FSNamesystem does the actual bookkeeping work for the DataNode."
      FSNamesystem fsn = new FSNamesystem(conf, fsImage);
      fsImage.getEditLog().initJournalsForWrite();

      if (!fsImage.confirmFormat(force, isInteractive)) {
        return true; // aborted
      }
      //Perform the actual format
      fsImage.format(fsn, clusterId);
    } catch (IOException ioe) {
      LOG.warn("Encountered exception during format: ", ioe);
      fsImage.close();
      throw ioe;
    }
    return false;
  }
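Two things are worth noting about this path. First, it is what runs for hdfs namenode -format [-clusterid <cid>]. Second, the return value is inverted relative to intuition: format() returns true when the user aborts at the confirmation prompt and false on success, and createNameNode maps that to the process exit code via terminate(aborted ? 1 : 0) in the FORMAT case above.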

//org.apache.hadoop.hdfs.server.namenode.FSImage#format
void format(FSNamesystem fsn, String clusterId) throws IOException {
    long fileCount = fsn.getTotalFiles();
    // Expect 1 file, which is the root inode
    Preconditions.checkState(fileCount == 1,
        "FSImage.format should be called with an uninitialized namesystem, has " +
        fileCount + " files");
    NamespaceInfo ns = NNStorage.newNamespaceInfo();
    LOG.info("Allocated new BlockPoolId: " + ns.getBlockPoolID());
    ns.clusterID = clusterId;
    //Format the storage directories and the non-file (shared) edit-log journals
    storage.format(ns);
    editLog.formatNonFileJournals(ns);
    saveFSImageInAllDirs(fsn, 0);
  }
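For reference, the cluster ID allocated above is nothing exotic: to the best of my reading of NNStorage.newClusterID(), it is just a random UUID with a CID- prefix. A standalone sketch of the same idea (ClusterIdDemo is an illustrative name, not Hadoop code):

import java.util.UUID;

public class ClusterIdDemo {
  public static void main(String[] args) {
    // Same shape as the IDs printed by "Formatting using clusterid: ..."
    System.out.println("CID-" + UUID.randomUUID());
  }
}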
5.2 default
//org.apache.hadoop.hdfs.server.namenode.NameNode#NameNode (constructor)
try {
      initializeGenericKeys(conf, nsId, namenodeId);
      //The key call
      initialize(conf);
      try {
        haContext.writeLock();
        state.prepareToEnterState(haContext);
        state.enterState(haContext);
      } finally {
        haContext.writeUnlock();
      }
    //......
}

//org.apache.hadoop.hdfs.server.namenode.NameNode#initialize
protected void initialize(Configuration conf) throws IOException {
    //........
    NameNode.initMetrics(conf, this.getRole());
    StartupProgressMetrics.register(startupProgress);

    if (NamenodeRole.NAMENODE == role) {
      //1. Start the HttpServer2-based web UI (default port 50070)
      startHttpServer(conf);
    }

    this.spanReceiverHost =
      SpanReceiverHost.get(conf, DFSConfigKeys.DFS_SERVER_HTRACE_PREFIX);
	
    //2. Load the on-disk metadata (fsimage + edit log) into memory
    loadNamesystem(conf);

    //3. Create the RPC server
    rpcServer = createRpcServer(conf);
    if (clientNamenodeAddress == null) {
      // This is expected for MiniDFSCluster. Set it now using 
      // the RPC server's bind address.
      clientNamenodeAddress = 
          NetUtils.getHostPortString(rpcServer.getRpcAddress());
      LOG.info("Clients are to use " + clientNamenodeAddress + " to access"
          + " this namenode/service.");
    }
    if (NamenodeRole.NAMENODE == role) {
      httpServer.setNameNodeAddress(getNameNodeAddress());
      httpServer.setFSImage(getFSImage());
    }
    
    //Monitor JVM pauses (GC stalls, etc.)
    pauseMonitor = new JvmPauseMonitor(conf);
    pauseMonitor.start();
    metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
    //4. Start the common services
    startCommonServices(conf);
  }
5.2.1 startHttpServer
//org.apache.hadoop.hdfs.server.namenode.NameNode#startHttpServer
private void startHttpServer(final Configuration conf) throws IOException {
    httpServer = new NameNodeHttpServer(conf, this, getHttpServerBindAddress(conf));
    httpServer.start();
    httpServer.setStartupProgress(startupProgress);
}
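The bind address used here comes from dfs.namenode.http-address, which defaults to 0.0.0.0:50070 in Hadoop 2.x; that is the port behind the familiar NameNode web UI.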

//org.apache.hadoop.hdfs.server.namenode.NameNodeHttpServer#start
void start() throws IOException {
    //Determine the HTTP policy (HTTP_ONLY, HTTPS_ONLY, or HTTP_AND_HTTPS)
    HttpConfig.Policy policy = DFSUtil.getHttpPolicy(conf);
    //..........
	
    HttpServer2.Builder builder = DFSUtil.httpServerTemplateForNNAndJN(conf,
        httpAddr, httpsAddr, "hdfs",
        DFSConfigKeys.DFS_NAMENODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY,
        DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY);

    httpServer = builder.build();

    if (policy.isHttpsEnabled()) {
      // assume same ssl port for all datanodes
      // the DataNode HTTPS port is published as a servlet-context attribute for later use
      InetSocketAddress datanodeSslPort = NetUtils.createSocketAddr(conf.getTrimmed(
          DFSConfigKeys.DFS_DATANODE_HTTPS_ADDRESS_KEY, infoHost + ":"
              + DFSConfigKeys.DFS_DATANODE_HTTPS_DEFAULT_PORT));
      httpServer.setAttribute(DFSConfigKeys.DFS_DATANODE_HTTPS_PORT_KEY,
          datanodeSslPort.getPort());
    }

    initWebHdfs(conf);
	//.........
    //Start the HTTP server
    httpServer.start();
    //..............
  }
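Once this HttpServer2 instance is up you can poke it directly: besides the UI pages it serves the standard Hadoop /jmx servlet. A minimal probe, assuming a NameNode web UI on localhost:50070 (adjust host/port to your cluster):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class NnHttpDemo {
  public static void main(String[] args) throws Exception {
    // Queries NameNode status beans over the embedded HTTP server
    URL url = new URL("http://localhost:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo");
    try (BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()))) {
      String line;
      while ((line = in.readLine()) != null) {
        System.out.println(line);
      }
    }
  }
}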
5.2.2 loadNamesystem
//org.apache.hadoop.hdfs.server.namenode.FSNamesystem#loadFromDisk
//Instantiates an FSNamesystem loaded from the image and edits
static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
    checkConfiguration(conf);
    //Build an FSImage from the configured namespace dirs and edits dirs
    FSImage fsImage = new FSImage(conf,
        FSNamesystem.getNamespaceDirs(conf),
        FSNamesystem.getNamespaceEditsDirs(conf));
    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    if (startOpt == StartupOption.RECOVER) {
      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
    }

    long loadStart = monotonicNow();
    try {
      //Load the metadata into memory.
      //The load runs under the namesystem lock; the NameNode can serve nothing until it finishes.
      namesystem.loadFSImage(startOpt);
    } catch (IOException ioe) {
      LOG.warn("Encountered exception loading fsimage", ioe);
      fsImage.close();
      throw ioe;
    }
    long timeTakenToLoadFSImage = monotonicNow() - loadStart;
    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
    if (nnMetrics != null) {
      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
    }
    return namesystem;
  }
5.2.3 createRpcServer
/**
   * Create the RPC server implementation. Used as an extension point for the
   * BackupNode.
   */
  protected NameNodeRpcServer createRpcServer(Configuration conf)
      throws IOException {
    return new NameNodeRpcServer(conf, this);
  }

//org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer#NameNodeRpcServer
//This class is responsible for handling all of the RPC calls to the NameNode.
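Every client operation ends up at this server. For a feel of the path, the sketch below (ListRootDemo is an illustrative name; it assumes a running 2.x cluster with fs.defaultFS such as hdfs://localhost:9000) issues a listing that is served on the NameNode side by the RPC server's getListing handler:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListRootDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Assumption: adjust to your own cluster's NameNode RPC address
    conf.set("fs.defaultFS", "hdfs://localhost:9000");
    try (FileSystem fs = FileSystem.get(conf)) {
      for (FileStatus st : fs.listStatus(new Path("/"))) {
        System.out.println(st.getPath());
      }
    }
  }
}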
5.2.4 startCommonServices
//org.apache.hadoop.hdfs.server.namenode.NameNode#startCommonServices
private void startCommonServices(Configuration conf) throws IOException {
    //The essential call
    namesystem.startCommonServices(conf, haContext);
    registerNNSMXBean();
    if (NamenodeRole.NAMENODE != role) {
      //Roles other than NAMENODE (e.g. BackupNode) start their HTTP server here
      startHttpServer(conf);
      httpServer.setNameNodeAddress(getNameNodeAddress());
      httpServer.setFSImage(getFSImage());
    }
    //Start the RPC server
    rpcServer.start();
    //......
  }
5.2.4.1 startCommonServices
/** 
   * Start services common to both active and standby states
   */
  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
    this.registerMBean(); // register the MBean for the FSNamesystemState
    writeLock();
    this.haContext = haContext;
    try {
     
      //Instantiate NameNodeResourceChecker, which checks disk resources.
      //On an active NameNode, a NameNodeResourceMonitor thread periodically calls
      //NameNodeResourceChecker#hasAvailableDiskSpace() (see 5.2.4.2 below).
      //It mainly checks the fsimage and edit-log directories.
      nnResourceChecker = new NameNodeResourceChecker(conf);
      //First resource check; if space is short, a warning is logged and the NameNode will end up in safe mode
      checkAvailableResources();
      // Sanity check: safe mode is still set up and the replication queues are not yet being populated.
      /**
        * With insufficient disk space, there is no guarantee that newly produced
        * edit-log records and fsimage data can actually be written out. So the
        * NameNode enters safe mode, forbidding metadata modifications so that no
        * new log data has to reach disk.
        * */
      assert safeMode != null && !isPopulatingReplQueues();
      //StartupProgress tracks the progress of each NameNode startup phase
      StartupProgress prog = NameNode.getStartupProgress();
      //The NameNode now enters the SAFEMODE phase, waiting for DataNodes to report their blocks
      prog.beginPhase(Phase.SAFEMODE);
      long completeBlocksTotal = getCompleteBlocksTotal();
      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,completeBlocksTotal);
      //Record the expected block total; used later to decide whether safe mode can be left
      setBlockTotal(completeBlocksTotal);
      //Activate the BlockManager. Its javadoc: "Keeps information related to the blocks stored in the Hadoop cluster."
      blockManager.activate(conf);
    } finally {
      writeUnlock();
    }
    
    registerMXBean();
    DefaultMetricsSystem.instance().register(this);
    if (inodeAttributeProvider != null) {
      inodeAttributeProvider.start();
      dir.setINodeAttributeProvider(inodeAttributeProvider);
    }
    snapshotManager.registerMXBean();
  }
5.2.4.2 NameNodeResourceChecker
public NameNodeResourceChecker(Configuration conf) throws IOException {
    this.conf = conf;
    //Volumes (paths) that need to be checked
    volumes = new HashMap<String, CheckedVolume>();
	//Minimum free space tolerated per volume, 100MB by default (dfs.namenode.resource.du.reserved)
    duReserved = conf.getLong(DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_KEY,
        DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_DEFAULT);
    //Extra volumes to check, taken from dfs.namenode.resource.checked.volumes
    Collection<URI> extraCheckedVolumes = Util.stringCollectionAsURIs(conf
        .getTrimmedStringCollection(DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_KEY));
    //Keep only the local (file:// scheme) edits dirs; shared JournalNode URIs are filtered out
    Collection<URI> localEditDirs = Collections2.filter(
        FSNamesystem.getNamespaceEditsDirs(conf),
        new Predicate<URI>() {
          @Override
          public boolean apply(URI input) {
            if (input.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) {
              return true;
            }
            return false;
          }
        });

    // Add all the local edits dirs, marking some as required if they are
    // configured as such.
    for (URI editsDirToCheck : localEditDirs) {
      addDirToCheck(editsDirToCheck,
          FSNamesystem.getRequiredNamespaceEditsDirs(conf).contains(
              editsDirToCheck));
    }

    // All extra checked volumes are marked "required"
    for (URI extraDirToCheck : extraCheckedVolumes) {
      addDirToCheck(extraDirToCheck, true);
    }
    
    minimumRedundantVolumes = conf.getInt(
        DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_KEY,
        DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_DEFAULT);
  }
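The 100MB default mentioned above is dfs.namenode.resource.du.reserved. Reading the effective value back is straightforward (a sketch assuming hadoop-hdfs 2.x on the classpath; DuReservedDemo is an illustrative name):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;

public class DuReservedDemo {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // dfs.namenode.resource.du.reserved, default 1024 * 1024 * 100 = 100MB
    long duReserved = conf.getLong(DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_KEY,
        DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_DEFAULT);
    System.out.println("duReserved = " + duReserved + " bytes");
  }
}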

//Where is the NameNodeResourceMonitor mentioned above actually started?
//org.apache.hadoop.hdfs.server.namenode.FSNamesystem#startActiveServices
//(at roughly line 1130)
//ResourceMonitor required only at ActiveNN. See HDFS-2914
this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
nnrmthread.start();
//A daemon thread that keeps monitoring in the background

//As soon as disk space runs low, it enters safe mode:
public void run () {
      try {
        while (fsRunning && shouldNNRmRun) {
          checkAvailableResources();
          if(!nameNodeHasResourcesAvailable()) {
            String lowResourcesMsg = "NameNode low on available disk space. ";
            if (!isInSafeMode()) {
              LOG.warn(lowResourcesMsg + "Entering safe mode.");
            } else {
              LOG.warn(lowResourcesMsg + "Already in safe mode.");
            }
            enterSafeMode(true);
          }
          try {
            Thread.sleep(resourceRecheckInterval);
          } catch (InterruptedException ie) {
            // Deliberately ignore
          }
        }
      } catch (Exception e) {
        FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
      }
    }
5.2.4.3 checkAvailableResources
//org.apache.hadoop.hdfs.server.namenode.FSNamesystem#checkAvailableResources
void checkAvailableResources() {
    Preconditions.checkState(nnResourceChecker != null,
        "nnResourceChecker not initialized");
    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
}
//The NameNode checks once during initialization; after startup, the NameNodeResourceMonitor thread keeps re-checking every resourceRecheckInterval (dfs.namenode.resource.check.interval, 5 seconds by default)
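Under the hood, hasAvailableDiskSpace() boils down to comparing org.apache.hadoop.fs.DF#getAvailable() against duReserved for each checked volume. A standalone sketch of that comparison (the path and threshold here are illustrative, not the NameNode's own):

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.DF;

public class VolumeCheckDemo {
  public static void main(String[] args) throws Exception {
    // DF wraps the platform's df command; the resource checker uses the same primitive
    DF df = new DF(new File("/tmp"), new Configuration());
    long available = df.getAvailable();
    long duReserved = 1024L * 1024 * 100; // the 100MB default
    System.out.println("available=" + available
        + " hasAvailableDiskSpace=" + (available >= duReserved));
  }
}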
5.2.4.4 getStartupProgress
StartupProgress prog = NameNode.getStartupProgress();
//getStartupProgress exposes the NameNode's startup state: after starting, the NameNode first enters safe mode and waits for block reports, and it leaves safe mode only once the reported blocks meet the minimum threshold.
5.2.4.5 getCompleteBlocksTotal
long completeBlocksTotal = getCompleteBlocksTotal();
prog.setTotal(Phase.SAFEMODE,STEP_AWAITING_REPORTED_BLOCKS,completeBlocksTotal);
setBlockTotal(completeBlocksTotal);
//The NameNode records the reported-block target via setBlockTotal(). Where does that total come from?
//From getCompleteBlocksTotal():

/**
   * Get the total number of COMPLETE blocks in the system.
   * For safe mode only complete blocks are counted.
   * This is invoked only during NN startup and checkpointing.
   */
  public long getCompleteBlocksTotal() {
    // Calculate number of blocks under construction
    long numUCBlocks = 0;
    readLock();
    try {
      //In HDFS a block can be in one of four states (BlockUCState):
        //1. UNDER_CONSTRUCTION: currently being written
        //2. UNDER_RECOVERY: currently being recovered
        //3. COMMITTED: the client has reported the final length, but the minimum number of replicas has not been reported yet
        //4. COMPLETE: fully written and replicated
      //At startup the NameNode counts only COMPLETE blocks
      numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
      return getBlocksTotal() - numUCBlocks;
    } finally {
      readUnlock();
    }
  }

//org.apache.hadoop.hdfs.server.namenode.LeaseManager#getNumUnderConstructionBlocks
/**
   * This method iterates through all the leases and counts the number of blocks
   * which are not COMPLETE. The FSNamesystem read lock MUST be held before
   * calling this method.
   */
  //Walk every file under construction and count its incomplete blocks
  synchronized long getNumUnderConstructionBlocks() {
    assert this.fsnamesystem.hasReadLock() : "The FSNamesystem read lock wasn't"
      + "acquired before counting under construction blocks";
    long numUCBlocks = 0;
    for (Long id : getINodeIdWithLeases()) {
      final INodeFile cons = fsnamesystem.getFSDirectory().getInode(id).asFile();
      Preconditions.checkState(cons.isUnderConstruction());
      BlockInfo[] blocks = cons.getBlocks();
      if(blocks == null) {
        continue;
      }
      for(BlockInfo b : blocks) {
        if(!b.isComplete())
          numUCBlocks++;
      }
    }
    LOG.info("Number of blocks under construction: " + numUCBlocks);
    return numUCBlocks;
  }
5.2.4.6 setBlockTotal
setBlockTotal(completeBlocksTotal);
//FSNamesystem#setBlockTotal delegates to SafeModeInfo#setBlockTotal:
public void setBlockTotal(long completeBlocksTotal) {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null)
      return;
    safeMode.setBlockTotal((int) completeBlocksTotal);
}

/**
 * Set total number of blocks.
 */
private synchronized void setBlockTotal(int total) {
  //Total number of COMPLETE blocks in the system
  this.blockTotal = total;
  //Threshold for leaving safe mode, e.g. 1000 * 0.999f = 999
  this.blockThreshold = (int) (blockTotal * threshold);
  this.blockReplQueueThreshold = 
    (int) (blockTotal * replQueueThreshold);
  if (haEnabled) {
    // After we initialize the block count, any further namespace
    // modifications done while in safe mode need to keep track
    // of the number of total blocks in the system.
    this.shouldIncrementallyTrackBlocks = true;
  }
  if(blockSafe < 0)
    this.blockSafe = 0;
  //checkMode() checks the safe-mode state against these totals:
  //to enter: needEnter() decides whether the thresholds call for safe mode
  //to leave, two conditions are checked:
  //1. the thresholds are satisfied (e.g. at least 999 of 1000 blocks reported)
  //2. the SafeModeMonitor thread, re-checking every second, confirms safe mode can be left
  checkMode();
}
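The arithmetic is easiest to see with numbers: with the default dfs.namenode.safemode.threshold-pct of 0.999f and 1000 total blocks, blockThreshold comes out to 999, so at least 999 blocks must be reported before safe mode can be left. A self-contained check of just that calculation (SafeModeThresholdDemo is an illustrative name):

public class SafeModeThresholdDemo {
  public static void main(String[] args) {
    long blockTotal = 1000;   // completeBlocksTotal
    float threshold = 0.999f; // dfs.namenode.safemode.threshold-pct default
    int blockThreshold = (int) (blockTotal * threshold);
    long blockSafe = 998;     // blocks reported so far
    boolean needEnter = threshold != 0 && blockSafe < blockThreshold;
    System.out.println("blockThreshold=" + blockThreshold // 999
        + " needEnter=" + needEnter);                     // true until 999 reported
  }
}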


/**
 * Check and trigger safe mode if needed. 
 */
private void checkMode() {
  //The caller must hold the namesystem write lock
  assert hasWriteLock();
  //Skip the check while the NameNode is transitioning to active
  if (inTransitionToActive()) {
    return;
  }
  //Enter safe mode if needEnter() says so
  if (smmthread == null && needEnter()) {
    enter();
    //..................
  }
  //.........................
}


/** 
 * There is no need to enter safe mode 
 * if DFS is empty or {@link #threshold} == 0
 */
private boolean needEnter() {
  return (threshold != 0 && blockSafe < blockThreshold) ||
    (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
    (!nameNodeHasResourcesAvailable());
}
//When does the NameNode enter safe mode? Any one of these conditions suffices:
//1. the number of reported blocks is below the block threshold (e.g. blockTotal = 1000, blockThreshold = 1000 * 0.999f = 999)
//2. the number of live DataNodes is below datanodeThreshold (dfs.namenode.safemode.min.datanodes, 0 by default, so this check is off)
//3. the NameNode metadata disks have less than the reserved 100MB free

//That concludes the NameNode startup flow.