HDFS Source Code: The NameNode Startup Flow
1. start-dfs.sh
"$HADOOP_PREFIX/sbin/hadoop-daemons.sh"
--config "$HADOOP_CONF_DIR" \
--hostnames "$NAMENODES" \
--script "$bin/hdfs" start namenode $nameStartOpt
2. hadoop-daemons.sh
"$bin/slaves.sh" --config $HADOOP_CONF_DIR cd "$HADOOP_PREFIX" \; "$bin/hadoop-daemon.sh" --config $HADOOP_CONF_DIR "$@"
3. hadoop-daemon.sh
case $command in
namenode|secondarynamenode|datanode|journalnode|dfs|dfsadmin|fsck|balancer|zkfc)
if [ -z "$HADOOP_HDFS_HOME" ]; then
hdfsScript="$HADOOP_PREFIX"/bin/hdfs
else
hdfsScript="$HADOOP_HDFS_HOME"/bin/hdfs
fi
nohup nice -n $HADOOP_NICENESS $hdfsScript --config $HADOOP_CONF_DIR $command "$@" > "$log" 2>&1 < /dev/null &
;;
(*)
nohup nice -n $HADOOP_NICENESS $hadoopScript --config $HADOOP_CONF_DIR $command "$@" > "$log" 2>&1 < /dev/null &
;;
esac
4. hdfs
CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"
5. NameNode
//Entering NameNode: execution starts in the main method
public static void main(String argv[]) throws Exception {
//.........
try {
StringUtils.startupShutdownMessage(NameNode.class, argv, LOG);
NameNode namenode = createNameNode(argv, null);
if (namenode != null) {
//Block here until the NameNode shuts down; join() waits on the RPC server thread
namenode.join();
}
} catch (Throwable e) {
//.........
}
}
//org.apache.hadoop.hdfs.server.namenode.NameNode#createNameNode
//Only two cases are covered here. FORMAT: format the NameNode; default: normal NameNode startup
switch (startOpt) {
case FORMAT: {
boolean aborted = format(conf, startOpt.getForceFormat(),
startOpt.getInteractiveFormat());
terminate(aborted ? 1 : 0);
return null; // avoid javac warning
}
//.......
default: {
DefaultMetricsSystem.initialize("NameNode");
return new NameNode(conf);
}
}
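For intuition, here is a minimal, self-contained sketch of this dispatch (names and parsing are simplified stand-ins; the real logic lives in NameNode.parseArguments and the StartupOption enum):
import java.util.Locale;

public class StartOptDemo {
  enum StartupOption { FORMAT, REGULAR }

  // Simplified stand-in for NameNode.parseArguments(argv)
  static StartupOption parse(String[] argv) {
    if (argv.length > 0 && "-format".equals(argv[0].toLowerCase(Locale.ROOT))) {
      return StartupOption.FORMAT;
    }
    return StartupOption.REGULAR;
  }

  public static void main(String[] args) {
    switch (parse(args)) {
      case FORMAT:
        System.out.println("would format the namenode and exit");
        break;
      default:
        System.out.println("would construct a NameNode and start its services");
    }
  }
}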
5.1 format
//org.apache.hadoop.hdfs.server.namenode.NameNode#format
private static boolean format(Configuration conf, boolean force,
boolean isInteractive) throws IOException {
String nsId = DFSUtil.getNamenodeNameServiceId(conf);
String namenodeId = HAUtil.getNameNodeId(conf, nsId);
initializeGenericKeys(conf, nsId, namenodeId);
checkAllowFormat(conf);
//Kerberos login when security is enabled
if (UserGroupInformation.isSecurityEnabled()) {
InetSocketAddress socAddr = getAddress(conf);
SecurityUtil.login(conf, DFS_NAMENODE_KEYTAB_FILE_KEY,
DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY, socAddr.getHostName());
}
// Gather the NameNode metadata directories (fsimage, edit log, shared edits)
Collection<URI> nameDirsToFormat = FSNamesystem.getNamespaceDirs(conf);
List<URI> sharedDirs = FSNamesystem.getSharedEditsDirs(conf);
List<URI> dirsToPrompt = new ArrayList<URI>();
dirsToPrompt.addAll(nameDirsToFormat);
dirsToPrompt.addAll(sharedDirs);
List<URI> editDirsToFormat =
FSNamesystem.getNamespaceEditsDirs(conf);
// if clusterID is not provided - see if you can find the current one
String clusterId = StartupOption.FORMAT.getClusterId();
if(clusterId == null || clusterId.equals("")) {
//Generate a new cluster id
clusterId = NNStorage.newClusterID();
}
System.out.println("Formatting using clusterid: " + clusterId);
//Instantiate FSImage
//Per its Javadoc: "FSImage handles checkpointing and logging of the namespace edits."
FSImage fsImage = new FSImage(conf, nameDirsToFormat, editDirsToFormat);
try {
//Instantiate FSNamesystem
//Per its Javadoc: "FSNamesystem does the actual bookkeeping work for the DataNode."
FSNamesystem fsn = new FSNamesystem(conf, fsImage);
fsImage.getEditLog().initJournalsForWrite();
if (!fsImage.confirmFormat(force, isInteractive)) {
return true; // aborted
}
//Perform the actual NameNode format
fsImage.format(fsn, clusterId);
} catch (IOException ioe) {
LOG.warn("Encountered exception during format: ", ioe);
fsImage.close();
throw ioe;
}
return false;
}
//org.apache.hadoop.hdfs.server.namenode.FSImage#format
void format(FSNamesystem fsn, String clusterId) throws IOException {
long fileCount = fsn.getTotalFiles();
// Expect 1 file, which is the root inode
Preconditions.checkState(fileCount == 1,
"FSImage.format should be called with an uninitialized namesystem, has " +
fileCount + " files");
NamespaceInfo ns = NNStorage.newNamespaceInfo();
LOG.info("Allocated new BlockPoolId: " + ns.getBlockPoolID());
ns.clusterID = clusterId;
//Format the storage directories and the non-file journals (edit log)
storage.format(ns);
editLog.formatNonFileJournals(ns);
saveFSImageInAllDirs(fsn, 0);
}
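As a side note, NNStorage.newClusterID() is essentially a prefixed random UUID; a one-line sketch of the idea (the exact format may vary across versions):
import java.util.UUID;

public class ClusterIdDemo {
  // Sketch of NNStorage.newClusterID(): a "CID-"-prefixed random UUID
  static String newClusterID() {
    return "CID-" + UUID.randomUUID().toString();
  }

  public static void main(String[] args) {
    System.out.println("Formatting using clusterid: " + newClusterID());
  }
}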
5.2 default
//org.apache.hadoop.hdfs.server.namenode.NameNode constructor
try {
initializeGenericKeys(conf, nsId, namenodeId);
//The key call
initialize(conf);
try {
haContext.writeLock();
state.prepareToEnterState(haContext);
state.enterState(haContext);
} finally {
haContext.writeUnlock();
}
//......
}
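state.enterState(haContext) is a textbook State pattern: each HA state encapsulates what must happen on entry. A minimal sketch of that shape (names simplified; the real classes are HAState with ActiveState and StandbyState subclasses):
public class HaStateDemo {
  interface HaState { void enterState(); }

  static class ActiveState implements HaState {
    public void enterState() { System.out.println("active: serve reads and writes"); }
  }

  static class StandbyState implements HaState {
    public void enterState() { System.out.println("standby: tail edit log, checkpoint"); }
  }

  public static void main(String[] args) {
    HaState state = new StandbyState();  // an HA NameNode starts out standby
    state.enterState();
  }
}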
//org.apache.hadoop.hdfs.server.namenode.NameNode#initialize
protected void initialize(Configuration conf) throws IOException {
//........
NameNode.initMetrics(conf, this.getRole());
StartupProgressMetrics.register(startupProgress);
if (NamenodeRole.NAMENODE == role) {
//1. Start the HttpServer2 service (default port 50070)
startHttpServer(conf);
}
this.spanReceiverHost =
SpanReceiverHost.get(conf, DFSConfigKeys.DFS_SERVER_HTRACE_PREFIX);
//2. Load the on-disk metadata into memory
loadNamesystem(conf);
//3. Create the RPC server
rpcServer = createRpcServer(conf);
if (clientNamenodeAddress == null) {
// This is expected for MiniDFSCluster. Set it now using
// the RPC server's bind address.
clientNamenodeAddress =
NetUtils.getHostPortString(rpcServer.getRpcAddress());
LOG.info("Clients are to use " + clientNamenodeAddress + " to access"
+ " this namenode/service.");
}
if (NamenodeRole.NAMENODE == role) {
httpServer.setNameNodeAddress(getNameNodeAddress());
httpServer.setFSImage(getFSImage());
}
//Monitor JVM pauses (e.g. long GC stalls)
pauseMonitor = new JvmPauseMonitor(conf);
pauseMonitor.start();
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
//4. Start the common services
startCommonServices(conf);
}
5.2.1 startHttpServer
//org.apache.hadoop.hdfs.server.namenode.NameNode#startHttpServer
private void startHttpServer(final Configuration conf) throws IOException {
httpServer = new NameNodeHttpServer(conf, this, getHttpServerBindAddress(conf));
httpServer.start();
httpServer.setStartupProgress(startupProgress);
}
//org.apache.hadoop.hdfs.server.namenode.NameNodeHttpServer#start
void start() throws IOException {
//Determine the HTTP policy (HTTP_ONLY / HTTPS_ONLY / HTTP_AND_HTTPS)
HttpConfig.Policy policy = DFSUtil.getHttpPolicy(conf);
//..........
HttpServer2.Builder builder = DFSUtil.httpServerTemplateForNNAndJN(conf,
httpAddr, httpsAddr, "hdfs",
DFSConfigKeys.DFS_NAMENODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY,
DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY);
httpServer = builder.build();
if (policy.isHttpsEnabled()) {
// assume same ssl port for all datanodes
// With Kerberos security enabled, extra entries must be added to the configuration
InetSocketAddress datanodeSslPort = NetUtils.createSocketAddr(conf.getTrimmed(
DFSConfigKeys.DFS_DATANODE_HTTPS_ADDRESS_KEY, infoHost + ":"
+ DFSConfigKeys.DFS_DATANODE_HTTPS_DEFAULT_PORT));
httpServer.setAttribute(DFSConfigKeys.DFS_DATANODE_HTTPS_PORT_KEY,
datanodeSslPort.getPort());
}
initWebHdfs(conf);
//.........
//Start the HTTP server
httpServer.start();
//..............
}
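For reference, the general shape of standing up an HttpServer2 instance looks roughly like this (a sketch against the Hadoop 2.x org.apache.hadoop.http API; the NameNode itself goes through DFSUtil.httpServerTemplateForNNAndJN as shown above):
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.http.HttpServer2;

public class HttpServerDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Bind a plain-HTTP endpoint; the NameNode web UI defaults to port 50070 in 2.x
    HttpServer2 server = new HttpServer2.Builder()
        .setName("hdfs")
        .setConf(conf)
        .addEndpoint(URI.create("http://0.0.0.0:50070"))
        .build();
    server.start();
  }
}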
5.2.2 loadNamesystem
//org.apache.hadoop.hdfs.server.namenode.FSNamesystem#loadFromDisk
//Instantiates an FSNamesystem loaded from the image and edits
static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
checkConfiguration(conf);
//Build the FSImage from the namespace and edits directories in the configuration
FSImage fsImage = new FSImage(conf,
FSNamesystem.getNamespaceDirs(conf),
FSNamesystem.getNamespaceEditsDirs(conf));
FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
StartupOption startOpt = NameNode.getStartupOption(conf);
if (startOpt == StartupOption.RECOVER) {
namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
}
long loadStart = monotonicNow();
try {
//Load the metadata into memory
//Loading happens under a lock, so the NameNode cannot serve any request during this step
namesystem.loadFSImage(startOpt);
} catch (IOException ioe) {
LOG.warn("Encountered exception loading fsimage", ioe);
fsImage.close();
throw ioe;
}
long timeTakenToLoadFSImage = monotonicNow() - loadStart;
LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
if (nnMetrics != null) {
nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
}
return namesystem;
}
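Conceptually, loadFSImage restores the most recent checkpoint and then replays the edit log segments recorded after it. A toy sketch of that idea (plain Java collections, no real fsimage/edits parsing):
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class LoadImageDemo {
  public static void main(String[] args) {
    Map<String, Long> namespace = new HashMap<>();  // path -> file size

    // 1. Load the checkpoint (fsimage)
    namespace.put("/", 0L);
    namespace.put("/data/a.txt", 128L);

    // 2. Replay the edits logged since the checkpoint, in order
    List<String[]> edits = new ArrayList<>();
    edits.add(new String[]{"ADD", "/data/b.txt", "256"});
    edits.add(new String[]{"DELETE", "/data/a.txt", ""});
    for (String[] op : edits) {
      if ("ADD".equals(op[0])) namespace.put(op[1], Long.parseLong(op[2]));
      else if ("DELETE".equals(op[0])) namespace.remove(op[1]);
    }
    System.out.println("in-memory namespace after load: " + namespace);
  }
}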
5.2.3 createRpcServer
/**
* Create the RPC server implementation. Used as an extension point for the
* BackupNode.
*/
protected NameNodeRpcServer createRpcServer(Configuration conf)
throws IOException {
return new NameNodeRpcServer(conf, this);
}
//org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer#NameNodeRpcServer
//This class is responsible for handling all of the RPC calls to the NameNode.
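NameNodeRpcServer wires the protocol implementations into org.apache.hadoop.ipc.RPC servers. A stripped-down sketch of building a Hadoop RPC server with that same RPC.Builder API (a toy protocol, not the NameNode's actual ClientProtocol wiring):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ipc.RPC;

public class RpcServerDemo {
  // Toy protocol; Hadoop's default RPC engine expects a versionID field
  public interface PingProtocol {
    long versionID = 1L;
    String ping(String msg);
  }

  static class PingImpl implements PingProtocol {
    public String ping(String msg) { return "pong: " + msg; }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    RPC.Server server = new RPC.Builder(conf)
        .setProtocol(PingProtocol.class)
        .setInstance(new PingImpl())
        .setBindAddress("0.0.0.0")
        .setPort(8020)          // the NameNode's default RPC port
        .setNumHandlers(10)     // dfs.namenode.handler.count plays this role
        .build();
    server.start();             // same pattern as rpcServer.start() later on
  }
}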
5.2.4 startCommonServices
//org.apache.hadoop.hdfs.server.namenode.NameNode#startCommonServices
private void startCommonServices(Configuration conf) throws IOException {
//The most important call here
namesystem.startCommonServices(conf, haContext);
registerNNSMXBean();
if (NamenodeRole.NAMENODE != role) {
//Start the HTTP server (only for non-NAMENODE roles such as the BackupNode)
startHttpServer(conf);
httpServer.setNameNodeAddress(getNameNodeAddress());
httpServer.setFSImage(getFSImage());
}
//Start the RPC server
rpcServer.start();
//......
}
5.2.4.1 startCommonServices
/**
* Start services common to both active and standby states
*/
void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
this.registerMBean(); // register the MBean for the FSNamesystemState
writeLock();
this.haContext = haContext;
try {
//Instantiate NameNodeResourceChecker, which checks disk resources
//On the active NameNode a monitor thread (NameNodeResourceMonitor) periodically calls NameNodeResourceChecker#hasAvailableDiskSpace() to check the available disk space
//It mainly checks the fsimage and edit log directories
nnResourceChecker = new NameNodeResourceChecker(conf);
//Check whether enough resources are available; if not, log a warning and enter safe mode
checkAvailableResources();
// Decide whether to enter safe mode and whether the replication queues should be populated
/**
 * When disk space is short, the edits produced by any metadata change cannot be
 * guaranteed to reach disk, i.e. neither new edit log records nor a new fsimage
 * can safely be written. So the NameNode enters safe mode, which forbids
 * metadata changes and thus avoids writing new log data to disk.
 */
assert safeMode != null && !isPopulatingReplQueues();
//Obtain the StartupProgress instance used to track the progress of each NameNode startup task
StartupProgress prog = NameNode.getStartupProgress();
//The NameNode now enters the SAFEMODE phase and waits for block reports
prog.beginPhase(Phase.SAFEMODE);
long completeBlocksTotal = getCompleteBlocksTotal();
prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,completeBlocksTotal);
//Record the total number of blocks, used later to decide when safe mode can be left
setBlockTotal(completeBlocksTotal);
//Activate the BlockManager
blockManager.activate(conf);
//Per the BlockManager Javadoc: "Keeps information related to the blocks stored in the Hadoop cluster."
} finally {
writeUnlock();
}
registerMXBean();
DefaultMetricsSystem.instance().register(this);
if (inodeAttributeProvider != null) {
inodeAttributeProvider.start();
dir.setINodeAttributeProvider(inodeAttributeProvider);
}
snapshotManager.registerMXBean();
}
5.2.4.2 NameNodeResourceChecker
public NameNodeResourceChecker(Configuration conf) throws IOException {
this.conf = conf;
//Map of volumes (storage directories) that need to be checked
volumes = new HashMap<String, CheckedVolume>();
//Minimum free space tolerated per volume, default 100 MB (dfs.namenode.resource.du.reserved)
duReserved = conf.getLong(DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_KEY,
DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_DEFAULT);
//Extra volumes to check, from dfs.namenode.resource.checked.volumes
Collection<URI> extraCheckedVolumes = Util.stringCollectionAsURIs(conf
.getTrimmedStringCollection(DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_KEY));
//Keep only the edits directories with the local file:// scheme; remote shared directories are filtered out
Collection<URI> localEditDirs = Collections2.filter(
FSNamesystem.getNamespaceEditsDirs(conf),
new Predicate<URI>() {
@Override
public boolean apply(URI input) {
if (input.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) {
return true;
}
return false;
}
});
// Add all the local edits dirs, marking some as required if they are
// configured as such.
for (URI editsDirToCheck : localEditDirs) {
addDirToCheck(editsDirToCheck,
FSNamesystem.getRequiredNamespaceEditsDirs(conf).contains(
editsDirToCheck));
}
// All extra checked volumes are marked "required"
for (URI extraDirToCheck : extraCheckedVolumes) {
addDirToCheck(extraDirToCheck, true);
}
minimumRedundantVolumes = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_KEY,
DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_DEFAULT);
}
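Each CheckedVolume ultimately compares the volume's free space against duReserved (the real implementation samples the volume through org.apache.hadoop.fs.DF). A simplified, self-contained version of the check:
import java.io.File;

public class DiskCheckDemo {
  // Default dfs.namenode.resource.du.reserved: 100 MB
  static final long DU_RESERVED = 100L * 1024 * 1024;

  // Simplified stand-in for NameNodeResourceChecker#hasAvailableDiskSpace()
  static boolean hasAvailableDiskSpace(File volume) {
    return volume.getUsableSpace() > DU_RESERVED;
  }

  public static void main(String[] args) {
    File nameDir = new File("/tmp");  // pretend this is a dfs.namenode.name.dir volume
    System.out.println("resources available: " + hasAvailableDiskSpace(nameDir));
  }
}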
//So where is the NameNodeResourceMonitor mentioned earlier actually used?
//org.apache.hadoop.hdfs.server.namenode.FSNamesystem#startActiveServices
//(at roughly line 1130)
//ResourceMonitor required only at ActiveNN. See HDFS-2914
this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
nnrmthread.start();
//Start a daemon thread that monitors continuously
//Once disk space runs low, safe mode is entered
public void run () {
try {
while (fsRunning && shouldNNRmRun) {
checkAvailableResources();
if(!nameNodeHasResourcesAvailable()) {
String lowResourcesMsg = "NameNode low on available disk space. ";
if (!isInSafeMode()) {
LOG.warn(lowResourcesMsg + "Entering safe mode.");
} else {
LOG.warn(lowResourcesMsg + "Already in safe mode.");
}
enterSafeMode(true);
}
try {
Thread.sleep(resourceRecheckInterval);
} catch (InterruptedException ie) {
// Deliberately ignore
}
}
} catch (Exception e) {
FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
}
}
5.2.4.3 checkAvailableResources
//org.apache.hadoop.hdfs.server.namenode.FSNamesystem#checkAvailableResources
void checkAvailableResources() {
Preconditions.checkState(nnResourceChecker != null,
"nnResourceChecker not initialized");
hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
}
//The NameNode actively checks once during initialization; after startup the NameNodeResourceMonitor thread keeps checking, every 5 seconds by default (dfs.namenode.resource.check.interval)
5.2.4.4 getStartupProgress
StartupProgress prog = NameNode.getStartupProgress();
//getStartupProgress exposes the NameNode's startup state. After startup the NameNode first enters safe mode and waits for block reports; only when the reported blocks reach the minimum threshold does it leave safe mode
5.2.4.5 getCompleteBlocksTotal
long completeBlocksTotal = getCompleteBlocksTotal();
prog.setTotal(Phase.SAFEMODE,STEP_AWAITING_REPORTED_BLOCKS,completeBlocksTotal);
setBlockTotal(completeBlocksTotal);
//The namenode records the block total via setBlockTotal(). Where does that total come from?
//From getCompleteBlocksTotal()
/**
* Get the total number of COMPLETE blocks in the system.
* For safe mode only complete blocks are counted.
* This is invoked only during NN startup and checkpointing.
*/
public long getCompleteBlocksTotal() {
// Calculate number of blocks under construction
long numUCBlocks = 0;
readLock();
try {
//In HDFS a block can be in one of four states:
//1. UnderConstruction: currently being written
//2. UnderRecovery: being recovered after a write failure
//3. Committed: the client has finished writing, but the minimum number of replicas has not reported yet
//4. Complete: fully written and reported
//At startup the namenode only counts blocks in the COMPLETE state
numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
return getBlocksTotal() - numUCBlocks;
} finally {
readUnlock();
}
}
/**
* This method iterates through all the leases and counts the number of blocks
* which are not COMPLETE. The FSNamesystem read lock MUST be held before
* calling this method.
*/
//Iterate over every file under lease and count its incomplete blocks
synchronized long getNumUnderConstructionBlocks() {
assert this.fsnamesystem.hasReadLock() : "The FSNamesystem read lock wasn't"
+ "acquired before counting under construction blocks";
long numUCBlocks = 0;
for (Long id : getINodeIdWithLeases()) {
final INodeFile cons = fsnamesystem.getFSDirectory().getInode(id).asFile();
Preconditions.checkState(cons.isUnderConstruction());
BlockInfo[] blocks = cons.getBlocks();
if(blocks == null) {
continue;
}
for(BlockInfo b : blocks) {
if(!b.isComplete())
numUCBlocks++;
}
}
LOG.info("Number of blocks under construction: " + numUCBlocks);
return numUCBlocks;
}
5.2.4.6 setBlockTotal
setBlockTotal(completeBlocksTotal);
public void setBlockTotal(long completeBlocksTotal) {
// safeMode is volatile, and may be set to null at any time
SafeModeInfo safeMode = this.safeMode;
if (safeMode == null)
return;
safeMode.setBlockTotal((int) completeBlocksTotal);
}
/**
* Set total number of blocks.
*/
private synchronized void setBlockTotal(int total) {
//Total number of complete blocks in the namespace
this.blockTotal = total;
//Compute the safe-mode threshold, e.g. 1000 * 0.999f = 999
this.blockThreshold = (int) (blockTotal * threshold);
this.blockReplQueueThreshold =
(int) (blockTotal * replQueueThreshold);
if (haEnabled) {
// After we initialize the block count, any further namespace
// modifications done while in safe mode need to keep track
// of the number of total blocks in the system.
this.shouldIncrementallyTrackBlocks = true;
}
if(blockSafe < 0)
this.blockSafe = 0;
//Use the reported block counts to decide whether safe mode is needed (with the numbers above, blockSafe >= 999 lets the NameNode leave safe mode)
//checkMode() examines the safe-mode state:
//1. needEnter() checks whether the thresholds require entering safe mode
//For leaving safe mode there are two checks:
//1. whether the thresholds are satisfied for leaving
//2. the SafeModeMonitor thread re-checks every second whether safe mode can be left
checkMode();
}
/**
* Check and trigger safe mode if needed.
*/
private void checkMode() {
//The caller must hold the write lock (leaving safe mode initializes the replication queues)
assert hasWriteLock();
//Skip the check while the NameNode is transitioning to active
if (inTransitionToActive()) {
return;
}
//Enter safe mode if needEnter() says so
if (smmthread == null && needEnter()) {
enter();
//..................
}
//.........................
}
/**
* There is no need to enter safe mode
* if DFS is empty or {@link #threshold} == 0
*/
private boolean needEnter() {
return (threshold != 0 && blockSafe < blockThreshold) ||
(datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
(!nameNodeHasResourcesAvailable());
}
//The conditions under which safe mode is entered (see the sketch below):
//1. the number of reported safe blocks is below the block threshold (e.g. blockTotal = 1000, blockThreshold = 1000 * 0.999f = 999)
//2. the number of live datanodes is below the datanodeThreshold (default 0, i.e. this check is disabled)
//3. the NameNode metadata disks have less than 100 MB of free space
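Plugging concrete numbers into needEnter() makes these conditions easy to verify; a small sketch (field names mirror the code above, values are illustrative):
public class SafeModeMathDemo {
  public static void main(String[] args) {
    long blockTotal = 1000;                 // complete blocks at startup
    float threshold = 0.999f;               // dfs.namenode.safemode.threshold-pct
    int blockThreshold = (int) (blockTotal * threshold);  // = 999

    long blockSafe = 998;                   // blocks reported so far
    int datanodeThreshold = 0;              // dfs.namenode.safemode.min.datanodes
    int liveDataNodes = 3;
    boolean hasResources = true;            // nameNodeHasResourcesAvailable()

    boolean needEnter =
        (threshold != 0 && blockSafe < blockThreshold) ||
        (datanodeThreshold != 0 && liveDataNodes < datanodeThreshold) ||
        !hasResources;
    System.out.println("needEnter = " + needEnter);  // true: 998 < 999
  }
}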
//That concludes the full NameNode startup flow