
  • 总体流程
  • Demo


hdfs回复初始化 hdfs初始化失败_HDFS


public class FileTest {
	public static void main(String[] args) throws IOException {
		Configuration configuration=new Configuration();
		FileSystem fileSystem=FileSystem.newInstance(configuration);
		//TODO 创建目录(分析的是元数据的管理流程)
		fileSystem.mkdirs(new Path(""));
		 * TODO 接下来分析HDFS上传文件的流程
		 * TODO 做一些重要的初始化工作
		FSDataOutputStream fsous=fileSystem.create(new Path("/user.txt"));		
		//TODO 完成上传文件的流程
		 * master:
		 *     启动
		 * worker:
		 *     启动
		 *   wordcount

/**
 * Opens an output stream for the given path by delegating to the
 * two-argument overload, passing {@code true} as the second argument.
 *
 * NOTE(review): the second argument is presumably the "overwrite" flag —
 * confirm against the overload's declaration, which is not shown here.
 * NOTE(review): the closing brace of this method is missing from this excerpt.
 *
 * @param f path of the file to create
 * @return the stream returned by {@code create(f, true)}
 * @throws IOException propagated from the delegated call
 */
public FSDataOutputStream create(Path f) throws IOException {
    // Important step in the walkthrough: delegate to create(f, true).
    return create(f, true);


public FSDataOutputStream create(final Path f, final FsPermission permission,
    final EnumSet<CreateFlag> cflags, final int bufferSize,
    final short replication, final long blockSize, final Progressable progress,
    final ChecksumOpt checksumOpt) throws IOException {
    Path absF = fixRelativePart(f);
    return new FileSystemLinkResolver<FSDataOutputStream>() {
      public FSDataOutputStream doCall(final Path p)
          throws IOException, UnresolvedLinkException {
    	  //TODO 创建了一个DFSOutputStream,做了很多初始化操作
         *  * 1) 往文件目录树里面添加了INodeFile
         *  * 2) 添加了契约管理
         *  * 3) 启动了DataStreamer(写数据流程的关键服务)
        final DFSOutputStream dfsos = dfs.create(getPathName(p), permission,
                cflags, replication, blockSize, progress, bufferSize,
        //TODO FSDataOutputStream 是DFSOutputStream 进行了再一次的封装。【装饰模式】
        return dfs.createWrappedOutputStream(dfsos, statistics);
      public FSDataOutputStream next(final FileSystem fs, final Path p)
          throws IOException {
        return fs.create(p, permission, cflags, bufferSize,
            replication, blockSize, progress, checksumOpt);
    }.resolve(this, absF);
客户端写文件时,先向 NameNode 申请契约(Lease);持有契约的客户端在规定时间内拥有该文件的写权限,关闭文件时契约被回收。
public DFSOutputStream create(String src, FsPermission permission, EnumSet<CreateFlag> flag, boolean createParent,
			short replication, long blockSize, Progressable progress, int buffersize, ChecksumOpt checksumOpt,
			InetSocketAddress[] favoredNodes) throws IOException {
		if (permission == null) {
			permission = FsPermission.getFileDefault();
		FsPermission masked = permission.applyUMask(dfsClientConf.uMask);
		if (LOG.isDebugEnabled()) {
			LOG.debug(src + ": masked=" + masked);
		 * 总结:
		 * 1) 往文件目录树里面添加了文件
		 * 2) 添加了契约管理
		 * 3) 启动了DataStreamer
		final DFSOutputStream result = DFSOutputStream.newStreamForCreate(this, src, masked, flag, createParent,
				replication, blockSize, progress, buffersize, dfsClientConf.createChecksum(checksumOpt),
		//TODO 开启契约
		beginFileLease(result.getFileId(), result);
		return result;
static DFSOutputStream newStreamForCreate(DFSClient dfsClient, String src, FsPermission masked,
			EnumSet<CreateFlag> flag, boolean createParent, short replication, long blockSize, Progressable progress,
			int buffersize, DataChecksum checksum, String[] favoredNodes) throws IOException {
		TraceScope scope = dfsClient.getPathTraceScope("newStreamForCreate", src);
		try {
			HdfsFileStatus stat = null;

			// Retry the create if we get a RetryStartFileException up to a maximum
			// number of times
			boolean shouldRetry = true;
			int retryCount = CREATE_RETRY_COUNT;
			//TODO 重试的代码结构
			while (shouldRetry) {
				shouldRetry = false;
				try {
					 * HDFS原理总结:
					 * 创建目录:就是在 目录树(元数据)上面添加一个子Node (INodeDirectory)
					 * 上传文件:
					 *     1)在目录树里面添加一个子Node(INodeFile)
					 *     2)再往文件里面写数据
						 TODO 往目录树里添加InodeFile,记录元数据日志和添加契约
					stat = dfsClient.namenode.create(src, masked, dfsClient.clientName,
							new EnumSetWritable<CreateFlag>(flag), createParent, replication, blockSize,
				} catch (RemoteException re) {
					IOException e = re.unwrapRemoteException(AccessControlException.class,
							DSQuotaExceededException.class, FileAlreadyExistsException.class,
							FileNotFoundException.class, ParentNotDirectoryException.class,
							NSQuotaExceededException.class, RetryStartFileException.class, SafeModeException.class,
							UnresolvedPathException.class, SnapshotAccessControlException.class,
					if (e instanceof RetryStartFileException) {
						//TODO 重试
						if (retryCount > 0) {
							shouldRetry = true;
						} else {
							throw new IOException("Too many retries because of encryption" + " zone operations", e);
					} else {
						throw e;
			Preconditions.checkNotNull(stat, "HdfsFileStatus should not be null!");

			//TODO 里面初始化了DataStreamer,DataStreamer是写数据流程里面重要的对象
			final DFSOutputStream out = new DFSOutputStream(dfsClient, src, stat, flag, progress, checksum,
			//TODO 里面启动了DataStreamer
			return out;
		} finally {


/**
 * Handles a create request for {@code src}: logs the request at debug
 * level, rejects paths that fail {@code checkPathLength}, returns the cached
 * payload when the retry cache already holds a successful entry, and
 * otherwise calls {@code namesystem.startFile} and records the outcome in
 * the retry cache.
 *
 * NOTE(review): closing braces (and possibly other lines) are missing from
 * this excerpt; confirm the exact control flow against the upstream source.
 *
 * @return the status returned by {@code startFile}, or the cached payload
 * @throws IOException if the path is too long, or propagated from startFile
 */
public HdfsFileStatus create(String src, FsPermission masked,
      String clientName, EnumSetWritable<CreateFlag> flag,
      boolean createParent, short replication, long blockSize, 
      CryptoProtocolVersion[] supportedVersions)
      throws IOException {
    String clientMachine = getClientMachine();
    if (stateChangeLog.isDebugEnabled()) {
      stateChangeLog.debug("*DIR* NameNode.create: file "
          +src+" for "+clientName+" at "+clientMachine);
    if (!checkPathLength(src)) {
      throw new IOException("create: Pathname too long.  Limit "
          + MAX_PATH_LENGTH + " characters, " + MAX_PATH_DEPTH + " levels.");

    // A successful cached entry means this call was already served; reuse its payload.
    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache, null);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return (HdfsFileStatus) cacheEntry.getPayload();

    HdfsFileStatus status = null;
    try {
      PermissionStatus perm = new PermissionStatus(getRemoteUser()
          .getShortUserName(), null, masked);
      // Core of file creation: hand the request over to the namesystem.
      status = namesystem.startFile(src, perm, clientName, clientMachine,
          flag.get(), createParent, replication, blockSize, supportedVersions,
          cacheEntry != null);
    } finally {
      // Record success (status != null) or failure in the retry cache either way.
      RetryCache.setState(cacheEntry, status != null, status);
    return status;
/**
 * Thin wrapper around {@code startFileInt}: forwards the arguments and, if
 * an {@code AccessControlException} is thrown, logs a failed "create" audit
 * event for {@code src} before rethrowing it.
 *
 * NOTE(review): the tail of the {@code startFileInt} argument list and the
 * closing braces are missing from this excerpt.
 *
 * @return the status produced by {@code startFileInt}
 */
HdfsFileStatus startFile(String src, PermissionStatus permissions,
      String holder, String clientMachine, EnumSet<CreateFlag> flag,
      boolean createParent, short replication, long blockSize, 
      CryptoProtocolVersion[] supportedVersions, boolean logRetryCache)
      throws AccessControlException, SafeModeException,
      FileAlreadyExistsException, UnresolvedLinkException,
      FileNotFoundException, ParentNotDirectoryException, IOException {

    HdfsFileStatus status = null;
    try {
    	// Important: the work is delegated to startFileInt.
      status = startFileInt(src, permissions, holder, clientMachine, flag,
          createParent, replication, blockSize, supportedVersions,
    } catch (AccessControlException e) {
      // Audit the denied attempt (first argument false = not successful), then rethrow.
      logAuditEvent(false, "create", src);
      throw e;
    return status;
private HdfsFileStatus startFileInt(final String srcArg,
      PermissionStatus permissions, String holder, String clientMachine,
      EnumSet<CreateFlag> flag, boolean createParent, short replication,
      long blockSize, CryptoProtocolVersion[] supportedVersions,
      boolean logRetryCache)
      throws AccessControlException, SafeModeException,
      FileAlreadyExistsException, UnresolvedLinkException,
      FileNotFoundException, ParentNotDirectoryException, IOException {
    String src = srcArg;
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      StringBuilder builder = new StringBuilder();
      builder.append("DIR* NameSystem.startFile: src=" + src
              + ", holder=" + holder
              + ", clientMachine=" + clientMachine
              + ", createParent=" + createParent
              + ", replication=" + replication
              + ", createFlag=" + flag.toString()
              + ", blockSize=" + blockSize);
      builder.append(", supportedVersions=");
      if (supportedVersions != null) {
      } else {
    if (!DFSUtil.isValidName(src)) {
      throw new InvalidPathException(src);
    blockManager.verifyReplication(src, replication, clientMachine);

    boolean skipSync = false;
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    if (blockSize < minBlockSize) {
      throw new IOException("Specified block size is less than configured" +
          " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
          + "): " + blockSize + " < " + minBlockSize);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    boolean create = flag.contains(CreateFlag.CREATE);
    boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
    boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST);

     * If the file is in an encryption zone, we optimistically create an
     * EDEK for the file by calling out to the configured KeyProvider.
     * Since this typically involves doing an RPC, we take the readLock
     * initially, then drop it to do the RPC.
     * Since the path can flip-flop between being in an encryption zone and not
     * in the meantime, we need to recheck the preconditions when we retake the
     * lock to do the create. If the preconditions are not met, we throw a
     * special RetryStartFileException to ask the DFSClient to try the create
     * again later.
    CryptoProtocolVersion protocolVersion = null;
    CipherSuite suite = null;
    String ezKeyName = null;
    EncryptedKeyVersion edek = null;

    if (provider != null) {
      try {
        src = dir.resolvePath(pc, src, pathComponents);
        INodesInPath iip = dir.getINodesInPath4Write(src);
        // Nothing to do if the path is not within an EZ
        final EncryptionZone zone = dir.getEZForPath(iip);
        if (zone != null) {
          protocolVersion = chooseProtocolVersion(zone, supportedVersions);
          suite = zone.getSuite();
          ezKeyName = zone.getKeyName();

              "Chose an UNKNOWN CipherSuite!");
      } finally {

          (suite == null && ezKeyName == null) ||
              (suite != null && ezKeyName != null),
          "Both suite and ezKeyName should both be null or not null");

      // Generate EDEK if necessary while not holding the lock
      edek = generateEncryptedDataEncryptionKey(ezKeyName);

    // Proceed with the create, using the computed cipher suite and 
    // generated EDEK
    BlocksMapUpdateInfo toRemoveBlocks = null;
    try {
      checkNameNodeSafeMode("Cannot create file" + src);
      try {
        src = dir.resolvePath(pc, src, pathComponents);
        final INodesInPath iip = dir.getINodesInPath4Write(src);
        //TODO 重要的代码
        toRemoveBlocks = startFileInternal(
            pc, iip, permissions, holder,
            clientMachine, create, overwrite,
            createParent, replication, blockSize,
            isLazyPersist, suite, protocolVersion, edek,
        stat = FSDirStatAndListingOp.getFileInfo(
            dir, src, false, FSDirectory.isReservedRawName(srcArg), true);
      } finally {
    } catch (StandbyException se) {
      skipSync = true;
      throw se;
    } finally {
      // There might be transactions logged while trying to recover the lease.
      // They need to be sync'ed even when an exception was thrown.
      if (!skipSync) {
        if (toRemoveBlocks != null) {

    logAuditEvent(true, "create", srcArg, null, stat);
    return stat;
private BlocksMapUpdateInfo startFileInternal(FSPermissionChecker pc, 
      INodesInPath iip, PermissionStatus permissions, String holder,
      String clientMachine, boolean create, boolean overwrite, 
      boolean createParent, short replication, long blockSize, 
      boolean isLazyPersist, CipherSuite suite, CryptoProtocolVersion version,
      EncryptedKeyVersion edek, boolean logRetryEntry)
      throws IOException {
    assert hasWriteLock();
    // Verify that the destination does not exist as a directory already.
    final INode inode = iip.getLastINode();
    final String src = iip.getPath();
    if (inode != null && inode.isDirectory()) {
      throw new FileAlreadyExistsException(src +
          " already exists as a directory");

    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
    if (isPermissionEnabled) {
      if (overwrite && myFile != null) {
        dir.checkPathAccess(pc, iip, FsAction.WRITE);
       * To overwrite existing file, need to check 'w' permission 
       * of parent (equals to ancestor in this case)
      dir.checkAncestorAccess(pc, iip, FsAction.WRITE);
    if (!createParent) {
      dir.verifyParentDir(iip, src);

    FileEncryptionInfo feInfo = null;

    final EncryptionZone zone = dir.getEZForPath(iip);
    if (zone != null) {
      // The path is now within an EZ, but we're missing encryption parameters
      if (suite == null || edek == null) {
        throw new RetryStartFileException();
      // Path is within an EZ and we have provided encryption parameters.
      // Make sure that the generated EDEK matches the settings of the EZ.
      final String ezKeyName = zone.getKeyName();
      if (!ezKeyName.equals(edek.getEncryptionKeyName())) {
        throw new RetryStartFileException();
      feInfo = new FileEncryptionInfo(suite, version,
          ezKeyName, edek.getEncryptionKeyVersionName());

    try {
      BlocksMapUpdateInfo toRemoveBlocks = null;
      if (myFile == null) {
        if (!create) {
          throw new FileNotFoundException("Can't overwrite non-existent " +
              src + " for client " + clientMachine);
      } else {
        if (overwrite) {
          toRemoveBlocks = new BlocksMapUpdateInfo();
          List<INode> toRemoveINodes = new ChunkedArrayList<INode>();
          long ret = FSDirDeleteOp.delete(dir, iip, toRemoveBlocks,
                                          toRemoveINodes, now());
          if (ret >= 0) {
            iip = INodesInPath.replace(iip, iip.length() - 1, null);
            removeLeasesAndINodes(src, toRemoveINodes, true);
        } else {
          // If lease soft limit time is expired, recover the lease
              iip, src, holder, clientMachine, false);
          throw new FileAlreadyExistsException(src + " for client " +
              clientMachine + " already exists");

      INodeFile newNode = null;

      // Always do an implicit mkdirs for parent directory tree.
      //hadoop fs -put a.txt /usr/hive/warehouse/data/a.txt
      ///usr/hive/warehouse/data/ parent目录
      Map.Entry<INodesInPath, String> parent = FSDirMkdirOp
          .createAncestorDirectories(dir, iip, permissions);
      if (parent != null) {
    	  //TODO 往文件目录树里面添加INodeFile节点
          // dir就是FSDirectory(管理目录树)
        iip = dir.addFile(parent.getKey(), parent.getValue(), permissions,
            replication, blockSize, holder, clientMachine);
        newNode = iip != null ? iip.getLastINode().asFile() : null;

      if (newNode == null) {
        throw new IOException("Unable to add " + src +  " to namespace");
      //TODO 添加契约
          .getClientName(), src);

      // Set encryption attributes if necessary
      if (feInfo != null) {
        dir.setFileEncryptionInfo(src, feInfo);
        newNode = dir.getInode(newNode.getId()).asFile();

      setNewINodeStoragePolicy(newNode, iip, isLazyPersist);

      // record file record in log, record new generation stamp
      //TODO 把元数据同步到磁盘,具体细节看元数据管理流程
      getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry);
      if (NameNode.stateChangeLog.isDebugEnabled()) {
        NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added " +
            src + " inode " + newNode.getId() + " " + holder);
      return toRemoveBlocks;
    } catch (IOException ie) {
      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " +
      throw ie;


/**
 * Bookkeeping for leases. The fields shown here index the same
 * {@code Lease} objects three ways: by holder name ({@code leases}), as a
 * sorted set ({@code sortedLeases}), and by path ({@code sortedLeasesByPath}).
 *
 * NOTE(review): only the field declarations are shown in this excerpt; the
 * constructor and the class's closing brace are not visible.
 */
public class LeaseManager {
  public static final Log LOG = LogFactory.getLog(LeaseManager.class);

  private final FSNamesystem fsnamesystem;

  // Soft/hard lease limits, initialised from the HdfsConstants defaults.
  private long softLimit = HdfsConstants.LEASE_SOFTLIMIT_PERIOD;
  private long hardLimit = HdfsConstants.LEASE_HARDLIMIT_PERIOD;

  // Used for handling lock-leases
  // Mapping: leaseHolder -> Lease
  private final SortedMap<String, Lease> leases = new TreeMap<String, Lease>();
  // Set of: Lease
  private final NavigableSet<Lease> sortedLeases = new TreeSet<Lease>();

  // Map path names to leases. It is protected by the sortedLeases lock.
  // The map stores pathnames in lexicographical order.
  // TreeMap is backed by a red-black tree, which is what keeps the entries sorted.
  private final SortedMap<String, Lease> sortedLeasesByPath = new TreeMap<String, Lease>();
/**
 * Returns the lease held by {@code holder}, creating and registering a new
 * one in {@code leases} when none exists, and records that lease under
 * {@code src} in {@code sortedLeasesByPath}.
 *
 * NOTE(review): the body of the {@code else} branch (the renewal of an
 * existing lease, per the original comment) and the closing braces are
 * missing from this excerpt; confirm against the upstream source.
 *
 * @param holder name of the lease holder
 * @param src    path the lease is being taken for
 * @return the new or existing lease for {@code holder}
 */
synchronized Lease addLease(String holder, String src) {
	 // First check whether this holder already has a lease.
    Lease lease = getLease(holder);
    if (lease == null) {
    	// No lease yet: create one.
      lease = new Lease(holder);
     // Store it in the (sortable) holder -> lease map.
      leases.put(holder, lease);
    } else {
      // The holder already has a lease, so this call amounts to a renewal.
    sortedLeasesByPath.put(src, lease);
    return lease;


synchronized void renewLease(String holder) {
  synchronized void renewLease(Lease lease) {
    if (lease != null) {


public void run() {
      //for while
      for(; shouldRunMonitor && fsnamesystem.isRunning(); ) {
        boolean needSync = false;
        try {
          try {
            if (!fsnamesystem.isInSafeMode()) {
              needSync = checkLeases();
          } finally {
            // lease reassignments should to be sync'ed.
            if (needSync) {
        } catch(InterruptedException ie) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(name + " is interrupted", ie);


/**
 * Walks the leases in {@code sortedLeases} order, starting from the first
 * element, and for each lease that has passed its hard limit tries to
 * release every path of that lease via
 * {@code fsnamesystem.internalReleaseLease}.
 *
 * NOTE(review): several lines are missing from this excerpt (closing
 * braces, the statement guarded by the hard-limit test, the tail of the
 * getINodesInPath call). As shown, {@code leasePaths} is allocated but
 * never filled from {@code leaseToCheck.getPaths()}, and nothing is ever
 * added to {@code removing} — confirm against the upstream source.
 *
 * @return the {@code needSync} flag, which is set once a release attempt
 *         reports that it did not complete
 */
synchronized boolean checkLeases() {
    boolean needSync = false;
    assert fsnamesystem.hasWriteLock();
    Lease leaseToCheck = null;
    try {
      leaseToCheck = sortedLeases.first();
      // first() throws on an empty set; in that case there is nothing to check.
    } catch(NoSuchElementException e) {}

    while(leaseToCheck != null) {
    	// Has the oldest remaining lease passed its hard limit?
      if (!leaseToCheck.expiredHardLimit()) {

      LOG.info(leaseToCheck + " has expired hard limit");

      final List<String> removing = new ArrayList<String>();
      // need to create a copy of the oldest lease paths, because 
      // internalReleaseLease() removes paths corresponding to empty files,
      // i.e. it needs to modify the collection being iterated over
      // causing ConcurrentModificationException
      String[] leasePaths = new String[leaseToCheck.getPaths().size()];
      for(String p : leasePaths) {
        try {
          INodesInPath iip = fsnamesystem.getFSDirectory().getINodesInPath(p,
          boolean completed = fsnamesystem.internalReleaseLease(leaseToCheck, p,
              iip, HdfsServerConstants.NAMENODE_LEASE_HOLDER);
          if (LOG.isDebugEnabled()) {
            if (completed) {
              LOG.debug("Lease recovery for " + p + " is complete. File closed.");
            } else {
              LOG.debug("Started block recovery " + p + " lease " + leaseToCheck);
          // If a lease recovery happened, we need to sync later.
          if (!needSync && !completed) {
            needSync = true;
        } catch (IOException e) {
          LOG.error("Cannot release the path " + p + " in the lease "
              + leaseToCheck, e);

      for(String p : removing) {
        removeLease(leaseToCheck, p);
      // Move on to the next-oldest lease (second oldest, then third, fourth, ...).
      leaseToCheck = sortedLeases.higher(leaseToCheck);

    try {
      // If the loop stopped on a lease other than the current first one, warn
      // that the first lease could not be released.
      if(leaseToCheck != sortedLeases.first()) {
        LOG.warn("Unable to release hard-limit expired lease: "
          + sortedLeases.first());
    } catch(NoSuchElementException e) {}
    return needSync;


//TODO 里面初始化了DataStreamer,DataStreamer是写数据流程里面重要的对象
			final DFSOutputStream out = new DFSOutputStream(dfsClient, src, stat, flag, progress, checksum,
			//TODO 里面启动了DataStreamer
private DFSOutputStream(DFSClient dfsClient, String src, HdfsFileStatus stat, EnumSet<CreateFlag> flag,
			Progressable progress, DataChecksum checksum, String[] favoredNodes) throws IOException {
		this(dfsClient, src, progress, stat, checksum);
		this.shouldSyncBlock = flag.contains(CreateFlag.SYNC_BLOCK);
         * TODO File -> Block -> packet -> chunk
         * TODO chunk  512 byte
         * TODO checksum(每个chunk的校验和)  4 byte
         * TODO packet: 65536 byte
		computePacketChunkSize(dfsClient.getConf().writePacketSize, bytesPerChecksum);

        //TODO 类注释
	   // 每个packet 都是有序列号的,当这个服务器接收到一个完整的block(packet)时候,就会返回响应
		//TODO 创建了DataStreamer(可以去看一下这类的注释)
		streamer = new DataStreamer(stat, null);
		if (favoredNodes != null && favoredNodes.length != 0) {
public void run() {
			long lastPacket = Time.monotonicNow();
			TraceScope scope = NullScope.INSTANCE;
			while (!streamerClosed && dfsClient.clientRunning) {
				// if the Responder encountered an error, shutdown Responder
				//hasError= true
				if (hasError && response != null) {
					try {
						response = null;
					} catch (InterruptedException e) {
						DFSClient.LOG.warn("Caught exception ", e);

				DFSPacket one;
				try {
					// process datanode IO errors if any
					boolean doSleep = false;
					// true          ()
					if (hasError && (errorIndex >= 0 || restartingNodeIndex.get() >= 0)) {
						doSleep = processDatanodeError();

					synchronized (dataQueue) {
						// wait for a packet to be sent.
						long now = Time.monotonicNow();
						//TODO 第一次进来的时候,因为没有数据所以代码走的是这儿
						// dataQueue.size() == 0,把数据写到dataQueue队列
						while ((!streamerClosed && !hasError && dfsClient.clientRunning && dataQueue.size() == 0
								&& (stage != BlockConstructionStage.DATA_STREAMING
										|| stage == BlockConstructionStage.DATA_STREAMING
												&& now - lastPacket < dfsClient.getConf().socketTimeout / 2))
								|| doSleep) {
							long timeout = dfsClient.getConf().socketTimeout / 2 - (now - lastPacket);
							timeout = timeout <= 0 ? 1000 : timeout;
							timeout = (stage == BlockConstructionStage.DATA_STREAMING) ? timeout : 1000;
							try {
								//TODO 如果dataQueue里面没有数据,代码就会阻塞在这儿。
							} catch (InterruptedException e) {
								DFSClient.LOG.warn("Caught exception ", e);
							doSleep = false;
							now = Time.monotonicNow();
						if (streamerClosed || hasError || !dfsClient.clientRunning) {
						// get packet to be sent.
						if (dataQueue.isEmpty()) {
							one = createHeartbeatPacket();
							assert one != null;
						} else {
							//TODO 往队列里面取出来packet
							one = dataQueue.getFirst(); // regular data packet
							long parents[] = one.getTraceParents();
							if (parents.length > 0) {
								scope = Trace.startSpan("dataStreamer", new TraceInfo(0, parents[0]));
								// TODO: use setParents API once it's available from HTrace 3.2
								// scope = Trace.startSpan("dataStreamer", Sampler.ALWAYS);
								// scope.getSpan().setParents(parents);

					// get new block from namenode.
					 * 建立数据管道
					 * 向NameNode申请Block
					if (stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) {
						if (DFSClient.LOG.isDebugEnabled()) {
							DFSClient.LOG.debug("Allocating new block");
						//TODO 步骤一:建立数据管道
						 * nextBlockOutputStream 这个方法里面完成了两个事:
						 * 1)向Namenode申请block
						 * 2) 建立数据管道
						//TODO 步骤二:启动了ResponseProcessor 用来监听我们一个packet发送是否成功
					} else if (stage == BlockConstructionStage.PIPELINE_SETUP_APPEND) {
						if (DFSClient.LOG.isDebugEnabled()) {
							DFSClient.LOG.debug("Append to block " + block);

					long lastByteOffsetInBlock = one.getLastByteOffsetBlock();
					if (lastByteOffsetInBlock > blockSize) {
						throw new IOException("BlockSize " + blockSize + " is smaller than data size. "
								+ " Offset of packet in block " + lastByteOffsetInBlock + " Aborting file " + src);

					if (one.isLastPacketInBlock()) {
						// wait for all data packets have been successfully acked
						synchronized (dataQueue) {
							while (!streamerClosed && !hasError && ackQueue.size() != 0 && dfsClient.clientRunning) {
								try {
									// wait for acks to arrive from datanodes
								} catch (InterruptedException e) {
									DFSClient.LOG.warn("Caught exception ", e);
						if (streamerClosed || hasError || !dfsClient.clientRunning) {
						stage = BlockConstructionStage.PIPELINE_CLOSE;

					// send the packet
					Span span = null;
					synchronized (dataQueue) {
						// move packet from dataQueue to ackQueue
						if (!one.isHeartbeatPacket()) {
							span = scope.detach();
							//TODO 步骤三:从dataQueue把要发送的这个packet移除出去
							//TODO 步骤四:然后往ackQueue里面添加这个packet

					if (DFSClient.LOG.isDebugEnabled()) {
						DFSClient.LOG.debug("DataStreamer block " + block + " sending packet " + one);

					// write out data to remote datanode
					TraceScope writeScope = Trace.startSpan("writeTo", span);
					try {
					} catch (IOException e) {
						// HDFS-3398 treat primary DN is down since client is unable to
						// write to primary DN. If a failed or restarting node has already
						// been recorded by the responder, the following call will have no
						// effect. Pipeline recovery can handle only one node error at a
						// time. If the primary node fails again during the recovery, it
						// will be taken out then.
						//PrimaryDatanode 指的是数据管道第一個datanode
						throw e;
					} finally {
					lastPacket = Time.monotonicNow();

					// update bytesSent
					long tmpBytesSent = one.getLastByteOffsetBlock();
					if (bytesSent < tmpBytesSent) {
						bytesSent = tmpBytesSent;

					if (streamerClosed || hasError || !dfsClient.clientRunning) {

					// Is this block full?
					if (one.isLastPacketInBlock()) {
						// wait for the close packet has been acked
						synchronized (dataQueue) {
							while (!streamerClosed && !hasError && ackQueue.size() != 0 && dfsClient.clientRunning) {
								dataQueue.wait(1000);// wait for acks to arrive from datanodes
						if (streamerClosed || hasError || !dfsClient.clientRunning) {

					if (progress != null) {

					// This is used by unit test to trigger race conditions.
					if (artificialSlowdown != 0 && dfsClient.clientRunning) {
				} catch (Throwable e) {
					// Log warning if there was a real error.
					if (restartingNodeIndex.get() == -1) {
						DFSClient.LOG.warn("DataStreamer Exception", e);
					if (e instanceof IOException) {
						setLastException((IOException) e);
					} else {
						setLastException(new IOException("DataStreamer Exception: ", e));
					hasError = true;
					if (errorIndex == -1 && restartingNodeIndex.get() == -1) {
						// Not a datanode issue
						streamerClosed = true;
				} finally {

回到之前的 DFSClient.create 方法:DFSOutputStream 创建完成后,接下来开启契约(beginFileLease)。

		 * 总结:
		 * 1) 往文件目录树里面添加了文件
		 * 2) 添加了契约管理
		 * 3) 启动了DataStreamer
		final DFSOutputStream result = DFSOutputStream.newStreamForCreate(this, src, masked, flag, createParent,
				replication, blockSize, progress, buffersize, dfsClientConf.createChecksum(checksumOpt),
		//TODO 开启契约
		beginFileLease(result.getFileId(), result);
/**
 * Starts lease handling for a file being written by registering the
 * stream, keyed by inode id, together with this client on the lease
 * renewer returned by {@code getLeaseRenewer()}.
 *
 * NOTE(review): the closing brace is missing from this excerpt.
 *
 * @param inodeId id of the file's inode
 * @param out     the output stream writing that file
 */
private void beginFileLease(final long inodeId, final DFSOutputStream out) throws IOException {
		getLeaseRenewer().put(inodeId, out, this);
synchronized void put(final long inodeId, final DFSOutputStream out,
      final DFSClient dfsc) {
    if (dfsc.isClientRunning()) {
      if (!isRunning() || isRenewerExpired()) {
        //start a new deamon with a new id.
        final int id = ++currentId;
        daemon = new Daemon(new Runnable() {
          public void run() {
            try {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Lease renewer daemon for " + clientsString()
                    + " with renew id " + id + " started");
              //LeaseRenewer  就是进行契约续约的
            } catch(InterruptedException e) {
              if (LOG.isDebugEnabled()) {
                    + " is interrupted.", e);
            } finally {
              synchronized(LeaseRenewer.this) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Lease renewer daemon for " + clientsString()
                    + " with renew id " + id + " exited");
          public String toString() {
            return String.valueOf(LeaseRenewer.this);
      dfsc.putFileBeingWritten(inodeId, out);
      emptyTime = Long.MAX_VALUE;
/**
 * Renewal loop of the lease-renewer daemon identified by {@code id}. Until
 * the thread is interrupted it sleeps {@code getSleepPeriod()} between
 * iterations; once at least {@code getRenewalTime()} has elapsed since
 * {@code lastRenewed} it enters the renewal branch and refreshes
 * {@code lastRenewed}. Each iteration also checks, under the renewer's
 * lock, whether this daemon is still current and not expired, and stamps
 * {@code emptyTime} when no clients are running.
 *
 * NOTE(review): the renewal call itself, the abort/exit statements and the
 * closing braces are missing from this excerpt; confirm against upstream.
 *
 * @param id renew id this daemon was started with
 * @throws InterruptedException from {@code Thread.sleep}
 */
private void run(final int id) throws InterruptedException {
	  // Same init/condition/update shape as an ordinary loop such as for(int i=0;i< 10;i++).
    for(long lastRenewed = Time.monotonicNow(); !Thread.interrupted();Thread.sleep(getSleepPeriod())) {
    	// Current time minus the time of the last renewal.
      final long elapsed = Time.monotonicNow() - lastRenewed;
      if (elapsed >= getRenewalTime()) {
        try {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Lease renewer daemon for " + clientsString()
                + " with renew id " + id + " executed");
          lastRenewed = Time.monotonicNow();
        } catch (SocketTimeoutException ie) {
          LOG.warn("Failed to renew lease for " + clientsString() + " for "
              + (elapsed/1000) + " seconds.  Aborting ...", ie);
          synchronized (this) {
            while (!dfsclients.isEmpty()) {
        } catch (IOException ie) {
          LOG.warn("Failed to renew lease for " + clientsString() + " for "
              + (elapsed/1000) + " seconds.  Will retry shortly ...", ie);

      synchronized(this) {
        if (id != currentId || isRenewerExpired()) {
          if (LOG.isDebugEnabled()) {
            if (id != currentId) {
              LOG.debug("Lease renewer daemon for " + clientsString()
                  + " with renew id " + id + " is not current");
            } else {
               LOG.debug("Lease renewer daemon for " + clientsString()
                  + " with renew id " + id + " expired");
          //no longer the current daemon or expired

        // if no clients are in running state or there is no more clients
        // registered with this renewer, stop the daemon after the grace
        // period.
        if (!clientsRunning() && emptyTime == Long.MAX_VALUE) {
          emptyTime = Time.monotonicNow();
/**
 * Renews leases once per distinct client name: takes a snapshot of
 * {@code dfsclients} under the renewer's lock, sorts it by client name so
 * that duplicates are adjacent, and calls {@code renewLease()} on the first
 * client of each name.
 *
 * NOTE(review): closing braces and the tail of one debug statement are
 * missing from this excerpt, so the exact nesting of the last statements
 * cannot be confirmed here.
 *
 * @throws IOException propagated from {@code DFSClient.renewLease()}
 */
private void renew() throws IOException {
    final List<DFSClient> copies;
    // Copy under the lock so the list can be sorted and iterated without holding it.
    synchronized(this) {
      copies = new ArrayList<DFSClient>(dfsclients);
    //sort the client names for finding out repeated names.
    Collections.sort(copies, new Comparator<DFSClient>() {
      public int compare(final DFSClient left, final DFSClient right) {
        return left.getClientName().compareTo(right.getClientName());
    String previousName = "";
    for(int i = 0; i < copies.size(); i++) {
      final DFSClient c = copies.get(i);
      //skip if current client name is the same as the previous name.
      if (!c.getClientName().equals(previousName)) {
        if (!c.renewLease()) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Did not renew lease for client " +
        previousName = c.getClientName();
        if (LOG.isDebugEnabled()) {
          LOG.debug("Lease renewed for client " + previousName);
/**
 * Attempts a lease renewal for this client, but only while the client is
 * running and has files being written. On an {@code IOException} it
 * compares the time since the last renewal with
 * {@code HdfsConstants.LEASE_HARDLIMIT_PERIOD}: past the hard limit it logs
 * a warning, otherwise it rethrows so the caller can retry.
 *
 * NOTE(review): the renewal call inside the {@code try}, the handling that
 * follows the warning, and the closing braces are missing from this
 * excerpt; confirm against the upstream source.
 *
 * @return {@code true} from the renewal branch, {@code false} otherwise
 * @throws IOException when renewal fails before the hard limit has passed
 */
boolean renewLease() throws IOException {
		if (clientRunning && !isFilesBeingWrittenEmpty()) {
			try {
				return true;
			} catch (IOException e) {
				// Abort if the lease has already expired.
				final long elapsed = Time.monotonicNow() - getLastLeaseRenewal();
				if (elapsed > HdfsConstants.LEASE_HARDLIMIT_PERIOD) {
					LOG.warn("Failed to renew lease for " + clientName + " for " + (elapsed / 1000)
							+ " seconds (>= hard-limit =" + (HdfsConstants.LEASE_HARDLIMIT_PERIOD / 1000)
							+ " seconds.) " + "Closing all files being written ...", e);
				} else {
					// Let the lease renewer handle it and retry.
					throw e;
		return false;