Besides serving reads and writes on replicas, the replica manager has another important job: managing the replicas and the partitions they belong to. ReplicaManager does this indirectly, by operating on Partition objects, which in turn manage the replica objects underneath them.
The core of managing these partitions and replicas is deciding which of the locally hosted replicas are leader replicas and which are followers. This assignment is not fixed; it changes over time, and the changes are driven by the LeaderAndIsrRequest requests the Controller sends to each broker. When a broker receives such a request, it calls the replica manager's becomeLeaderOrFollower method, which runs the "become leader" and then the "become follower" logic. Let's look at this method.
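Before reading the method body, it helps to see where it is invoked. The sketch below roughly follows the LEADER_AND_ISR handler in KafkaApis from Kafka 2.x; authorization and the stale-broker-epoch check are omitted, so treat it as an illustration rather than the verbatim source.
// Simplified sketch of the call site in KafkaApis (illustrative, not verbatim):
def handleLeaderAndIsrRequest(request: RequestChannel.Request): Unit = {
  val correlationId = request.header.correlationId
  val leaderAndIsrRequest = request.body[LeaderAndIsrRequest]
  // onLeadershipChange is the callback discussed after the method body below
  val response = replicaManager.becomeLeaderOrFollower(correlationId, leaderAndIsrRequest, onLeadershipChange)
  sendResponseExemptThrottle(request, response)
}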
def becomeLeaderOrFollower(correlationId: Int,
leaderAndIsrRequest: LeaderAndIsrRequest,
onLeadershipChange: (Iterable[Partition], Iterable[Partition]) => Unit): LeaderAndIsrResponse = {
if (stateChangeLogger.isTraceEnabled) {
leaderAndIsrRequest.partitionStates.asScala.foreach { partitionState =>
stateChangeLogger.trace(s"Received LeaderAndIsr request $partitionState " +
s"correlation id $correlationId from controller ${leaderAndIsrRequest.controllerId} " +
s"epoch ${leaderAndIsrRequest.controllerEpoch}")
}
}
replicaStateChangeLock synchronized {
// If the controller epoch carried in the LeaderAndIsrRequest
// is smaller than the controller epoch this broker currently knows about
if (leaderAndIsrRequest.controllerEpoch < controllerEpoch) {
stateChangeLogger.warn(s"Ignoring LeaderAndIsr request from controller ${leaderAndIsrRequest.controllerId} with " +
s"correlation id $correlationId since its controller epoch ${leaderAndIsrRequest.controllerEpoch} is old. " +
s"Latest known controller epoch is $controllerEpoch")
// the Controller has changed hands, so build an error response with STALE_CONTROLLER_EPOCH
leaderAndIsrRequest.getErrorResponse(0, Errors.STALE_CONTROLLER_EPOCH.exception)
} else {
// Stage 1: create new partitions, update the controller epoch, and validate each partition's leader epoch
val responseMap = new mutable.HashMap[TopicPartition, Errors]
val controllerId = leaderAndIsrRequest.controllerId
// update the locally cached controller epoch
controllerEpoch = leaderAndIsrRequest.controllerEpoch
// First check partition's leader epoch
val partitionStates = new mutable.HashMap[Partition, LeaderAndIsrPartitionState]()
val updatedPartitions = new mutable.HashSet[Partition]
// iterate over all partitions carried in the LeaderAndIsrRequest
leaderAndIsrRequest.partitionStates.asScala.foreach { partitionState =>
val topicPartition = new TopicPartition(partitionState.topicName, partitionState.partitionIndex)
// look up the corresponding Partition object in allPartitions
val partitionOpt = getPartition(topicPartition) match {
// if the partition is in Offline state
case HostedPartition.Offline =>
stateChangeLogger.warn(s"Ignoring LeaderAndIsr request from " +
s"controller $controllerId with correlation id $correlationId " +
s"epoch $controllerEpoch for partition $topicPartition as the local replica for the " +
"partition is in an offline log directory")
// record a KAFKA_STORAGE_ERROR for this partition in the response and leave partitionOpt as None
responseMap.put(topicPartition, Errors.KAFKA_STORAGE_ERROR)
None
// if the partition is Online, simply use it as partitionOpt
case HostedPartition.Online(partition) =>
updatedPartitions.add(partition)
Some(partition)
// if no Partition object exists for this topic-partition,
// create one, register it in allPartitions so it is managed with the others,
// and use it as partitionOpt
case HostedPartition.None =>
val partition = Partition(topicPartition, time, this)
allPartitions.putIfNotExists(topicPartition, HostedPartition.Online(partition))
updatedPartitions.add(partition)
Some(partition)
}
// validate the partition's leader epoch
partitionOpt.foreach { partition =>
val currentLeaderEpoch = partition.getLeaderEpoch
val requestLeaderEpoch = partitionState.leaderEpoch
if (requestLeaderEpoch > currentLeaderEpoch) {
// If the leader epoch is valid record the epoch of the controller that made the leadership decision.
// This is useful while updating the isr to maintain the decision maker controller's epoch in the zookeeper path
if (partitionState.replicas.contains(localBrokerId))
partitionStates.put(partition, partitionState)
else {
stateChangeLogger.warn(s"Ignoring LeaderAndIsr request from controller $controllerId with " +
s"correlation id $correlationId epoch $controllerEpoch for partition $topicPartition as itself is not " +
s"in assigned replica list ${partitionState.replicas.asScala.mkString(",")}")
responseMap.put(topicPartition, Errors.UNKNOWN_TOPIC_OR_PARTITION)
}
} else if (requestLeaderEpoch < currentLeaderEpoch) {
stateChangeLogger.warn(s"Ignoring LeaderAndIsr request from " +
s"controller $controllerId with correlation id $correlationId " +
s"epoch $controllerEpoch for partition $topicPartition since its associated " +
s"leader epoch $requestLeaderEpoch is smaller than the current " +
s"leader epoch $currentLeaderEpoch")
responseMap.put(topicPartition, Errors.STALE_CONTROLLER_EPOCH)
} else {
stateChangeLogger.debug(s"Ignoring LeaderAndIsr request from " +
s"controller $controllerId with correlation id $correlationId " +
s"epoch $controllerEpoch for partition $topicPartition since its associated " +
s"leader epoch $requestLeaderEpoch matches the current leader epoch")
responseMap.put(topicPartition, Errors.STALE_CONTROLLER_EPOCH)
}
}
}
// Stage 2: run the "become leader" and "become follower" logic
// determine the partitions for which a local replica becomes the leader
val partitionsTobeLeader = partitionStates.filter { case (_, partitionState) =>
partitionState.leader == localBrokerId
}
// determine the partitions for which a local replica becomes a follower
val partitionsToBeFollower = partitionStates -- partitionsTobeLeader.keys
val highWatermarkCheckpoints = new LazyOffsetCheckpoints(this.highWatermarkCheckpoints)
val partitionsBecomeLeader = if (partitionsTobeLeader.nonEmpty)
// call makeLeaders to run the "become leader" logic
// for every partition in partitionsTobeLeader
makeLeaders(controllerId, controllerEpoch, partitionsTobeLeader, correlationId, responseMap,
highWatermarkCheckpoints)
else
Set.empty[Partition]
val partitionsBecomeFollower = if (partitionsToBeFollower.nonEmpty)
// call makeFollowers to run the "become follower" logic
// for every partition in partitionsToBeFollower
makeFollowers(controllerId, controllerEpoch, partitionsToBeFollower, correlationId, responseMap,
highWatermarkCheckpoints)
else
Set.empty[Partition]
/*
* KAFKA-8392
* For topic partitions of which the broker is no longer a leader, delete metrics related to
* those topics. Note that this means the broker stops being either a replica or a leader of
* partitions of said topics
*/
val leaderTopicSet = leaderPartitionsIterator.map(_.topic).toSet
val followerTopicSet = partitionsBecomeFollower.map(_.topic).toSet
// for topics whose replicas on this broker have become followers,
// remove the stale leader-side metrics
followerTopicSet.diff(leaderTopicSet).foreach(brokerTopicStats.removeOldLeaderMetrics)
// remove metrics for brokers which are not followers of a topic
// for topics whose replicas on this broker have become leaders,
// remove the stale follower-side metrics
leaderTopicSet.diff(followerTopicSet).foreach(brokerTopicStats.removeOldFollowerMetrics)
// if a partition has no local log, its underlying log directory is unavailable,
// so mark the partition as Offline
leaderAndIsrRequest.partitionStates.asScala.foreach { partitionState =>
val topicPartition = new TopicPartition(partitionState.topicName, partitionState.partitionIndex)
/*
* If there is offline log directory, a Partition object may have been created by getOrCreatePartition()
* before getOrCreateReplica() failed to create local replica due to KafkaStorageException.
* In this case ReplicaManager.allPartitions will map this topic-partition to an empty Partition object.
* we need to map this topic-partition to OfflinePartition instead.
*/
if (localLog(topicPartition).isEmpty)
markPartitionOffline(topicPartition)
}
// we initialize highwatermark thread after the first leaderisrrequest. This ensures that all the partitions
// have been completely populated before starting the checkpointing there by avoiding weird race conditions
// Stage 3: remaining housekeeping and building the response.
// Start the dedicated high-watermark checkpoint thread, which periodically writes the
// high watermarks of all non-offline partitions on this broker to the checkpoint files
startHighWatermarkCheckPointThread()
val futureReplicasAndInitialOffset = new mutable.HashMap[TopicPartition, InitialFetchState]
for (partition <- updatedPartitions) {
val topicPartition = partition.topicPartition
if (logManager.getLog(topicPartition, isFuture = true).isDefined) {
partition.log.foreach { log =>
val leader = BrokerEndPoint(config.brokerId, "localhost", -1)
// Add future replica to partition's map
partition.createLogIfNotExists(Request.FutureLocalReplicaId, isNew = false, isFutureReplica = true,
highWatermarkCheckpoints)
// pause cleaning for partitions that are being moved and start ReplicaAlterDirThread to move
// replica from source dir to destination dir
logManager.abortAndPauseCleaning(topicPartition)
futureReplicasAndInitialOffset.put(topicPartition, InitialFetchState(leader,
partition.getLeaderEpoch, log.highWatermark))
}
}
}
// add fetcher threads that move replica data between log directories
replicaAlterLogDirsManager.addFetcherForPartitions(futureReplicasAndInitialOffset)
// shut down idle fetcher threads
replicaFetcherManager.shutdownIdleFetcherThreads()
replicaAlterLogDirsManager.shutdownIdleFetcherThreads()
// run the callback invoked after leadership changes; in practice it only matters for Kafka's two internal topics (__consumer_offsets and __transaction_state) and is a no-op for all other topics
onLeadershipChange(partitionsBecomeLeader, partitionsBecomeFollower)
// build and return the response for the LeaderAndIsrRequest
val responsePartitions = responseMap.iterator.map { case (tp, error) =>
new LeaderAndIsrPartitionError()
.setTopicName(tp.topic)
.setPartitionIndex(tp.partition)
.setErrorCode(error.code)
}.toBuffer
new LeaderAndIsrResponse(new LeaderAndIsrResponseData()
.setErrorCode(Errors.NONE.code)
.setPartitionErrors(responsePartitions.asJava))
}
}
}
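As for the onLeadershipChange callback invoked near the end: it is defined in KafkaApis and only reacts to the two internal topics, which is why it is irrelevant for ordinary topics. The following sketch shows its shape in Kafka 2.x; it is quoted from memory, so treat the exact method names as assumptions.
// Sketch of the callback KafkaApis passes in (illustrative, not verbatim):
def onLeadershipChange(updatedLeaders: Iterable[Partition], updatedFollowers: Iterable[Partition]): Unit = {
  // partitions of __consumer_offsets / __transaction_state this broker now leads:
  // load the corresponding group / transaction metadata into the coordinators
  updatedLeaders.foreach { partition =>
    if (partition.topic == Topic.GROUP_METADATA_TOPIC_NAME)
      groupCoordinator.handleGroupImmigration(partition.partitionId)
    else if (partition.topic == Topic.TRANSACTION_STATE_TOPIC_NAME)
      txnCoordinator.handleTxnImmigration(partition.partitionId, partition.getLeaderEpoch)
  }
  // partitions this broker now follows: unload that metadata
  updatedFollowers.foreach { partition =>
    if (partition.topic == Topic.GROUP_METADATA_TOPIC_NAME)
      groupCoordinator.handleGroupEmigration(partition.partitionId)
    else if (partition.topic == Topic.TRANSACTION_STATE_TOPIC_NAME)
      txnCoordinator.handleTxnEmigration(partition.partitionId, partition.getLeaderEpoch)
  }
}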
The method then calls makeLeaders and makeFollowers to carry out the role changes.
The makeLeaders method
This method makes the current broker the leader for a given set of partitions, i.e. it turns this broker's replicas of those partitions into leader replicas.
private def makeLeaders(controllerId: Int, // Id of the broker the Controller resides on
controllerEpoch: Int, // Controller epoch, effectively the Controller's version number
partitionStates: Map[Partition, LeaderAndIsrPartitionState], // partition info carried in the LeaderAndIsrRequest
correlationId: Int, // correlation id of the request, used only for logging and debugging
responseMap: mutable.Map[TopicPartition, Errors], // per topic-partition error codes to return
highWatermarkCheckpoints: OffsetCheckpoints // helper for reading the on-disk high-watermark checkpoint files
): Set[Partition] = {
partitionStates.keys.foreach { partition =>
stateChangeLogger.trace(s"Handling LeaderAndIsr request correlationId $correlationId from " +
s"controller $controllerId epoch $controllerEpoch starting the become-leader transition for " +
s"partition ${partition.topicPartition}")
}
// initialize responseMap with Errors.NONE for every partition
for (partition <- partitionStates.keys)
responseMap.put(partition.topicPartition, Errors.NONE)
val partitionsToMakeLeaders = mutable.Set[Partition]()
try {
// First stop fetchers for all the partitions
// stop fetching messages for these partitions
replicaFetcherManager.removeFetcherForPartitions(partitionStates.keySet.map(_.topicPartition))
// Update the partition information to be the leader
partitionStates.foreach { case (partition, partitionState) =>
try {
// update the partition's leader information
if (partition.makeLeader(controllerId, partitionState, correlationId, highWatermarkCheckpoints)) {
partitionsToMakeLeaders += partition
stateChangeLogger.trace(s"Stopped fetchers as part of become-leader request from " +
s"controller $controllerId epoch $controllerEpoch with correlation id $correlationId for partition ${partition.topicPartition} " +
s"(last update controller epoch ${partitionState.controllerEpoch})")
} else
stateChangeLogger.info(s"Skipped the become-leader state change after marking its " +
s"partition as leader with correlation id $correlationId from controller $controllerId epoch $controllerEpoch for " +
s"partition ${partition.topicPartition} (last update controller epoch ${partitionState.controllerEpoch}) " +
s"since it is already the leader for the partition.")
} catch {
case e: KafkaStorageException =>
stateChangeLogger.error(s"Skipped the become-leader state change with " +
s"correlation id $correlationId from controller $controllerId epoch $controllerEpoch for partition ${partition.topicPartition} " +
s"(last update controller epoch ${partitionState.controllerEpoch}) since " +
s"the replica for the partition is offline due to disk error $e")
val dirOpt = getLogDir(partition.topicPartition)
error(s"Error while making broker the leader for partition $partition in dir $dirOpt", e)
// record a KAFKA_STORAGE_ERROR for this partition in the response
responseMap.put(partition.topicPartition, Errors.KAFKA_STORAGE_ERROR)
}
}
} catch {
case e: Throwable =>
partitionStates.keys.foreach { partition =>
stateChangeLogger.error(s"Error while processing LeaderAndIsr request correlationId $correlationId received " +
s"from controller $controllerId epoch $controllerEpoch for partition ${partition.topicPartition}", e)
}
// Re-throw the exception for it to be caught in KafkaApis
throw e
}
partitionStates.keys.foreach { partition =>
stateChangeLogger.trace(s"Completed LeaderAndIsr request correlationId $correlationId from controller $controllerId " +
s"epoch $controllerEpoch for the become-leader transition for partition ${partition.topicPartition}")
}
partitionsToMakeLeaders
}
The Partition#makeLeader method
ReplicaManager's makeLeaders method ultimately calls Partition's makeLeader method:
def makeLeader(controllerId: Int,
partitionState: LeaderAndIsrPartitionState,
correlationId: Int,
highWatermarkCheckpoints: OffsetCheckpoints): Boolean = {
val (leaderHWIncremented, isNewLeader) = inWriteLock(leaderIsrUpdateLock) {
// record the epoch of the controller that made the leadership decision. This is useful while updating the isr
// to maintain the decision maker controller's epoch in the zookeeper path
// update the locally cached controller epoch
controllerEpoch = partitionState.controllerEpoch
// create/update the Replica objects for all replicas in the AR and ISR sets carried in the request
updateAssignmentAndIsr(
assignment = partitionState.replicas.asScala.iterator.map(_.toInt).toSeq,
isr = partitionState.isr.asScala.iterator.map(_.toInt).toSet
)
// create the log object for this partition if it does not exist yet
createLogIfNotExists(localBrokerId, partitionState.isNew, isFutureReplica = false, highWatermarkCheckpoints)
val leaderLog = localLogOrException
val leaderEpochStartOffset = leaderLog.logEndOffset
info(s"$topicPartition starts at leader epoch ${partitionState.leaderEpoch} from " +
s"offset $leaderEpochStartOffset with high watermark ${leaderLog.highWatermark}. " +
s"Previous leader epoch was $leaderEpoch.")
//We cache the leader epoch here, persisting it only if it's local (hence having a log dir)
// update the locally cached leader information for the partition
leaderEpoch = partitionState.leaderEpoch // update the leader epoch
leaderEpochStartOffsetOpt = Some(leaderEpochStartOffset)
zkVersion = partitionState.zkVersion // update the ZooKeeper version
// In the case of successive leader elections in a short time period, a follower may have
// entries in its log from a later epoch than any entry in the new leader's log. In order
// to ensure that these followers can truncate to the right offset, we must cache the new
// leader epoch and the start offset since it should be larger than any epoch that a follower
// would try to query.
// Reset the LEOs of the remote follower replicas. "Remote" followers are the follower replicas
// tracked in the leader's memory, held in the remoteReplicas field. ReplicaManager updates their LEOs
// while handling FETCH requests, and the leader compares its own LEO against them to decide
// whether the high watermark can be advanced.
leaderLog.maybeAssignEpochStartOffset(leaderEpoch, leaderEpochStartOffset)
// check whether the partition's leader replica has changed
val isNewLeader = !isLeader
val curTimeMs = time.milliseconds
// initialize lastCaughtUpTime of replicas as well as their lastFetchTimeMs and lastFetchLeaderLogEndOffset.
// iterate over all follower replicas and update their timestamps
remoteReplicas.foreach { replica =>
// update the time at which the remote replica last caught up with the leader
val lastCaughtUpTimeMs = if (inSyncReplicaIds.contains(replica.brokerId)) curTimeMs else 0L
replica.resetLastCaughtUpTime(leaderEpochStartOffset, curTimeMs, lastCaughtUpTimeMs)
}
// if this leader is newly elected, adjust the leader's HW and reset the cached state of all remote replicas
if (isNewLeader) {
// mark local replica as the leader after converting hw
leaderReplicaIdOpt = Some(localBrokerId)
// reset log end offset for remote replicas
remoteReplicas.foreach { replica =>
replica.updateFetchState(
followerFetchOffsetMetadata = LogOffsetMetadata.UnknownOffsetMetadata,
followerStartOffset = Log.UnknownOffset,
followerFetchTimeMs = 0L,
leaderEndOffset = Log.UnknownOffset,
lastSentHighwatermark = 0L)
}
}
// we may need to increment high watermark since ISR could be down to 1
// try to advance the leader's high watermark
(maybeIncrementLeaderHW(leaderLog), isNewLeader)
}
// some delayed operations may be unblocked after HW changed
if (leaderHWIncremented)
// if the leader's HW moved forward, try to complete DelayedFetch and DelayedProduce operations waiting on this partition
tryCompleteDelayedRequests()
isNewLeader
}
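The final step calls maybeIncrementLeaderHW. Conceptually, the new high watermark is the smallest LEO among the leader and its in-sync followers, and the HW only ever moves forward. The sketch below captures just that idea: it ignores the "caught up but not yet in ISR" case and the LogOffsetMetadata bookkeeping of the real implementation, and updateHighWatermark is used simply as a stand-in for the actual update path.
// Conceptual sketch only, assuming just the ISR is considered (the real code also counts
// replicas that are caught up within replica.lag.time.max.ms).
private def maybeIncrementLeaderHWSketch(leaderLog: Log): Boolean = {
  // LEO of the leader itself plus the LEOs of all in-sync remote replicas
  val isrLeos: Seq[Long] =
    leaderLog.logEndOffset +: remoteReplicas.toSeq
      .filter(replica => inSyncReplicaIds.contains(replica.brokerId))
      .map(_.logEndOffset)
  val newHighWatermark = isrLeos.min
  if (newHighWatermark > leaderLog.highWatermark) {
    leaderLog.updateHighWatermark(newHighWatermark) // stand-in for the real HW update
    true
  } else {
    false
  }
}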
startHighWatermarkCheckPointThread
Another call worth noting above is startHighWatermarkCheckPointThread, which starts the scheduled highwatermark-checkpoint task. This task periodically writes the HW of every topic partition managed by this broker into the replication-offset-checkpoint file in the corresponding log directory. Its core logic is as follows:
def checkpointHighWatermarks(): Unit = {
// collect the logs (including future logs) of all non-offline partitions
val localLogs = nonOfflinePartitionsIterator.flatMap { partition =>
val logsList: mutable.Set[Log] = mutable.Set()
partition.log.foreach(logsList.add)
partition.futureLog.foreach(logsList.add)
logsList
}.toBuffer
// group the logs by their parent log directory
val logsByDir = localLogs.groupBy(_.dir.getParent)
// for each log directory, write the HW values of the partitions under it to that directory's replication-offset-checkpoint file
for ((dir, logs) <- logsByDir) {
// build a map from topic partition to HW value
val hwms = logs.map(log => log.topicPartition -> log.highWatermark).toMap
try {
// update the replication-offset-checkpoint file under this log directory
highWatermarkCheckpoints.get(dir).foreach(_.write(hwms))
} catch {
case e: KafkaStorageException =>
error(s"Error while writing to highwatermark file in directory $dir", e)
}
}
}
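For reference, replication-offset-checkpoint is a small text file: the first line is a format version, the second line is the number of entries, followed by one "topic partition highWatermark" line per partition. An illustrative example with made-up topic names and offsets:
0
3
my-topic 0 42
my-topic 1 17
__consumer_offsets 5 103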
The makeFollowers method
This method makes the current broker a follower for the given partitions.
private def makeFollowers(controllerId: Int, // Id of the broker the Controller resides on
controllerEpoch: Int, // Controller epoch
partitionStates: Map[Partition, LeaderAndIsrPartitionState], // details of all partitions for which this broker hosts follower replicas
correlationId: Int, // correlation id linking the request to its response
responseMap: mutable.Map[TopicPartition, Errors], // per-partition results of handling the LeaderAndIsrRequest
highWatermarkCheckpoints: OffsetCheckpoints // helper for reading the high-watermark checkpoint files
): Set[Partition] = {
partitionStates.foreach { case (partition, partitionState) =>
stateChangeLogger.trace(s"Handling LeaderAndIsr request correlationId $correlationId from controller $controllerId " +
s"epoch $controllerEpoch starting the become-follower transition for partition ${partition.topicPartition} with leader " +
s"${partitionState.leader}")
}
// Part 1: iterate over all partitions in partitionStates and perform the "become follower" transition
for (partition <- partitionStates.keys)
// initialize the result for every partition to Errors.NONE
responseMap.put(partition.topicPartition, Errors.NONE)
val partitionsToMakeFollower: mutable.Set[Partition] = mutable.Set()
try {
// TODO: Delete leaders from LeaderAndIsrRequest
// iterate over all partitions in partitionStates
partitionStates.foreach { case (partition, partitionState) =>
// the broker id of the new leader, taken from the request
val newLeaderBrokerId = partitionState.leader
try {
// look up the leader broker in the metadata cache
metadataCache.getAliveBrokers.find(_.id == newLeaderBrokerId) match {
// Only change partition state when the leader is available
// the leader broker is indeed alive
case Some(_) =>
if (partition.makeFollower(controllerId, partitionState, correlationId, highWatermarkCheckpoints))
// if the transition succeeded, add the partition to the result set
partitionsToMakeFollower += partition
else // the leader did not change, so the transition is skipped; just log it
stateChangeLogger.info(s"Skipped the become-follower state change after marking its partition as " +
s"follower with correlation id $correlationId from controller $controllerId epoch $controllerEpoch " +
s"for partition ${partition.topicPartition} (last update " +
s"controller epoch ${partitionState.controllerEpoch}) " +
s"since the new leader $newLeaderBrokerId is the same as the old leader")
case None => // the leader broker is not alive
// The leader broker should always be present in the metadata cache.
// If not, we should record the error message and abort the transition process for this partition
stateChangeLogger.error(s"Received LeaderAndIsrRequest with correlation id $correlationId from " +
s"controller $controllerId epoch $controllerEpoch for partition ${partition.topicPartition} " +
s"(last update controller epoch ${partitionState.controllerEpoch}) " +
s"but cannot become follower since the new leader $newLeaderBrokerId is unavailable.")
// Create the local replica even if the leader is unavailable. This is required to ensure that we include
// the partition's high watermark in the checkpoint file (see KAFKA-1647)
// still create the log object for this partition's follower replica
partition.createLogIfNotExists(localBrokerId, isNew = partitionState.isNew, isFutureReplica = false,
highWatermarkCheckpoints)
}
} catch {
case e: KafkaStorageException =>
stateChangeLogger.error(s"Skipped the become-follower state change with correlation id $correlationId from " +
s"controller $controllerId epoch $controllerEpoch for partition ${partition.topicPartition} " +
s"(last update controller epoch ${partitionState.controllerEpoch}) with leader " +
s"$newLeaderBrokerId since the replica for the partition is offline due to disk error $e")
val dirOpt = getLogDir(partition.topicPartition)
error(s"Error while making broker the follower for partition $partition with leader " +
s"$newLeaderBrokerId in dir $dirOpt", e)
responseMap.put(partition.topicPartition, Errors.KAFKA_STORAGE_ERROR)
}
}
// Part 2: remaining actions, mainly rebuilding fetcher threads and completing delayed requests
// remove the existing fetchers for these partitions
replicaFetcherManager.removeFetcherForPartitions(partitionsToMakeFollower.map(_.topicPartition))
partitionsToMakeFollower.foreach { partition =>
stateChangeLogger.trace(s"Stopped fetchers as part of become-follower request from controller $controllerId " +
s"epoch $controllerEpoch with correlation id $correlationId for partition ${partition.topicPartition} with leader " +
s"${partitionStates(partition).leader}")
}
// try to complete delayed fetch/produce requests
partitionsToMakeFollower.foreach { partition =>
completeDelayedFetchOrProduceRequests(partition.topicPartition)
}
partitionsToMakeFollower.foreach { partition =>
stateChangeLogger.trace(s"Truncated logs and checkpointed recovery boundaries for partition " +
s"${partition.topicPartition} as part of become-follower request with correlation id $correlationId from " +
s"controller $controllerId epoch $controllerEpoch with leader ${partitionStates(partition).leader}")
}
if (isShuttingDown.get()) {
partitionsToMakeFollower.foreach { partition =>
stateChangeLogger.trace(s"Skipped the adding-fetcher step of the become-follower state " +
s"change with correlation id $correlationId from controller $controllerId epoch $controllerEpoch for " +
s"partition ${partition.topicPartition} with leader ${partitionStates(partition).leader} " +
"since it is shutting down")
}
} else {
// we do not need to check if the leader exists again since this has been done at the beginning of this process
// for every partition for which this broker becomes a follower,
val partitionsToMakeFollowerWithLeaderAndOffset = partitionsToMakeFollower.map { partition =>
// determine the leader broker and the initial fetch offset (fetchOffset)
val leader = metadataCache.getAliveBrokers.find(_.id == partition.leaderReplicaIdOpt.get).get
.brokerEndPoint(config.interBrokerListenerName)
val fetchOffset = partition.localLogOrException.highWatermark
partition.topicPartition -> InitialFetchState(leader, partition.getLeaderEpoch, fetchOffset)
}.toMap
// add new fetcher threads using the leader brokers and fetch offsets determined above
replicaFetcherManager.addFetcherForPartitions(partitionsToMakeFollowerWithLeaderAndOffset)
partitionsToMakeFollowerWithLeaderAndOffset.foreach { case (partition, initialFetchState) =>
stateChangeLogger.trace(s"Started fetcher to new leader as part of become-follower " +
s"request from controller $controllerId epoch $controllerEpoch with correlation id $correlationId for " +
s"partition $partition with leader ${initialFetchState.leader}")
}
}
} catch {
case e: Throwable =>
stateChangeLogger.error(s"Error while processing LeaderAndIsr request with correlationId $correlationId " +
s"received from controller $controllerId epoch $controllerEpoch", e)
// Re-throw the exception for it to be caught in KafkaApis
throw e
}
partitionStates.keys.foreach { partition =>
stateChangeLogger.trace(s"Completed LeaderAndIsr request correlationId $correlationId from controller $controllerId " +
s"epoch $controllerEpoch for the become-follower transition for partition ${partition.topicPartition} with leader " +
s"${partitionStates(partition).leader}")
}
// return the set of partitions for which this broker became a follower
partitionsToMakeFollower
}
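The "complete delayed requests" step above goes through completeDelayedFetchOrProduceRequests, which simply pokes the produce and fetch purgatories with the partition's operation key so that pending requests re-check their completion conditions. Roughly (quoted from memory, so treat it as a sketch rather than the exact source):
private def completeDelayedFetchOrProduceRequests(topicPartition: TopicPartition): Unit = {
  val topicPartitionOperationKey = TopicPartitionOperationKey(topicPartition)
  // waking these purgatories lets pending produce/fetch requests for this partition complete or fail fast
  delayedProducePurgatory.checkAndComplete(topicPartitionOperationKey)
  delayedFetchPurgatory.checkAndComplete(topicPartitionOperationKey)
}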
The Partition#makeFollower method
The key step is the call to Partition's makeFollower method:
def makeFollower(controllerId: Int,
partitionState: LeaderAndIsrPartitionState,
correlationId: Int,
highWatermarkCheckpoints: OffsetCheckpoints): Boolean = {
inWriteLock(leaderIsrUpdateLock) {
val newLeaderBrokerId = partitionState.leader
val oldLeaderEpoch = leaderEpoch
// record the epoch of the controller that made the leadership decision. This is useful while updating the isr
// to maintain the decision maker controller's epoch in the zookeeper path
// update the controller epoch
controllerEpoch = partitionState.controllerEpoch
// store the assigned replicas (AR) and clear the ISR
updateAssignmentAndIsr(
assignment = partitionState.replicas.asScala.iterator.map(_.toInt).toSeq,
isr = Set.empty[Int]
)
// create the log object if it does not exist yet
createLogIfNotExists(localBrokerId, partitionState.isNew, isFutureReplica = false, highWatermarkCheckpoints)
leaderEpoch = partitionState.leaderEpoch // update the leader epoch
leaderEpochStartOffsetOpt = None
zkVersion = partitionState.zkVersion // update the ZooKeeper version
// check whether the leader replica (broker id) or the leader epoch has actually changed
if (leaderReplicaIdOpt.contains(newLeaderBrokerId) && leaderEpoch == oldLeaderEpoch) {
false
} else {
// the leader changed: update the locally recorded leader replica id
leaderReplicaIdOpt = Some(newLeaderBrokerId)
true
}
}
}
addFetcherForPartitions
The last step of makeFollowers is addFetcherForPartitions, which adds the new fetcher threads. Its logic is as follows:
def addFetcherForPartitions(partitionAndOffsets: Map[TopicPartition, InitialFetchState]): Unit = {
lock synchronized {
// Compute a fetcher thread id from the partition's topic and partition number, combine it with the
// leader broker's network address to form a key, and group partitions by that key. Each group is later
// served by one fetcher thread; a thread connects to a single broker and can sync multiple follower replicas.
val partitionsPerFetcher = partitionAndOffsets.groupBy { case (topicPartition, brokerAndInitialFetchOffset) =>
BrokerAndFetcherId(brokerAndInitialFetchOffset.leader, getFetcherId(topicPartition))
}
def addAndStartFetcherThread(brokerAndFetcherId: BrokerAndFetcherId,
brokerIdAndFetcherId: BrokerIdAndFetcherId): T = {
// create a ReplicaFetcherThread, record it in fetcherThreadMap, then start it
val fetcherThread = createFetcherThread(brokerAndFetcherId.fetcherId, brokerAndFetcherId.broker)
fetcherThreadMap.put(brokerIdAndFetcherId, fetcherThread)
fetcherThread.start()
fetcherThread
}
// for each group, reuse the existing fetcher thread if possible; otherwise create and start a new one
for ((brokerAndFetcherId, initialFetchOffsets) <- partitionsPerFetcher) {
val brokerIdAndFetcherId = BrokerIdAndFetcherId(brokerAndFetcherId.broker.id, brokerAndFetcherId.fetcherId)
val fetcherThread = fetcherThreadMap.get(brokerIdAndFetcherId) match {
case Some(currentFetcherThread) if currentFetcherThread.sourceBroker == brokerAndFetcherId.broker =>
// reuse the fetcher thread
currentFetcherThread
case Some(f) =>
f.shutdown()
addAndStartFetcherThread(brokerAndFetcherId, brokerIdAndFetcherId)
case None =>
addAndStartFetcherThread(brokerAndFetcherId, brokerIdAndFetcherId)
}
val initialOffsetAndEpochs = initialFetchOffsets.map { case (tp, brokerAndInitOffset) =>
tp -> OffsetAndEpoch(brokerAndInitOffset.initOffset, brokerAndInitOffset.currentLeaderEpoch)
}
// hand the topic partitions and their initial fetch offsets to the fetcher thread and wake it up to start syncing
addPartitionsToFetcherThread(fetcherThread, initialOffsetAndEpochs)
}
}
}
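The fetcher id used in the grouping above comes from getFetcherId in AbstractFetcherManager: the topic-partition is hashed into one of num.replica.fetchers buckets, so follower replicas fetching from the same leader broker can be spread over several threads. A sketch of that computation, close to the real implementation but not guaranteed verbatim:
// hash the topic-partition into one of numFetchersPerBroker buckets (config: num.replica.fetchers)
private def getFetcherId(topicPartition: TopicPartition): Int = {
  lock synchronized {
    Utils.abs(31 * topicPartition.topic.hashCode + topicPartition.partition) % numFetchersPerBroker
  }
}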