
了解了 KafkaProducer 的字段定义和对象的构造过程之后,下面正式开始对消息收集的过程进行分析,相关实现位于 KafkaProducer#send 方法中:

public Future<RecordMetadata> send(ProducerRecord<K, V> record, Callback callback) {
        // intercept the record, which can be potentially modified; this method does not throw exceptions
        // 遍历注册的拦截器对待发送的消息执行拦截修改
        ProducerRecord<K, V> interceptedRecord = this.interceptors.onSend(record);
        // 调用 doSend 方法开始发送消息
        return doSend(interceptedRecord, callback);

KafkaProducer#doSend 方法实现了消息收集的具体过程:

private Future<RecordMetadata> doSend(ProducerRecord<K, V> record, Callback callback) {
        TopicPartition tp = null;
        try {
            // first make sure the metadata for the topic is available
            ClusterAndWaitTime clusterAndWaitTime;
            try {
                // 1. 获取 kafka 集群元数据信息,如果当前请求的是新 topic,或者指定的分区超过已知的分区范围,则会触发更新集群元数据信息
                clusterAndWaitTime = waitOnMetadata(record.topic(), record.partition(), maxBlockTimeMs);
            } catch (KafkaException e) {
                if (metadata.isClosed())
                    throw new KafkaException("Producer closed while send in progress", e);
                throw e;
            long remainingWaitMs = Math.max(0, maxBlockTimeMs - clusterAndWaitTime.waitedOnMetadataMs);
            Cluster cluster = clusterAndWaitTime.cluster;
            // 2 基于注册的序列化器对 key 执行序列化
            byte[] serializedKey;
            try {
                serializedKey = keySerializer.serialize(record.topic(), record.headers(), record.key());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert key of class " + record.key().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in key.serializer", cce);
            // 3. 基于注册的序列化器对 value 执行序列化
            byte[] serializedValue;
            try {
                serializedValue = valueSerializer.serialize(record.topic(), record.headers(), record.value());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert value of class " + record.value().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in value.serializer", cce);
            // 4. 为当前消息选择合适的分区,如果未明确指定的话,则基于注册的分区器为当前消息计算分区
            int partition = partition(record, serializedKey, serializedValue, cluster);
            // 消息投递的目标 topic 分区
            tp = new TopicPartition(record.topic(), partition);

            /* 5. 将消息追加到消息收集器(RecordAccumulator)中 */
            Header[] headers = record.headers().toArray();
            // 计算当前消息大小,并校验消息是否过大
            int serializedSize = AbstractRecords.estimateSizeInBytesUpperBound(apiVersions.maxUsableProduceMagic(),
                    compressionType, serializedKey, serializedValue, headers);
            // 如果未明确为当前消息指定时间戳,则设置为当前时间戳
            long timestamp = record.timestamp() == null ? time.milliseconds() : record.timestamp();
            if (log.isTraceEnabled()) {
                log.trace("Attempting to append record {} with callback {} to topic {} partition {}", record, callback, record.topic(), partition);
            // producer callback will make sure to call both 'callback' and interceptor callback
            Callback interceptCallback = new InterceptorCallback<>(callback, this.interceptors, tp);

            if (transactionManager != null && transactionManager.isTransactional()) {
            // 追加消息到收集器中
            RecordAccumulator.RecordAppendResult result = accumulator.append(tp, timestamp, serializedKey,
                    serializedValue, headers, interceptCallback, remainingWaitMs, true);
            if (result.abortForNewBatch) {
                int prevPartition = partition;
                partitioner.onNewBatch(record.topic(), cluster, prevPartition);
                partition = partition(record, serializedKey, serializedValue, cluster);
                tp = new TopicPartition(record.topic(), partition);
                if (log.isTraceEnabled()) {
                    log.trace("Retrying append due to new batch creation for topic {} partition {}. The old partition was {}", record.topic(), partition, prevPartition);
                // producer callback will make sure to call both 'callback' and interceptor callback
                interceptCallback = new InterceptorCallback<>(callback, this.interceptors, tp);

                result = accumulator.append(tp, timestamp, serializedKey,
                    serializedValue, headers, interceptCallback, remainingWaitMs, false);
            if (transactionManager != null && transactionManager.isTransactional())
            /* 6. 条件性唤醒消息发送线程 */
            if (result.batchIsFull || result.newBatchCreated) {
                // 如果队列中不止一个 RecordBatch,或者最后一个 RecordBatch 满了,或者有创建新的 RecordBatch,则唤醒 Sender 线程发送消息
                log.trace("Waking up the sender since topic {} partition {} is either full or getting a new batch", record.topic(), partition);
            return result.future;
            // handling exceptions and record the errors;
            // for API exceptions return them in the future,
            // for other exceptions throw directly
        } catch (ApiException e) {
            log.debug("Exception occurred during message send:", e);
            if (callback != null)
                callback.onCompletion(null, e);
            this.interceptors.onSendError(record, tp, e);
            return new FutureFailure(e);
        } catch (InterruptedException e) {
            this.interceptors.onSendError(record, tp, e);
            throw new InterruptException(e);
        } catch (BufferExhaustedException e) {
            this.interceptors.onSendError(record, tp, e);
            throw e;
        } catch (KafkaException e) {
            this.interceptors.onSendError(record, tp, e);
            throw e;
        } catch (Exception e) {
            // we notify interceptor about all exceptions, since onSend is called before anything else in this method
            this.interceptors.onSendError(record, tp, e);
            throw e;


        KafkaProducer#waitOnMetadata 方法用于获取集群元数据信息:如果本地缓存的集群元数据中不包含目标 topic,或者指定的分区超出了已知的分区范围,则会请求更新元数据并唤醒 Sender 线程执行更新。

private ClusterAndWaitTime waitOnMetadata(String topic, Integer partition, long maxWaitMs) throws InterruptedException {
        // add topic to metadata topic list if it is not there already and reset expiry
        // 获取当前集群信息
        Cluster cluster = metadata.fetch();

        if (cluster.invalidTopics().contains(topic))
            throw new InvalidTopicException(topic);
        // 添加 topic 到集合中,如果是新 topic,标记需要更新集群元数据信息,即把metadata中的 needUpdate 置为true

        Integer partitionsCount = cluster.partitionCountForTopic(topic);
        // Return cached metadata if we have it, and if the record's partition is either undefined
        // or within the known partition range
        // 如果参数未指定分区,或指定的分区在当前记录的分区范围之内,则返回历史集群信息
        if (partitionsCount != null && (partition == null || partition < partitionsCount))
            return new ClusterAndWaitTime(cluster, 0);

        /* 否则,当前缓存的集群元数据信息可能已经过期,需要进行更新 */
        long begin = time.milliseconds();
        long remainingWaitMs = maxWaitMs;
        long elapsed;
        // Issue metadata requests until we have metadata for the topic and the requested partition,
        // or until maxWaitTimeMs is exceeded. This is necessary in case the metadata
        // is stale and the number of partitions for this topic has increased in the meantime.
        do {
            if (partition != null) {
                log.trace("Requesting metadata update for partition {} of topic {}.", partition, topic);
            } else {
                log.trace("Requesting metadata update for topic {}.", topic);
            // 更新 Metadata 的 needUpdate 字段,并获取当前元数据的版本号
            int version = metadata.requestUpdate();
            // 唤醒 sender 线程,由 sender 线程负责更新元数据信息
            try {
                // 等待元数据更新完成
                metadata.awaitUpdate(version, remainingWaitMs);
            } catch (TimeoutException ex) {
                // Rethrow with original maxWaitMs to prevent logging exception with remainingWaitMs
                // 等待超时
                throw new TimeoutException(
                        String.format("Topic %s not present in metadata after %d ms.",
                                topic, maxWaitMs));
            // 获取更新后的集群信息
            cluster = metadata.fetch();
            elapsed = time.milliseconds() - begin;
            if (elapsed >= maxWaitMs) {
                // 等待超时
                throw new TimeoutException(partitionsCount == null ?
                        String.format("Topic %s not present in metadata after %d ms.",
                                topic, maxWaitMs) :
                        String.format("Partition %d of topic %s with partition count %d is not present in metadata after %d ms.",
                                partition, topic, partitionsCount, maxWaitMs));
            // 更新剩余等待时间
            remainingWaitMs = maxWaitMs - elapsed;
            // 获取指定 topic 的分区数目
            partitionsCount = cluster.partitionCountForTopic(topic);
            // // 更新集群信息失败,继续重试,或者更新到底 partition 数目,小于指定要发送的partition数目,则重新更新metadata
        } while (partitionsCount == null || (partition != null && partition >= partitionsCount));

        return new ClusterAndWaitTime(cluster, elapsed);

RecordAccumulator#append 方法

        doSend 方法最终调用 RecordAccumulator#append 方法,将消息缓存到消息收集器 RecordAccumulator 中。

public RecordAppendResult append(TopicPartition tp,
                                     long timestamp,
                                     byte[] key,
                                     byte[] value,
                                     Header[] headers,
                                     Callback callback,
                                     long maxTimeToBlock,
                                     boolean abortOnNewBatch) throws InterruptedException {
        // We keep track of the number of appending thread to make sure we do not miss batches in
        // abortIncompleteBatches().
        // 记录正在向收集器中追加消息的线程数
        ByteBuffer buffer = null;
        if (headers == null) headers = Record.EMPTY_HEADERS;
        try {
            // check if we have an in-progress batch
            // 获取当前 topic 分区对应的 Deque,如果不存在则创建一个
            Deque<ProducerBatch> dq = getOrCreateDeque(tp);
            synchronized (dq) {
                if (closed)
                    // producer 已经被关闭了,抛出异常
                    throw new KafkaException("Producer closed while send in progress");
                // 向 Deque 中最后一个 RecordBatch 追加 Record,并返回对应的 RecordAppendResult 对象
                // 如果追加失败,一般都是因为该 RecordBatch 没有足够的空间足以容纳
                RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
                // 追加成功,直接返回
                if (appendResult != null)
                    return appendResult;

            // we don't have an in-progress record batch try to allocate a new batch
            if (abortOnNewBatch) {
                // Return a result that will cause another call to append.
                return new RecordAppendResult(null, false, false, true);
            byte maxUsableMagic = apiVersions.maxUsableProduceMagic();
            /* 追加 Record 失败,尝试申请新的 buffer */
            int size = Math.max(this.batchSize, AbstractRecords.estimateSizeInBytesUpperBound(maxUsableMagic, compression, key, value, headers));
            log.trace("Allocating a new {} byte message buffer for topic {} partition {}", size, tp.topic(), tp.partition());
            // 申请新的 buffer
            buffer = free.allocate(size, maxTimeToBlock);
            synchronized (dq) {
                // Need to check if producer is closed again after grabbing the dequeue lock.
                if (closed)
                    throw new KafkaException("Producer closed while send in progress");
                // 可能中间有空闲的batch出来,所以再次尝试向 Deque 中最后一个 RecordBatch 追加 Record
                RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
                if (appendResult != null) {
                    // 追加成功则返回,同时在finally归还之前申请的 buffer
                    // Somebody else found us a batch, return the one we waited for! Hopefully this doesn't happen often...
                    return appendResult;
                /* 仍然追加失败,创建一个新的 RecordBatch 进行追加 */
                MemoryRecordsBuilder recordsBuilder = recordsBuilder(buffer, maxUsableMagic);
                ProducerBatch batch = new ProducerBatch(tp, recordsBuilder, time.milliseconds());
                // 在新创建的 RecordBatch 中追加 Record
                FutureRecordMetadata future = Objects.requireNonNull(batch.tryAppend(timestamp, key, value, headers,
                        callback, time.milliseconds()));

                // 追加到未完成的集合中

                // Don't deallocate this buffer in the finally block as it's being used in the record batch
                buffer = null;
                // 封装成 RecordAppendResult 对象返回
                return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true, false);
        } finally {
            // 归还之前申请的 buffer
            if (buffer != null)

上述过程多次调用到 RecordAccumulator#tryAppend 方法,下面来看一下该方法的实现:

private RecordAppendResult tryAppend(long timestamp, byte[] key, byte[] value, Header[] headers,
                                         Callback callback, Deque<ProducerBatch> deque) {
        // 获取 deque 的最后一个 RecordBatch
        ProducerBatch last = deque.peekLast();
        if (last != null) {
            // 尝试往该 RecordBatch 末尾追加消息
            FutureRecordMetadata future = last.tryAppend(timestamp, key, value, headers, callback, time.milliseconds());
            if (future == null)
                // 追加失败
                // 追加成功,将结果封装成 RecordAppendResult 对象返回
                return new RecordAppendResult(future, deque.size() > 1 || last.isFull(), false, false);
        return null;


public FutureRecordMetadata tryAppend(long timestamp, byte[] key, byte[] value, Header[] headers, Callback callback, long now) {
        // 检测是否还有多余的空间容纳该消息
        if (!recordsBuilder.hasRoomFor(timestamp, key, value, headers)) {
            // 没有多余的空间则直接返回,后面会尝试申请新的空间
            return null;
        } else {
            // 添加当前消息到 MemoryRecords,并返回消息对应的 CRC32 校验码
            Long checksum = this.recordsBuilder.append(timestamp, key, value, headers);
            // 更新最大 record 字节数
            this.maxRecordSize = Math.max(this.maxRecordSize, AbstractRecords.estimateSizeInBytesUpperBound(magic(),
                    recordsBuilder.compressionType(), key, value, headers));
            // 更新最后一次追加记录时间戳
            this.lastAppendTime = now;
            FutureRecordMetadata future = new FutureRecordMetadata(this.produceFuture, this.recordCount,
                                                                   timestamp, checksum,
                                                                   key == null ? -1 : key.length,
                                                                   value == null ? -1 : value.length,
            // we have to keep every future returned to the users in case the batch needs to be
            // split to several new batches and resent.
            // 如果指定了 Callback,将 Callback 和 FutureRecordMetadata 封装到 Trunk 中
            thunks.add(new Thunk(callback, future));
            return future;