send接口会把消息经过处理后,放在一个缓存中,由后台sender线程从缓存中取出,然后发送到服务端,这一篇介绍放入缓存的send接口。
了解了 KafkaProducer 的字段定义和对象的构造过程之后,下面正式开始对消息收集的过程进行分析,相关实现位于 KafkaProducer#send 方法中:
/**
 * Entry point for handing a record to the producer.
 *
 * <p>The record is first passed through the registered interceptors, which may
 * return a modified record; the intercepted record is then handed to
 * {@code doSend} together with the caller's callback.
 *
 * @param record   the record supplied by the caller
 * @param callback the callback forwarded unchanged to {@code doSend}
 * @return whatever {@code doSend} returns for the intercepted record
 */
public Future<RecordMetadata> send(ProducerRecord<K, V> record, Callback callback) {
    // The interceptors run before anything else; per the original note, onSend does not throw.
    return doSend(this.interceptors.onSend(record), callback);
}
方法 KafkaProducer#doSend 实现了消息收集的具体过程:
/**
 * Collects a single record for sending.
 *
 * <p>Steps, in order: wait for metadata of the target topic, serialize key and value,
 * resolve the target partition, validate the estimated record size, append the record
 * to the accumulator, and wake the sender when the append result reports a full batch
 * or a newly created batch.
 *
 * <p>Error handling: an {@code ApiException} is reported through {@code callback} and a
 * failed future is returned; every other exception is rethrown. In all cases the
 * interceptors are notified through {@code onSendError}.
 *
 * @param record   the (already intercepted) record to send
 * @param callback user callback; may be null (it is null-checked before use below)
 * @return the future taken from the append result, or a failed future for an {@code ApiException}
 */
private Future<RecordMetadata> doSend(ProducerRecord<K, V> record, Callback callback) {
TopicPartition tp = null;
try {
throwIfProducerClosed();
// first make sure the metadata for the topic is available
ClusterAndWaitTime clusterAndWaitTime;
try {
// 1. Fetch the cluster metadata. waitOnMetadata requests a metadata update when the topic's
// partition count is unknown or the requested partition is outside the known range.
clusterAndWaitTime = waitOnMetadata(record.topic(), record.partition(), maxBlockTimeMs);
} catch (KafkaException e) {
if (metadata.isClosed())
throw new KafkaException("Producer closed while send in progress", e);
throw e;
}
// Time already spent waiting for metadata is deducted from the blocking budget used for the append below.
long remainingWaitMs = Math.max(0, maxBlockTimeMs - clusterAndWaitTime.waitedOnMetadataMs);
Cluster cluster = clusterAndWaitTime.cluster;
// 2. Serialize the key with the configured key serializer
byte[] serializedKey;
try {
serializedKey = keySerializer.serialize(record.topic(), record.headers(), record.key());
} catch (ClassCastException cce) {
throw new SerializationException("Can't convert key of class " + record.key().getClass().getName() +
" to class " + producerConfig.getClass(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG).getName() +
" specified in key.serializer", cce);
}
// 3. Serialize the value with the configured value serializer
byte[] serializedValue;
try {
serializedValue = valueSerializer.serialize(record.topic(), record.headers(), record.value());
} catch (ClassCastException cce) {
throw new SerializationException("Can't convert value of class " + record.value().getClass().getName() +
" to class " + producerConfig.getClass(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG).getName() +
" specified in value.serializer", cce);
}
// 4. Resolve the target partition through the partition(...) helper (not shown here; presumably it
// honors an explicitly specified partition and otherwise asks the configured partitioner - confirm)
int partition = partition(record, serializedKey, serializedValue, cluster);
// The topic-partition this record will be delivered to
tp = new TopicPartition(record.topic(), partition);
setReadOnly(record.headers());
/* 5. Append the record to the record accumulator (RecordAccumulator) */
Header[] headers = record.headers().toArray();
// Estimate an upper bound for the record size and validate it
int serializedSize = AbstractRecords.estimateSizeInBytesUpperBound(apiVersions.maxUsableProduceMagic(),
compressionType, serializedKey, serializedValue, headers);
ensureValidRecordSize(serializedSize);
// Fall back to the current time when the record carries no explicit timestamp
long timestamp = record.timestamp() == null ? time.milliseconds() : record.timestamp();
if (log.isTraceEnabled()) {
log.trace("Attempting to append record {} with callback {} to topic {} partition {}", record, callback, record.topic(), partition);
}
// producer callback will make sure to call both 'callback' and interceptor callback
Callback interceptCallback = new InterceptorCallback<>(callback, this.interceptors, tp);
if (transactionManager != null && transactionManager.isTransactional()) {
transactionManager.failIfNotReadyForSend();
}
// First append attempt: abortOnNewBatch=true, so the accumulator returns early instead of creating a new batch
RecordAccumulator.RecordAppendResult result = accumulator.append(tp, timestamp, serializedKey,
serializedValue, headers, interceptCallback, remainingWaitMs, true);
// The first attempt would have needed a new batch: notify the partitioner, re-resolve the partition
// and append again, this time allowing a new batch to be created (abortOnNewBatch=false).
if (result.abortForNewBatch) {
int prevPartition = partition;
partitioner.onNewBatch(record.topic(), cluster, prevPartition);
partition = partition(record, serializedKey, serializedValue, cluster);
tp = new TopicPartition(record.topic(), partition);
if (log.isTraceEnabled()) {
log.trace("Retrying append due to new batch creation for topic {} partition {}. The old partition was {}", record.topic(), partition, prevPartition);
}
// producer callback will make sure to call both 'callback' and interceptor callback
interceptCallback = new InterceptorCallback<>(callback, this.interceptors, tp);
result = accumulator.append(tp, timestamp, serializedKey,
serializedValue, headers, interceptCallback, remainingWaitMs, false);
}
if (transactionManager != null && transactionManager.isTransactional())
transactionManager.maybeAddPartitionToTransaction(tp);
/* 6. Conditionally wake up the sender thread */
if (result.batchIsFull || result.newBatchCreated) {
// The accumulator reported a full batch or a newly created batch, so wake the Sender thread to send
log.trace("Waking up the sender since topic {} partition {} is either full or getting a new batch", record.topic(), partition);
this.sender.wakeup();
}
return result.future;
// handling exceptions and record the errors;
// for API exceptions return them in the future,
// for other exceptions throw directly
} catch (ApiException e) {
log.debug("Exception occurred during message send:", e);
if (callback != null)
callback.onCompletion(null, e);
this.errors.record();
this.interceptors.onSendError(record, tp, e);
return new FutureFailure(e);
} catch (InterruptedException e) {
this.errors.record();
this.interceptors.onSendError(record, tp, e);
throw new InterruptException(e);
} catch (BufferExhaustedException e) {
this.errors.record();
this.metrics.sensor("buffer-exhausted-records").record();
this.interceptors.onSendError(record, tp, e);
throw e;
} catch (KafkaException e) {
this.errors.record();
this.interceptors.onSendError(record, tp, e);
throw e;
} catch (Exception e) {
// we notify interceptor about all exceptions, since onSend is called before anything else in this method
this.interceptors.onSendError(record, tp, e);
throw e;
}
}
waitOnMetadata
获取集群元数据信息,如果感知到本地缓存的集群元数据信息已经过期,则会通知 Sender 线程进行更新。
/**
 * Returns cluster metadata that covers {@code topic} (and {@code partition}, when given),
 * requesting metadata updates and waiting for them if the cached view is not sufficient.
 *
 * @param topic     the topic the record is destined for
 * @param partition the explicitly requested partition, or null when none was specified
 * @param maxWaitMs upper bound, in milliseconds, on the total time spent waiting
 * @return the cluster view together with the time spent waiting (0 when the cached view sufficed)
 * @throws InvalidTopicException if the cached cluster lists the topic as invalid
 * @throws TimeoutException      if suitable metadata is not available within {@code maxWaitMs}
 * @throws InterruptedException  declared by the signature; presumably raised while awaiting the update
 */
private ClusterAndWaitTime waitOnMetadata(String topic, Integer partition, long maxWaitMs) throws InterruptedException {
    // Start from the currently cached cluster view.
    Cluster cluster = metadata.fetch();
    if (cluster.invalidTopics().contains(topic)) {
        throw new InvalidTopicException(topic);
    }

    // Register the topic with the metadata (add topic to metadata topic list if it is not there already).
    metadata.add(topic);

    Integer partitionsCount = cluster.partitionCountForTopic(topic);
    // The cached view is good enough when the topic is known and the requested partition,
    // if any, lies within the known partition range.
    boolean cachedViewSuffices = partitionsCount != null
            && (partition == null || partition < partitionsCount);
    if (cachedViewSuffices) {
        return new ClusterAndWaitTime(cluster, 0);
    }

    // Otherwise keep requesting updates until the topic (and partition) shows up or the
    // time budget is spent. This also covers stale metadata after a partition increase.
    final long startMs = time.milliseconds();
    long waitBudgetMs = maxWaitMs;
    long elapsedMs;
    while (true) {
        if (partition == null) {
            log.trace("Requesting metadata update for topic {}.", topic);
        } else {
            log.trace("Requesting metadata update for partition {} of topic {}.", partition, topic);
        }
        metadata.add(topic);
        // Flag the metadata for update and remember the version we are waiting to move past.
        int version = metadata.requestUpdate();
        // The sender thread performs the actual metadata refresh.
        sender.wakeup();
        try {
            metadata.awaitUpdate(version, waitBudgetMs);
        } catch (TimeoutException ex) {
            // Rethrow with the original maxWaitMs so the message does not show the shrinking budget.
            throw new TimeoutException(
                    String.format("Topic %s not present in metadata after %d ms.",
                            topic, maxWaitMs));
        }

        cluster = metadata.fetch();
        elapsedMs = time.milliseconds() - startMs;
        if (elapsedMs >= maxWaitMs) {
            // Out of time; the message is built from the partition count seen before this refresh.
            String timeoutMessage;
            if (partitionsCount == null) {
                timeoutMessage = String.format("Topic %s not present in metadata after %d ms.",
                        topic, maxWaitMs);
            } else {
                timeoutMessage = String.format("Partition %d of topic %s with partition count %d is not present in metadata after %d ms.",
                        partition, topic, partitionsCount, maxWaitMs);
            }
            throw new TimeoutException(timeoutMessage);
        }
        metadata.maybeThrowExceptionForTopic(topic);

        waitBudgetMs = maxWaitMs - elapsedMs;
        partitionsCount = cluster.partitionCountForTopic(topic);
        // Stop once the topic is known and the requested partition (if any) is in range.
        if (partitionsCount != null && (partition == null || partition < partitionsCount)) {
            break;
        }
    }
    return new ClusterAndWaitTime(cluster, elapsedMs);
}
RecordAccumulator#append 方法
send方法最终调用 RecordAccumulator#append 方法将消息缓存到收集器 RecordAccumulator 中。
/**
 * Adds a record to the accumulator and returns the append result.
 *
 * <p>The record is first offered to the last batch of the partition's deque. If that fails
 * and {@code abortOnNewBatch} is true, a result with the abort flag set is returned so the
 * caller can call again. Otherwise a buffer is allocated, the append is retried under the
 * deque lock, and if it still fails a new batch backed by that buffer is created and queued.
 *
 * @param tp              the topic-partition the record is destined for
 * @param timestamp       the record timestamp
 * @param key             the serialized key
 * @param value           the serialized value
 * @param headers         the record headers; null is replaced by {@code Record.EMPTY_HEADERS}
 * @param callback        the callback attached to the record
 * @param maxTimeToBlock  passed to {@code free.allocate}; presumably the maximum time in
 *                        milliseconds to block waiting for buffer memory (confirm)
 * @param abortOnNewBatch when true, return early instead of creating a new batch
 * @return the result of the append
 * @throws InterruptedException declared by the signature; presumably raised while blocked in allocation
 */
public RecordAppendResult append(TopicPartition tp,
long timestamp,
byte[] key,
byte[] value,
Header[] headers,
Callback callback,
long maxTimeToBlock,
boolean abortOnNewBatch) throws InterruptedException {
// We keep track of the number of appending thread to make sure we do not miss batches in
// abortIncompleteBatches().
// Count this thread among those currently appending to the accumulator
appendsInProgress.incrementAndGet();
ByteBuffer buffer = null;
if (headers == null) headers = Record.EMPTY_HEADERS;
try {
// check if we have an in-progress batch
// Get the deque for this topic-partition, creating it if it does not exist yet
Deque<ProducerBatch> dq = getOrCreateDeque(tp);
synchronized (dq) {
if (closed)
// The producer has already been closed
throw new KafkaException("Producer closed while send in progress");
// Try to append the record to the last batch in the deque.
// tryAppend returns null when there is no batch or the last batch cannot take the record.
RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
// Appended successfully: return right away
if (appendResult != null)
return appendResult;
}
// we don't have an in-progress record batch try to allocate a new batch
if (abortOnNewBatch) {
// Return a result that will cause another call to append.
return new RecordAppendResult(null, false, false, true);
}
byte maxUsableMagic = apiVersions.maxUsableProduceMagic();
/* The append failed: allocate a new buffer, sized to the larger of batchSize and the record's estimated upper bound */
int size = Math.max(this.batchSize, AbstractRecords.estimateSizeInBytesUpperBound(maxUsableMagic, compression, key, value, headers));
log.trace("Allocating a new {} byte message buffer for topic {} partition {}", size, tp.topic(), tp.partition());
// Allocate the buffer; note this happens outside the deque lock
buffer = free.allocate(size, maxTimeToBlock);
synchronized (dq) {
// Need to check if producer is closed again after grabbing the dequeue lock.
if (closed)
throw new KafkaException("Producer closed while send in progress");
// The deque lock was released during allocation, so a batch with room may have appeared meanwhile;
// try once more to append to the last batch in the deque
RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
if (appendResult != null) {
// Appended successfully; buffer is still non-null here, so the finally block hands it back
// Somebody else found us a batch, return the one we waited for! Hopefully this doesn't happen often...
return appendResult;
}
/* Still could not append: create a new batch backed by the allocated buffer */
MemoryRecordsBuilder recordsBuilder = recordsBuilder(buffer, maxUsableMagic);
ProducerBatch batch = new ProducerBatch(tp, recordsBuilder, time.milliseconds());
// Append the record to the newly created batch; a null result here is treated as an error
FutureRecordMetadata future = Objects.requireNonNull(batch.tryAppend(timestamp, key, value, headers,
callback, time.milliseconds()));
dq.addLast(batch);
// Also track the batch in the set of incomplete batches
incomplete.add(batch);
// Don't deallocate this buffer in the finally block as it's being used in the record batch
buffer = null;
// Wrap the outcome in a RecordAppendResult and return it
return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true, false);
}
} finally {
// Hand back the allocated buffer unless ownership moved to a new batch (buffer set to null above)
if (buffer != null)
free.deallocate(buffer);
appendsInProgress.decrementAndGet();
}
}
上述过程多次调用到 RecordAccumulator#tryAppend 方法,下面来看一下该方法的实现:
/**
 * Tries to append the record to the last batch of {@code deque}.
 *
 * @return the append result on success; {@code null} when the deque is empty or the last
 *         batch could not take the record (in which case that batch is closed for
 *         further record appends)
 */
private RecordAppendResult tryAppend(long timestamp, byte[] key, byte[] value, Header[] headers,
                                     Callback callback, Deque<ProducerBatch> deque) {
    ProducerBatch tail = deque.peekLast();
    if (tail == null) {
        // No batch to append to.
        return null;
    }

    FutureRecordMetadata future = tail.tryAppend(timestamp, key, value, headers, callback, time.milliseconds());
    if (future != null) {
        // Appended: report the batch as full when the deque holds more than one batch or the tail is full.
        boolean batchIsFull = deque.size() > 1 || tail.isFull();
        return new RecordAppendResult(future, batchIsFull, false, false);
    }

    // The tail batch rejected the record; close it for further record appends.
    tail.closeForRecordAppends();
    return null;
}
最终调用 ProducerBatch#tryAppend 方法向批次中追加消息:
/**
 * Appends the record to this batch if the underlying records builder has room for it.
 *
 * @param now the time recorded as this batch's last append time
 * @return a future for the appended record, or {@code null} when the builder has no room
 */
public FutureRecordMetadata tryAppend(long timestamp, byte[] key, byte[] value, Header[] headers, Callback callback, long now) {
    // Not enough room: report failure so the caller can move on to another batch.
    if (!recordsBuilder.hasRoomFor(timestamp, key, value, headers)) {
        return null;
    }

    // Write the record into the builder; it hands back the record's checksum.
    Long checksum = this.recordsBuilder.append(timestamp, key, value, headers);

    // Track the largest estimated record size seen by this batch.
    int estimatedSize = AbstractRecords.estimateSizeInBytesUpperBound(magic(),
            recordsBuilder.compressionType(), key, value, headers);
    this.maxRecordSize = Math.max(this.maxRecordSize, estimatedSize);

    // Remember when the most recent append happened.
    this.lastAppendTime = now;

    int serializedKeySize = (key == null) ? -1 : key.length;
    int serializedValueSize = (value == null) ? -1 : value.length;
    FutureRecordMetadata future = new FutureRecordMetadata(this.produceFuture, this.recordCount,
            timestamp, checksum,
            serializedKeySize,
            serializedValueSize,
            Time.SYSTEM);

    // we have to keep every future returned to the users in case the batch needs to be
    // split to several new batches and resent.
    // The callback/future pair is stored as a Thunk unconditionally (the callback is not null-checked here).
    thunks.add(new Thunk(callback, future));
    this.recordCount++;
    return future;
}