Elasticsearch 作为一个分布式索引,目前在各个公司的应用场景还是很多的,下面我们就对索引的写入流程做一个简单的介绍。我们以一次批量写入索引的请求为例(ES 版本 7.10.2)进行说明:首先,客户端使用下面的请求向 ES 发起批量写入
PUT /test/_bulk
{"index":{"_id":"11"}}
{"name": "John Doe","age":23,"bir":"2012-12-12"}
{"index":{"_id":"12"}}
{"name": "Jane Doe","age":23,"bir":"2012-12-12"}
这个请求会被 org.elasticsearch.rest.RestController类中的dispatchRequest方法接收;
//流程 1-1方法 分发客户端的写入请求
/**
 * Flow 1-1: receives the client's write request and hands it to {@code tryAllHandlers}.
 * If dispatching throws, an error response built from the exception is sent on the channel;
 * if sending that response also fails, the second failure is logged with the first one
 * attached as suppressed.
 *
 * @param request       the incoming REST request
 * @param channel       the channel used to answer the client
 * @param threadContext passed through to {@code tryAllHandlers}
 */
public void dispatchRequest(RestRequest request, RestChannel channel, ThreadContext threadContext) {
    try {
        // main path, continues in flow 1-2
        tryAllHandlers(request, channel, threadContext);
    } catch (Exception dispatchFailure) {
        try {
            final BytesRestResponse failureResponse = new BytesRestResponse(channel, dispatchFailure);
            channel.sendResponse(failureResponse);
        } catch (Exception sendFailure) {
            // keep the original dispatch failure visible in the logged exception
            sendFailure.addSuppressed(dispatchFailure);
            logger.error(
                () -> new ParameterizedMessage("failed to send failure response for uri [{}]", request.uri()),
                sendFailure);
        }
    }
}
//流程1-2 方法
/**
 * Flow 1-2: forwards the request to the {@code dispatchRequest} overload that takes a handler.
 * Only the final dispatch call is shown in this excerpt.
 *
 * @param request       the incoming REST request
 * @param channel       the channel used to answer the client
 * @param threadContext not used in the lines shown
 * @throws Exception whatever the dispatch call throws
 */
private void tryAllHandlers(final RestRequest request, final RestChannel channel, final
ThreadContext threadContext) throws Exception {
// A series of request validations happens before this point (not shown); a valid request continues on the main path, flow 1-3.
// NOTE(review): handler is not declared in this excerpt - presumably resolved by the omitted code; confirm against the full source.
dispatchRequest(request, channel, handler);
}
//流程1-3 方法
private void dispatchRequest(RestRequest request, RestChannel channel, RestHandler handler) throws Exception {
流程 1-4
handler.handleRequest(request, responseChannel, client);
}
如上面代码流程所示,请求经过验证(比如请求头、请求体等的验证)后,会分发给 RestHandler 接口中的 handleRequest 方法去处理。跟踪源码后,你会发现这个调用最终落到一个实现了 RestHandler 接口的抽象类 org.elasticsearch.rest.BaseRestHandler 的 handleRequest 方法中,其方法如下:
// 流程1-4 方法
/**
 * Flow 1-4: asks {@code prepareRequest} to turn the REST request into a {@code RestChannelConsumer}.
 * The excerpt ends right after that call.
 *
 * @param request the incoming REST request
 * @param channel the channel used to answer the client (not used in the lines shown)
 * @param client  the node client handed to {@code prepareRequest}
 * @throws Exception whatever {@code prepareRequest} throws
 */
public final void handleRequest(RestRequest request, RestChannel channel, NodeClient client) throws Exception {
// prepare the request for execution; has the side effect of touching the request parameters
// flow 1-5
final RestChannelConsumer action = prepareRequest(request, client);
// NOTE(review): action is not used in the lines shown - the remainder of the method is omitted from this excerpt.
}
经过上面1-5 的prepareRequest(request, client) 方法,会经过org.elasticsearch.rest.action.document.RestBulkAction类的prepareRequest方法对请求进行封装
// 流程 1-5 方法 组装请求
/**
 * Flow 1-5: assembles a {@code BulkRequest} from the REST request.
 * <p>
 * Reads the request-level defaults (index, type, routing, source fetching, pipeline,
 * require-alias), applies {@code wait_for_active_shards}, {@code timeout} and {@code refresh},
 * then parses the request body into the bulk request.
 *
 * @param request the REST request carrying the bulk payload and parameters
 * @param client  the node client that will execute the bulk request
 * @return a consumer that, given a channel, submits the bulk request through {@code client}
 * @throws IOException declared by the original signature
 */
public RestChannelConsumer prepareRequest(final RestRequest request, final NodeClient client) throws IOException {
    final BulkRequest bulk = Requests.bulkRequest();
    final String indexDefault = request.param("index");
    String typeDefault = request.param("type");
    if (typeDefault != null) {
        // an explicit type was supplied: emit the deprecation message
        deprecationLogger.deprecate("bulk_with_types", RestBulkAction.TYPES_DEPRECATION_MESSAGE);
    } else {
        typeDefault = MapperService.SINGLE_MAPPING_NAME;
    }
    final String routingDefault = request.param("routing");
    final FetchSourceContext fetchSourceDefault = FetchSourceContext.parseFromRestRequest(request);
    final String pipelineDefault = request.param("pipeline");
    final String activeShards = request.param("wait_for_active_shards");
    if (activeShards != null) {
        bulk.waitForActiveShards(ActiveShardCount.parseString(activeShards));
    }
    final Boolean requireAliasDefault = request.paramAsBoolean(DocWriteRequest.REQUIRE_ALIAS, null);
    bulk.timeout(request.paramAsTime("timeout", BulkShardRequest.DEFAULT_TIMEOUT));
    bulk.setRefreshPolicy(request.param("refresh"));
    bulk.add(
        request.requiredContent(),
        indexDefault,
        typeDefault,
        routingDefault,
        fetchSourceDefault,
        pipelineDefault,
        requireAliasDefault,
        allowExplicitIndex,
        request.getXContentType());
    // flow 1-6: the client forwards the request
    return channel -> client.bulk(bulk, new RestStatusToXContentListener<>(channel));
}
请求交给客户端后,会依次经过 org.elasticsearch.client.support.AbstractClient 类中的 bulk、execute 方法,最后执行到 doExecute 方法;该方法由其实现类 org.elasticsearch.client.node.NodeClient 实现,相关代码如下:
// 流程1-6方法
/**
 * Flow 1-6: submits a bulk request by delegating to {@code execute} with {@code BulkAction.INSTANCE}.
 *
 * @param request  the bulk request to run
 * @param listener the listener handed on to {@code execute}
 */
public void bulk(final BulkRequest request, final ActionListener<BulkResponse> listener) {
// flow 1-7
execute(BulkAction.INSTANCE, request, listener);
}
方法1-7
public final <Request extends ActionRequest, Response extends ActionResponse> void execute(
ActionType<Response> action, Request request, ActionListener<Response> listener) {
listener = threadedWrapper.wrap(listener);
1-8
doExecute(action, request, listener);
}
// 流程 1-8 NodeClient类中的doExecute()方法,其内部调用executeLocally()方法
/**
 * Flow 1-8: executes the action by delegating to {@code executeLocally}.
 *
 * @param action   the action type to execute
 * @param request  the request for that action
 * @param listener the listener handed on to {@code executeLocally}
 */
public <Request extends ActionRequest, Response extends ActionResponse>
void doExecute(ActionType<Response> action, Request request, ActionListener<Response> listener) {
// flow 1-9
executeLocally(action, request, listener);
}
// 流程1-9
public <Request extends ActionRequest,Response extends ActionResponse
> Task executeLocally(ActionType<Response> action, Request request, ActionListener<Response> listener) {
流程1-10
return transportAction(action).execute(request, listener);
}
流程走到org.elasticsearch.action.support.TransportAction类的execute方法后,其最终会到
TransportBulkAction类doExecute方法和doInternalExecute方法,其会判断是否要创建索引,获取集群元数据和状态,最后生成一个BulkOperation
流程 1-10
protected void doExecute(Task task, BulkRequest bulkRequest, ActionListener<BulkResponse> listener) {
final long indexingBytes = bulkRequest.ramBytesUsed();
final boolean isOnlySystem = isOnlySystem(bulkRequest, clusterService.state().metadata().getIndicesLookup(), systemIndices);
final Releasable releasable = indexingPressure.markCoordinatingOperationStarted(indexingBytes, isOnlySystem);
final ActionListener<BulkResponse> releasingListener = ActionListener.runBefore(listener, releasable::close);
final String executorName = isOnlySystem ? Names.SYSTEM_WRITE : Names.WRITE;
try {
1-11
doInternalExecute(task, bulkRequest, executorName, releasingListener);
} catch (Exception e) {
releasingListener.onFailure(e);
}
}
//流程 1-11 方法
/**
 * Flow 1-11: reads cluster metadata and state, then continues in {@code executeBulk}.
 * This excerpt is heavily abridged; the logic that fills {@code autoCreateIndices} is not shown.
 *
 * @param task         the task associated with this request
 * @param bulkRequest  the bulk request to execute
 * @param executorName executor name chosen by the caller (not used in the lines shown)
 * @param listener     the listener handed on to {@code executeBulk}
 */
protected void doInternalExecute(Task task, BulkRequest bulkRequest, String executorName,
ActionListener<BulkResponse> listener) {
// fetch the cluster metadata
final Metadata metadata = clusterService.state().getMetadata();
final Version minNodeVersion =
clusterService.state().getNodes().getMinNodeVersion();
// decide whether any index has to be auto-created
Set<String> autoCreateIndices = new HashSet<>();
// cluster state
ClusterState state = clusterService.state();
// As shown, the set was just created empty, so this branch is always taken in the excerpt.
if (autoCreateIndices.isEmpty()) {
// flow 1-12
// NOTE(review): startTime, responses and indicesThatCannotBeCreated are not declared in this excerpt - presumably defined by omitted code.
executeBulk(task, bulkRequest, startTime, listener, responses, indicesThatCannotBeCreated);
}
}
// 流程1-12 方法
void executeBulk(Task task, final BulkRequest bulkRequest, final long startTimeNanos, final ActionListener<BulkResponse> listener,
final AtomicArray<BulkItemResponse> responses, Map<String, IndexNotFoundException> indicesThatCannotBeCreated) {
流程 1-13
new BulkOperation(task, bulkRequest, listener, responses, startTimeNanos, indicesThatCannotBeCreated).run();
}
如上面代码所示,请求最终会交给 TransportBulkAction 的内部类 BulkOperation,并执行其 doRun 方法,其方法源码如下:
流程1-13 方法
protected void doRun() {
//判断请求类型
switch (docWriteRequest.opType()) {
case CREATE:
case INDEX:
case UPDATE:
case DELETE:
}
Map<ShardId, List<BulkItemRequest>> requestsByShard = new HashMap<>();
for (int i = 0; i < bulkRequest.requests.size(); i++) {
DocWriteRequest<?> request = bulkRequest.requests.get(i);
if (request == null) {
continue;
}
String concreteIndex = concreteIndices.getConcreteIndex(request.index()).getName();
ShardId shardId = clusterService.operationRouting().indexShards(clusterState, concreteIndex, request.id(),
request.routing()).shardId();
List<BulkItemRequest> shardRequests = requestsByShard.computeIfAbsent(shardId, shard -> new ArrayList<>());
shardRequests.add(new BulkItemRequest(i, request));
}
String nodeId = clusterService.localNode().getId();
for (Map.Entry<ShardId, List<BulkItemRequest>> entry : requestsByShard.entrySet()) {
final ShardId shardId = entry.getKey();
final List<BulkItemRequest> requests = entry.getValue();
BulkShardRequest bulkShardRequest = new BulkShardRequest(shardId, bulkRequest.getRefreshPolicy(),
requests.toArray(new BulkItemRequest[requests.size()]));
bulkShardRequest.waitForActiveShards(bulkRequest.waitForActiveShards());
bulkShardRequest.timeout(bulkRequest.timeout());
bulkShardRequest.routedBasedOnClusterVersion(clusterState.version());
if (task != null) {
bulkShardRequest.setParentTask(nodeId, task.getId());
}
// 1-14 方法
shardBulkAction.execute(bulkShardRequest, new ActionListener<BulkShardResponse>() {
@Override
public void onResponse(BulkShardResponse bulkShardResponse) {
for (BulkItemResponse bulkItemResponse : bulkShardResponse.getResponses()) {
// we may have no response if item failed
if (bulkItemResponse.getResponse() != null) {
bulkItemResponse.getResponse().setShardInfo(bulkShardResponse.getShardInfo());
}
responses.set(bulkItemResponse.getItemId(), bulkItemResponse);
}
if (counter.decrementAndGet() == 0) {
finishHim();
}
}
}
上面代码的主要流程可以概括如下:第一步是判断请求类型;然后把请求按分片进行分组,将用户的 BulkRequest 重新组织为基于 shard 的请求。例如客户端有10个写请求操作,如果这10个请求都落到同一个分片上,那就会合并成一个 shard 请求,并保存在一个 Map<ShardId, List<BulkItemRequest>> 中。请求合并是根据路由算法得出每个请求的文档落在哪个主分片上,路由算法是 shard_num = (hash(_routing) + hash(_id) % routing_partition_size) % num_primary_shards(这是设置了 routing_partition_size 时的形式;默认情况下为 shard_num = hash(_routing) % num_primary_shards,其中 _routing 默认取文档的 _id)。然后把基于 shard 的请求交给 TransportShardBulkAction 类:调用 TransportShardBulkAction 的 execute 方法发送请求,并在 listener 中等待响应,每个响应也是以 shard 为单位的。转发请求的具体实现最后位于 TransportReplicationAction.ReroutePhase#doRun
2-1 方法
/**
 * Flow 2-1: starts the reroute phase for a request.
 *
 * @param task     the task associated with this request
 * @param request  the request; its shard id must already be set (asserted below)
 * @param listener the listener handed on to {@code runReroutePhase}
 */
protected void doExecute(Task task, Request request, ActionListener<Response> listener) {
assert request.shardId() != null : "request shardId must be set";
// flow 2-2; the last argument is the initiatedByNodeClient flag
runReroutePhase(task, request, listener, true);
}
方法 2-2
private void runReroutePhase(Task task, Request request, ActionListener<Response> listener, boolean initiatedByNodeClient) {
try {
2-3 方法
new ReroutePhase((ReplicationTask) task, request, listener, initiatedByNodeClient).run();
} catch (RuntimeException e) {
listener.onFailure(e);
}
}
当执行到2-3方法后,会执行 ReroutePhase#doRun() 方法。该方法首先更改阶段状态并获取集群状态信息;然后通过 final ShardRouting primary = state.getRoutingTable().shardRoutingTable(request.shardId()).primaryShard(),以 shardId 获取主分片的路由信息;再根据主分片的路由信息找到主分片所在的节点,并判断该节点是否为处理请求的本地节点:若是本地节点则走 2-4-1 方法,否则走 2-4-2 方法;
2-3 方法关键代码走读
/**
 * Flow 2-3: finds the primary shard for the request and dispatches to the local or
 * remote action depending on which node holds it. Abridged excerpt.
 */
protected void doRun() {
setPhase(task, "routing");
final ClusterState state = observer.setAndGetObservedState();
// NOTE(review): blockException is computed but not used in the lines shown - its handling is presumably omitted.
final ClusterBlockException blockException = blockExceptions(state, request.shardId().getIndexName());
// routing entry of the primary shard for the requested shard id
final ShardRouting primary = state.getRoutingTable().shardRoutingTable(request.shardId()).primaryShard();
// node that currently holds the primary
final DiscoveryNode node = state.nodes().get(primary.currentNodeId());
if (primary.currentNodeId().equals(state.nodes().getLocalNodeId())) {
// flow 2-4-1: the primary is on the local node
// NOTE(review): indexMetadata is not declared in this excerpt - presumably defined by omitted code.
performLocalAction(state, primary, node, indexMetadata);
} else {
// flow 2-4-2: the primary is on another node
performRemoteAction(state, primary, node);
}
}
下面以2-4-1 方法为例进行说明
2-4-1 方法
private void performLocalAction(ClusterState state, ShardRouting primary, DiscoveryNode node, IndexMetadata indexMetadata) {
setPhase(task, "waiting_on_primary");
if (logger.isTraceEnabled()) {
logger.trace("send action [{}] to local primary [{}] for request [{}] with cluster state version [{}] to [{}] ",
transportPrimaryAction, request.shardId(), request, state.version(), primary.currentNodeId());
}
2-5 方法
performAction(node, transportPrimaryAction, true,
new ConcreteShardRequest<>(request, primary.allocationId().getId(), indexMetadata.primaryTerm(primary.id()), true,
initiatedByNodeClient));
}
2-5 方法
private void performAction(final DiscoveryNode node, final String action, final boolean isPrimaryAction,
final TransportRequest requestToPerform) {
//2-6 经RPC把请求发送
transportService.sendRequest(node, action, requestToPerform, transportOptions, new TransportResponseHandler<Response>()
}
// 2-7 接受发送的请求
/**
 * Flow 2-7: receives the request that was sent to the primary and starts an
 * {@code AsyncPrimaryAction} for it.
 * <p>
 * The releasable returned by {@code checkPrimaryLimits} is combined with the channel
 * listener via {@code ActionListener.runBefore}; a {@code RuntimeException} thrown while
 * starting the action is reported through that combined listener.
 *
 * @param request the shard-level request wrapper
 * @param channel the transport channel used to answer the sender
 * @param task    the task, cast to {@code ReplicationTask}
 */
protected void handlePrimaryRequest(final ConcreteShardRequest<Request> request, final TransportChannel channel, final Task task) {
    final Releasable primaryLimits = checkPrimaryLimits(
        request.getRequest(),
        request.sentFromLocalReroute(),
        request.localRerouteInitiatedByNodeClient());
    final ActionListener<Response> responseListener = ActionListener.runBefore(
        new ChannelActionListener<>(channel, transportPrimaryAction, request),
        primaryLimits::close);
    try {
        // flow 2-8
        final AsyncPrimaryAction primaryAction = new AsyncPrimaryAction(request, responseListener, (ReplicationTask) task);
        primaryAction.run();
    } catch (RuntimeException e) {
        responseListener.onFailure(e);
    }
}
如上源码所示,请求会经过 transportService#sendRequest 方法发送出去,并由 handlePrimaryRequest 方法接收,再交给内部类 AsyncPrimaryAction 的 run 方法执行
// 2-9 方法
/**
 * Validates that the local shard is still the primary the request was routed to, then
 * acquires a primary operation permit and continues in {@code runWithPrimaryShardReference}.
 * <p>
 * Three checks run before the permit is requested: the shard must be a primary, its
 * allocation id must equal the one the request targets, and its pending primary term must
 * equal the term carried by the request.
 *
 * @throws Exception if one of the checks fails or the lookup of the shard throws
 */
protected void doRun() throws Exception {
    final ShardId shardId = primaryRequest.getRequest().shardId();
    final IndexShard indexShard = getIndexShard(shardId);
    final ShardRouting localRouting = indexShard.routingEntry();

    // The cluster state used for routing can be so stale that this shard copy has since
    // become a replica - e.g. in a two node cluster the primary fails, the replica takes
    // over, and a new replica is assigned to the first node.
    if (!localRouting.primary()) {
        throw new ReplicationOperation.RetryOnPrimaryException(shardId, "actual shard is not a primary " + localRouting);
    }

    final String localAllocationId = localRouting.allocationId().getId();
    if (!localAllocationId.equals(primaryRequest.getTargetAllocationID())) {
        throw new ShardNotFoundException(shardId, "expected allocation id [{}] but found [{}]",
            primaryRequest.getTargetAllocationID(), localAllocationId);
    }

    final long localTerm = indexShard.getPendingPrimaryTerm();
    if (localTerm != primaryRequest.getPrimaryTerm()) {
        throw new ShardNotFoundException(shardId, "expected allocation id [{}] with term [{}] but found [{}]",
            primaryRequest.getTargetAllocationID(), primaryRequest.getPrimaryTerm(), localTerm);
    }

    // flow 2-10
    acquirePrimaryOperationPermit(
        indexShard,
        primaryRequest.getRequest(),
        ActionListener.wrap(
            releasable -> runWithPrimaryShardReference(new PrimaryShardReference(indexShard, releasable)),
            // a shard that is not in primary mode is reported as a retryable failure
            e -> onFailure(e instanceof ShardNotInPrimaryModeException
                ? new ReplicationOperation.RetryOnPrimaryException(shardId, "shard is not in primary mode", e)
                : e)));
}
// 2-10 方法
/**
 * Flow 2-10: requests a primary operation permit from the given shard.
 *
 * @param primary    the primary shard to acquire the permit on
 * @param request    the request, handed to the shard's {@code acquirePrimaryOperationPermit}
 * @param onAcquired listener handed to the shard's {@code acquirePrimaryOperationPermit}
 */
protected void acquirePrimaryOperationPermit(final IndexShard primary,
final Request request,
final ActionListener<Releasable> onAcquired) {
// flow 2-11; executor and forceExecutionOnPrimary are not declared in this excerpt
primary.acquirePrimaryOperationPermit(onAcquired, executor, request, forceExecutionOnPrimary);
}
最后请求会落到 IndexShard#acquirePrimaryOperationPermit 方法中,之后再跟进到 applyIndexOperationOnPrimary 方法:
2-11
/**
 * Flow 2-11: acquires a primary operation permit from {@code indexShardOperationPermits}.
 *
 * @param onPermitAcquired listener, wrapped by {@code wrapPrimaryOperationPermitListener} before use
 * @param executorOnDelay  executor name handed on to the permit acquisition
 * @param debugInfo        debug information handed on to the permit acquisition
 * @param forceExecution   flag handed on to the permit acquisition
 */
public void acquirePrimaryOperationPermit(ActionListener<Releasable> onPermitAcquired, String executorOnDelay, Object debugInfo,
boolean forceExecution) {
verifyNotClosed();
// only a primary shard may hand out primary permits
assert shardRouting.primary() : "acquirePrimaryOperationPermit should only be called on primary shard: " + shardRouting;
indexShardOperationPermits.acquire(wrapPrimaryOperationPermitListener(onPermitAcquired), executorOnDelay, forceExecution,
debugInfo);
}
2-12
public Engine.IndexResult applyIndexOperationOnPrimary(long version, VersionType versionType, SourceToParse sourceToParse,
long ifSeqNo, long ifPrimaryTerm, long autoGeneratedTimestamp,
boolean isRetry)
throws IOException {
assert versionType.validateVersionForWrites(version);
方法2-13
return applyIndexOperation(getEngine(), UNASSIGNED_SEQ_NO, getOperationPrimaryTerm(), version, versionType, ifSeqNo,
ifPrimaryTerm, autoGeneratedTimestamp, isRetry, Engine.Operation.Origin.PRIMARY, sourceToParse);
}
后面跟进到 applyIndexOperation 方法中,其首先对主分片的 primaryTerm 进行校验(断言操作的 term 不大于分片当前的 term),然后解析文档并生成索引操作:
//方法2-13
/**
 * Flow 2-13: builds an {@code Engine.Index} operation from the source and passes it to
 * {@code index}.
 * <p>
 * Asserts that the operation's term does not exceed the shard's, checks that writes are
 * allowed for the origin, resolves the document type, and prepares the operation. If the
 * parsed document carries a dynamic mapping update, a result holding that update is
 * returned instead of indexing. Exceptions raised while preparing the operation are turned
 * into a failed result after {@code verifyNotClosed(e)}.
 *
 * @return the engine result, a mapping-update result, or a failure result
 * @throws IOException declared by the original signature
 */
private Engine.IndexResult applyIndexOperation(Engine engine, long seqNo, long opPrimaryTerm, long version,
                                               @Nullable VersionType versionType, long ifSeqNo, long ifPrimaryTerm,
                                               long autoGeneratedTimeStamp, boolean isRetry, Engine.Operation.Origin origin,
                                               SourceToParse sourceToParse) throws IOException {
    assert opPrimaryTerm <= getOperationPrimaryTerm()
        : "op term [ " + opPrimaryTerm + " ] > shard term [" + getOperationPrimaryTerm() + "]";
    ensureWriteAllowed(origin);

    Engine.Index indexOp;
    try {
        final String resolvedType = mapperService.resolveDocumentType(sourceToParse.type());
        // reuse the incoming source unless the resolved type differs from the one it carries
        final SourceToParse typedSource = resolvedType.equals(sourceToParse.type())
            ? sourceToParse
            : new SourceToParse(sourceToParse.index(), resolvedType, sourceToParse.id(),
                sourceToParse.source(), sourceToParse.getXContentType(), sourceToParse.routing());
        indexOp = prepareIndex(docMapper(resolvedType), typedSource,
            seqNo, opPrimaryTerm, version, versionType, origin, autoGeneratedTimeStamp, isRetry, ifSeqNo, ifPrimaryTerm);
        final Mapping mappingUpdate = indexOp.parsedDoc().dynamicMappingsUpdate();
        if (mappingUpdate != null) {
            return new Engine.IndexResult(mappingUpdate);
        }
    } catch (Exception e) {
        // Failures during parsing or mapping update are reported as document-level failures
        // rather than thrown, so they do not block replication of earlier operations to the
        // replicas; verifyNotClosed(e) runs first.
        verifyNotClosed(e);
        return new Engine.IndexResult(e, version, opPrimaryTerm, seqNo);
    }
    // flow 2-14
    return index(engine, indexOp);
}
// 方法 2-14
/**
 * Flow 2-14: runs the index operation on the engine, surrounded by the indexing
 * operation listeners and trace logging.
 *
 * @param engine the engine that performs the write
 * @param index  the prepared index operation; replaced by the value returned from {@code preIndex}
 * @return the result returned by {@code engine.index}
 * @throws IOException declared by the original signature
 */
private Engine.IndexResult index(Engine engine, Engine.Index index) throws IOException {
active.set(true);
final Engine.IndexResult result;
// listeners run before the write; the operation they return is the one that gets indexed
index = indexingOperationListeners.preIndex(shardId, index);
try {
if (logger.isTraceEnabled()) {
// don't use index.source().utf8ToString() here source might not be valid UTF-8
logger.trace("index [{}][{}] seq# [{}] allocation-id [{}] primaryTerm [{}] operationPrimaryTerm [{}] origin [{}]",
index.type(), index.id(), index.seqNo(), routingEntry().allocationId(), index.primaryTerm(), getOperationPrimaryTerm(),
index.origin());
}
// the actual write
result = engine.index(index);
if (logger.isTraceEnabled()) {
logger.trace("index-done [{}][{}] seq# [{}] allocation-id [{}] primaryTerm [{}] operationPrimaryTerm [{}] origin [{}] " +
"result-seq# [{}] result-term [{}] failure [{}]",
index.type(), index.id(), index.seqNo(), routingEntry().allocationId(), index.primaryTerm(), getOperationPrimaryTerm(),
index.origin(), result.getSeqNo(), result.getTerm(), result.getFailure());
}
} catch (Exception e) {
if (logger.isTraceEnabled()) {
logger.trace(new ParameterizedMessage(
"index-fail [{}][{}] seq# [{}] allocation-id [{}] primaryTerm [{}] operationPrimaryTerm [{}] origin [{}]",
index.type(), index.id(), index.seqNo(), routingEntry().allocationId(), index.primaryTerm(), getOperationPrimaryTerm(),
index.origin()
), e);
}
// notify the listeners of the failure, then rethrow it to the caller
indexingOperationListeners.postIndex(shardId, index, e);
throw e;
}
// success path: notify the listeners with the result
indexingOperationListeners.postIndex(shardId, index, result);
return result;
}
最后经过Engine#index 方法写入