Elasticsearch gateway模块源码分析
一:介绍
gateway模块负责集群元数据的存储,以及集群重启时元数据的恢复。Elasticsearch的元数据包含集群层元数据、索引层元数据和分片层元数据;
gateway模块主要负责集群层和索引层的元数据恢复,分片层的元数据是由allocation模块负责;
二:gateway元数据的恢复过程
当集群完全重启后,达到recovery条件时,进入元数据恢复流程,其主要实现在GatewayService类的clusterChanged方法中,源代码如下:
/**
 * Cluster-state listener that decides whether gateway state recovery may start.
 *
 * Recovery is only attempted when this service is running, the local node is the
 * elected master, the cluster state still carries STATE_NOT_RECOVERED_BLOCK, and
 * every configured recoverAfter* node-count threshold is satisfied. When all of
 * those hold, performStateRecovery is invoked with a flag saying whether the
 * recover_after_time delay has to be honoured, plus a human-readable reason.
 *
 * @param event the cluster-changed event carrying the new cluster state
 */
public void clusterChanged(final ClusterChangedEvent event) {
    if (lifecycle.stoppedOrClosed()) {
        return;
    }

    final ClusterState clusterState = event.state();
    final DiscoveryNodes nodes = clusterState.nodes();

    // Only the elected master drives metadata recovery.
    if (!nodes.isLocalNodeElectedMaster()) {
        return;
    }
    // The block is absent once the state has been recovered; nothing left to do.
    if (!clusterState.blocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK)) {
        return;
    }

    if (nodes.getMasterNodeId() == null) {
        logger.debug("not recovering from gateway, no master elected yet");
        return;
    }
    // recover_after_nodes: minimum number of master+data nodes that must have joined.
    if (recoverAfterNodes != -1 && nodes.getMasterAndDataNodes().size() < recoverAfterNodes) {
        logger.debug("not recovering from gateway, nodes_size (data+master) [{}] < recover_after_nodes [{}]",
            nodes.getMasterAndDataNodes().size(), recoverAfterNodes);
        return;
    }
    // recover_after_data_nodes: minimum number of data nodes that must have joined.
    if (recoverAfterDataNodes != -1 && nodes.getDataNodes().size() < recoverAfterDataNodes) {
        logger.debug("not recovering from gateway, nodes_size (data) [{}] < recover_after_data_nodes [{}]",
            nodes.getDataNodes().size(), recoverAfterDataNodes);
        return;
    }
    // recover_after_master_nodes: minimum number of master nodes that must have joined.
    if (recoverAfterMasterNodes != -1 && nodes.getMasterNodes().size() < recoverAfterMasterNodes) {
        logger.debug("not recovering from gateway, nodes_size (master) [{}] < recover_after_master_nodes [{}]",
            nodes.getMasterNodes().size(), recoverAfterMasterNodes);
        return;
    }

    // All recoverAfter* thresholds are met. Work out whether the recover_after_time
    // delay still applies: it does when no expected* value is configured at all, or
    // when a configured expected* value has not been reached yet.
    final boolean enforceRecoverAfterTime;
    final String reason;
    if (expectedNodes == -1 && expectedMasterNodes == -1 && expectedDataNodes == -1) {
        enforceRecoverAfterTime = true;
        reason = "recover_after_time was set to [" + recoverAfterTime + "]";
    } else if (expectedNodes != -1 && nodes.getMasterAndDataNodes().size() < expectedNodes) {
        enforceRecoverAfterTime = true;
        reason = "expecting [" + expectedNodes + "] nodes, but only have [" + nodes.getMasterAndDataNodes().size() + "]";
    } else if (expectedDataNodes != -1 && nodes.getDataNodes().size() < expectedDataNodes) {
        enforceRecoverAfterTime = true;
        reason = "expecting [" + expectedDataNodes + "] data nodes, but only have [" + nodes.getDataNodes().size() + "]";
    } else if (expectedMasterNodes != -1 && nodes.getMasterNodes().size() < expectedMasterNodes) {
        enforceRecoverAfterTime = true;
        reason = "expecting [" + expectedMasterNodes + "] master nodes, but only have [" + nodes.getMasterNodes().size() + "]";
    } else {
        // Every configured expectation is satisfied: the timeout is ignored.
        enforceRecoverAfterTime = false;
        reason = "";
    }

    // Kick off the actual recovery.
    performStateRecovery(enforceRecoverAfterTime, reason);
}
最终的状态恢复由Gateway类的performStateRecovery方法执行,其代码实现流程如下:
public void performStateRecovery(final GatewayStateRecoveredListener listener) throws GatewayException {
//获取集群的master的节点
String[] nodesIds = clusterService.state().nodes().getMasterNodes().keys().toArray(String.class);
logger.trace("performing state recovery from {}", Arrays.toString(nodesIds));
//获取各个master节点的状态元数据
TransportNodesListGatewayMetaState.NodesGatewayMetaState nodesState = listGatewayMetaState.list(nodesIds, null).actionGet(); int requiredAllocation = Math.max(1, minimumMasterNodes);
if (nodesState.hasFailures()) {
for (FailedNodeException failedNodeException : nodesState.failures()) {
logger.warn("failed to fetch state from node", failedNodeException);
}
} ObjectFloatHashMap<Index> indices = new ObjectFloatHashMap<>();
//被选举的全局状态变量
MetaData electedGlobalState = null;
int found = 0;
for (TransportNodesListGatewayMetaState.NodeGatewayMetaState nodeState : nodesState.getNodes()) {
if (nodeState.metaData() == null) {
continue;
}
found++;
if (electedGlobalState == null) {
electedGlobalState = nodeState.metaData();
//根据元数据的版本号,把节点最新的元数据版本号赋予全局状态
} else if (nodeState.metaData().version() > electedGlobalState.version()) {
electedGlobalState = nodeState.metaData();
}
//把集群的索引信息放在indices
for (ObjectCursor<IndexMetaData> cursor : nodeState.metaData().indices().values()) {
indices.addTo(cursor.value.getIndex(), 1);
}
}
if (found < requiredAllocation) {
listener.onFailure("found [" + found + "] metadata states, required [" + requiredAllocation + "]");
return;
}
// update the global state, and clean the indices, we elect them in the next phase
把上面最新的集群全局状态赋予集群状态,并清空索引状态
MetaData.Builder metaDataBuilder = MetaData.builder(electedGlobalState).removeAllIndices(); assert !indices.containsKey(null);
//
final Object[] keys = indices.keys;
for (int i = 0; i < keys.length; i++) {
int indexMetaDataCount = 0;
if (keys[i] != null) {
Index index = (Index) keys[i];
IndexMetaData electedIndexMetaData = null;
//循环每个节点状态,重新获取把每个节点的索引元数据
for (TransportNodesListGatewayMetaState.NodeGatewayMetaState nodeState : nodesState.getNodes()) {
if (nodeState.metaData() == null) {
continue;
}
//重新获取每个节点的索引元数据
IndexMetaData indexMetaData = nodeState.metaData().index(index);
if (indexMetaData == null) {
continue;
}
if (electedIndexMetaData == null) {
electedIndexMetaData = indexMetaData;
//根据索引的版本号,获取新版本好,当作索引状态
} else if (indexMetaData.getVersion() > electedIndexMetaData.getVersion()) {
electedIndexMetaData = indexMetaData;
}
indexMetaDataCount++;
}
if (electedIndexMetaData != null) {
if (indexMetaDataCount < requiredAllocation) {
logger.debug("[{}] found [{}], required [{}], not adding", index, indexMetaDataCount, requiredAllocation);
} // TODO if this logging statement is correct then we are missing an else here
try {
if (electedIndexMetaData.getState() == IndexMetaData.State.OPEN) {
// verify that we can actually create this index - if not we recover it as closed with lots of warn logs
indicesService.verifyIndexMetadata(electedIndexMetaData, electedIndexMetaData);
}
} catch (Exception e) {
final Index electedIndex = electedIndexMetaData.getIndex();
logger.warn(() -> new ParameterizedMessage("recovering index {} failed - recovering as closed", electedIndex), e);
electedIndexMetaData = IndexMetaData.builder(electedIndexMetaData).state(IndexMetaData.State.CLOSE).build();
}
//更新索引状态
metaDataBuilder.put(electedIndexMetaData, false);
}
}
}
//更新集群状态
final ClusterState.Builder builder = upgradeAndArchiveUnknownOrInvalidSettings(metaDataBuilder);
listener.onSuccess(builder.build());
}
当上面的两个层次的元数据选举完毕后,调用ClusterService.submitStateUpdateTasks方法提交集群状态更新任务,gateway的最后一步是执行reroute。等待集群完成重启后,gateway流程的结束只表示
集群和索引级别的元数据已经选举完毕,主分片的选举和路由还要由allocation模块实现。