Elasticsearch gateway模块源码分析
一:介绍
   gateway模块负责集群元数据的存储,以及集群重启时元数据的恢复。Elasticsearch的元数据分为集群层元数据、索引层元数据和分片层元数据;
   gateway模块主要负责集群层和索引层的元数据恢复,分片层的元数据恢复则由allocation模块负责。
二:gateway元数据的恢复过程
   当集群完全重启后,达到recovery条件时,进入元数据恢复流程,其主要实现在GatewayService类的clusterChanged方法中,源代码如下:

/**
 * Reacts to a cluster state change and decides whether state recovery should be started.
 *
 * <p>Nothing happens when this service is stopped or closed, when the local node is not the
 * elected master, or when the state no longer carries {@code STATE_NOT_RECOVERED_BLOCK}.
 * Otherwise the {@code recoverAfter*} thresholds are checked against the current node counts;
 * if any of them is not met, a debug message is logged and recovery is not started. Once they
 * are all met, the {@code expected*} values decide whether the recover-after-time delay has to
 * be enforced, and {@code performStateRecovery} is invoked.
 *
 * @param event the cluster-changed event carrying the new cluster state
 */
public void clusterChanged(final ClusterChangedEvent event) {
    if (lifecycle.stoppedOrClosed()) {
        return;
    }

    final ClusterState clusterState = event.state();
    final DiscoveryNodes nodes = clusterState.nodes();

    // Only the elected master performs state recovery.
    if (!nodes.isLocalNodeElectedMaster()) {
        return;
    }
    // The block is absent once the state has been recovered, so there is nothing left to do.
    if (!clusterState.blocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK)) {
        return;
    }

    if (nodes.getMasterNodeId() == null) {
        logger.debug("not recovering from gateway, no master elected yet");
        return;
    }

    final int masterAndDataCount = nodes.getMasterAndDataNodes().size();
    final int dataCount = nodes.getDataNodes().size();
    final int masterCount = nodes.getMasterNodes().size();

    // recover_after_* thresholds: a value of -1 means the threshold is not checked.
    if (recoverAfterNodes != -1 && masterAndDataCount < recoverAfterNodes) {
        logger.debug("not recovering from gateway, nodes_size (data+master) [{}] < recover_after_nodes [{}]",
            masterAndDataCount, recoverAfterNodes);
        return;
    }
    if (recoverAfterDataNodes != -1 && dataCount < recoverAfterDataNodes) {
        logger.debug("not recovering from gateway, nodes_size (data) [{}] < recover_after_data_nodes [{}]",
            dataCount, recoverAfterDataNodes);
        return;
    }
    if (recoverAfterMasterNodes != -1 && masterCount < recoverAfterMasterNodes) {
        logger.debug("not recovering from gateway, nodes_size (master) [{}] < recover_after_master_nodes [{}]",
            masterCount, recoverAfterMasterNodes);
        return;
    }

    final boolean enforceRecoverAfterTime;
    final String reason;
    if (expectedNodes == -1 && expectedMasterNodes == -1 && expectedDataNodes == -1) {
        // No expected_* value is configured: always honor the recover-after-time delay.
        enforceRecoverAfterTime = true;
        reason = "recover_after_time was set to [" + recoverAfterTime + "]";
    } else if (expectedNodes != -1 && masterAndDataCount < expectedNodes) {
        // Expected total (master + data) node count is not reached yet.
        enforceRecoverAfterTime = true;
        reason = "expecting [" + expectedNodes + "] nodes, but only have [" + masterAndDataCount + "]";
    } else if (expectedDataNodes != -1 && dataCount < expectedDataNodes) {
        // Expected data node count is not reached yet.
        enforceRecoverAfterTime = true;
        reason = "expecting [" + expectedDataNodes + "] data nodes, but only have [" + dataCount + "]";
    } else if (expectedMasterNodes != -1 && masterCount < expectedMasterNodes) {
        // Expected master node count is not reached yet.
        enforceRecoverAfterTime = true;
        reason = "expecting [" + expectedMasterNodes + "] master nodes, but only have [" + masterCount + "]";
    } else {
        // Every configured expected_* value is satisfied: the delay is ignored.
        enforceRecoverAfterTime = false;
        reason = "";
    }

    // Start the recovery.
    performStateRecovery(enforceRecoverAfterTime, reason);
}

最终执行状态恢复的逻辑位于Gateway类的performStateRecovery方法中,其代码实现流程如下:

public void performStateRecovery(final GatewayStateRecoveredListener listener) throws GatewayException {
         //获取集群的master的节点
         String[] nodesIds = clusterService.state().nodes().getMasterNodes().keys().toArray(String.class);
         logger.trace("performing state recovery from {}", Arrays.toString(nodesIds));
         //获取各个master节点的状态元数据
         TransportNodesListGatewayMetaState.NodesGatewayMetaState nodesState = listGatewayMetaState.list(nodesIds, null).actionGet();        int requiredAllocation = Math.max(1, minimumMasterNodes);
         if (nodesState.hasFailures()) {
             for (FailedNodeException failedNodeException : nodesState.failures()) {
                 logger.warn("failed to fetch state from node", failedNodeException);
             }
         }        ObjectFloatHashMap<Index> indices = new ObjectFloatHashMap<>();
         //被选举的全局状态变量
         MetaData electedGlobalState = null;
         int found = 0;
         for (TransportNodesListGatewayMetaState.NodeGatewayMetaState nodeState : nodesState.getNodes()) {
             if (nodeState.metaData() == null) {
                 continue;
             }
             found++;
             if (electedGlobalState == null) {
                 electedGlobalState = nodeState.metaData();
             //根据元数据的版本号,把节点最新的元数据版本号赋予全局状态
             } else if (nodeState.metaData().version() > electedGlobalState.version()) {
                 electedGlobalState = nodeState.metaData();
             }
             //把集群的索引信息放在indices
             for (ObjectCursor<IndexMetaData> cursor : nodeState.metaData().indices().values()) {
                 indices.addTo(cursor.value.getIndex(), 1);
             }
         }
         if (found < requiredAllocation) {
             listener.onFailure("found [" + found + "] metadata states, required [" + requiredAllocation + "]");
             return;
         }
         // update the global state, and clean the indices, we elect them in the next phase
         把上面最新的集群全局状态赋予集群状态,并清空索引状态
         MetaData.Builder metaDataBuilder = MetaData.builder(electedGlobalState).removeAllIndices();        assert !indices.containsKey(null);
         //
         final Object[] keys = indices.keys;
         for (int i = 0; i < keys.length; i++) {
             int indexMetaDataCount = 0;
             if (keys[i] != null) {
                 Index index = (Index) keys[i];
                 IndexMetaData electedIndexMetaData = null;
                 //循环每个节点状态,重新获取把每个节点的索引元数据
                 for (TransportNodesListGatewayMetaState.NodeGatewayMetaState nodeState : nodesState.getNodes()) {
                     if (nodeState.metaData() == null) {
                         continue;
                     }
                     //重新获取每个节点的索引元数据
                     IndexMetaData indexMetaData = nodeState.metaData().index(index);
                     if (indexMetaData == null) {
                         continue;
                     }
                     if (electedIndexMetaData == null) {
                         electedIndexMetaData = indexMetaData;
                     //根据索引的版本号,获取新版本好,当作索引状态
                     } else if (indexMetaData.getVersion() > electedIndexMetaData.getVersion()) {
                         electedIndexMetaData = indexMetaData;
                     }
                     indexMetaDataCount++;
                 }
                 if (electedIndexMetaData != null) {
                     if (indexMetaDataCount < requiredAllocation) {
                         logger.debug("[{}] found [{}], required [{}], not adding", index, indexMetaDataCount, requiredAllocation);
                     } // TODO if this logging statement is correct then we are missing an else here
                     try {
                         if (electedIndexMetaData.getState() == IndexMetaData.State.OPEN) {
                             // verify that we can actually create this index - if not we recover it as closed with lots of warn logs
                             indicesService.verifyIndexMetadata(electedIndexMetaData, electedIndexMetaData);
                         }
                     } catch (Exception e) {
                         final Index electedIndex = electedIndexMetaData.getIndex();
                         logger.warn(() -> new ParameterizedMessage("recovering index {} failed - recovering as closed", electedIndex), e);
                         electedIndexMetaData = IndexMetaData.builder(electedIndexMetaData).state(IndexMetaData.State.CLOSE).build();
                     }
                     //更新索引状态
                     metaDataBuilder.put(electedIndexMetaData, false);
                 }
             }
         }
         //更新集群状态
         final ClusterState.Builder builder = upgradeAndArchiveUnknownOrInvalidSettings(metaDataBuilder);
         listener.onSuccess(builder.build());
     }


    
    当上面两个层次的元数据选举完毕后,调用ClusterService.submitStateUpdateTasks方法提交集群状态更新,gateway的最后一步是执行reroute,等待集群完成重启。需要注意的是,gateway流程的结束只代表
    集群级别和索引级别的元数据已经选举完毕,主分片的选举和路由还要由allocation模块实现。