nacos源码分析注册流程
上回我们讲解了客户端配置好nacos后,是如何进行注册到服务器的,那我们今天来讲解一下服务器端接收到注册实例请求后会做怎么样的处理。
首先还是把博主画的源码分析图例发一下,让大家对整个流程有一个大概的理解:图示流程地址:https://www.processon.com/view/link/5f7e895be0b34d0711f65178
大家先把nacos服务器端的源码下载下来,在自己本地运行一下,nacos的git地址:https://github.com/alibaba/nacos
下载好后,我们还是看一下nacos的官方文档:nacos的各种请求地址url:https://nacos.io/zh-cn/docs/open-api.html,
看源码之前大家应该先了解一下nacos主要具有哪些功能,这样我们看源码的时候才可以顺藤摸瓜,不会被源码绕晕,我们今天主要看nacos最主要的核心功能:
分析-服务发现和服务运行状况检查:Nacos使服务易于注册自己并通过DNS或HTTP接口发现其他服务。Nacos还提供服务的实时运行状况检查,以防止向不正常的主机或服务实例发送请求。
进入正题,我们上节看到nacos客户端(也就是我们的微服务)启动时会调用服务器的注册url地址,服务器端对应的入口如下:
1 com.alibaba.nacos.naming.controllers.InstanceController
2 /**
3 * Register new instance: reads the optional namespace id (falling back to the default) and the required
4 * service name from the request, checks the service name format, then registers the parsed instance.
5 * @param request http request
6 * @return 'ok' if success
7 * @throws Exception any error during register
8 */
9 @CanDistro
10 @PostMapping
11 @Secured(parser = NamingResourceParser.class, action = ActionTypes.WRITE)
12 public String register(HttpServletRequest request) throws Exception {
13
14 final String namespaceId = WebUtils
15 .optional(request, CommonParams.NAMESPACE_ID, Constants.DEFAULT_NAMESPACE_ID);
16 final String serviceName = WebUtils.required(request, CommonParams.SERVICE_NAME);
17 NamingUtils.checkServiceNameFormat(serviceName);
18 // Build the Instance object from the request (a single instance only)
19 final Instance instance = parseInstance(request);
20 // Hand the instance to serviceManager to start the registration
21 serviceManager.registerInstance(namespaceId, serviceName, instance);
22 return "ok";
23 }
下面的parseInstance方法只是单纯地把客户端发送过来的请求信息包装成实例对象,具体是哪些参数,可以看看上一节。
// Builds an Instance from the HTTP request: the object comes from getIpAddress(request), then gets the
// app name (default "DEFAULT"), the required service name, a generated instance id, the current time
// as last beat and, when present, the parsed metadata. The instance is validated before it is returned.
1 private Instance parseInstance(HttpServletRequest request) throws Exception {
2
3 String serviceName = WebUtils.required(request, CommonParams.SERVICE_NAME);
4 String app = WebUtils.optional(request, "app", "DEFAULT");
5 Instance instance = getIpAddress(request);
6 instance.setApp(app);
7 instance.setServiceName(serviceName);
8 // Generate simple instance id first. This value would be updated according to
9 // INSTANCE_ID_GENERATOR.
10 instance.setInstanceId(instance.generateInstanceId());
11 instance.setLastBeat(System.currentTimeMillis());
12 String metadata = WebUtils.optional(request, "metadata", StringUtils.EMPTY);
13 if (StringUtils.isNotEmpty(metadata)) {
14 instance.setMetadata(UtilsAndCommons.parseMetadata(metadata));
15 }
16
17 instance.validate();
18
19 return instance;
20 }
我们主要来看看下面开始注册服务实例的方法registerInstance:
// Registers an instance: makes sure the service exists, throws NacosException (INVALID_PARAM) if it
// still cannot be found, then adds the instance to it.
1 public void registerInstance(String namespaceId, String serviceName, Instance instance) throws NacosException {
2 // Create the service (as an empty service) first
3 createEmptyService(namespaceId, serviceName, instance.isEphemeral());
4 // Look the service up again from the service map
5 Service service = getService(namespaceId, serviceName);
6
7 if (service == null) {
8 throw new NacosException(NacosException.INVALID_PARAM,
9 "service not found, namespace: " + namespaceId + ", service: " + serviceName);
10 }
11 // The service is still an empty shell with no instances at this point; the instance is only added in this step
12 addInstance(namespaceId, serviceName, instance.isEphemeral(), instance);
13 }
1
// createEmptyService eventually calls this method.
// Creates and initialises a new Service when none exists yet for the namespace/service name; when
// `local` is false the new service is additionally passed to addOrReplaceService.
public void createServiceIfAbsent(String namespaceId, String serviceName, boolean local, Cluster cluster)
2 throws NacosException {
3 // Look up the service; nothing to do if it already exists, otherwise create it
4 Service service = getService(namespaceId, serviceName);
5 if (service == null) {
6
7 Loggers.SRV_LOG.info("creating empty service {}:{}", namespaceId, serviceName);
8 service = new Service();
9 service.setName(serviceName);
10 service.setNamespaceId(namespaceId);
11 service.setGroupName(NamingUtils.getGroupName(serviceName));
12 // now validate the service. if failed, exception will be thrown
13 service.setLastModifiedMillis(System.currentTimeMillis());
14 service.recalculateChecksum();
15 if (cluster != null) {
16 cluster.setService(service);
17 service.getClusterMap().put(cluster.getName(), cluster);
18 }
19 service.validate();
20 // The key call to follow:
21 putServiceAndInit(service);
22 if (!local) {
23 addOrReplaceService(service);
24 }
25 }
26 }
1 // Worth noting: this method shows in what form nacos stores services. Returns null when the namespace has no entry in serviceMap.
2 public Service getService(String namespaceId, String serviceName) {
3 // serviceMap is a map of maps; per the article author this is what the data model in the official nacos introduction refers to
4 /**
5 * Map(namespace, Map(group::serviceName, Service)).
6 */
7 // Declared at class level as: private final Map<String, Map<String, Service>> serviceMap = new ConcurrentHashMap<>();
8 if (serviceMap.get(namespaceId) == null) {
9 return null;
10 }
11 return chooseServiceMap(namespaceId).get(serviceName);
12 }
// Stores the service, initialises it, and registers it as a listener on both of its instance-list keys.
1 private void putServiceAndInit(Service service) throws NacosException {
2 // Put the newly created service into the map
3 putService(service);
4 service.init();
5 // Register the service as a listener for the keys built with the last argument true and false (the ephemeral flag); remember this, it matters later
6 consistencyService
7 .listen(KeyBuilder.buildInstanceListKey(service.getNamespaceId(), service.getName(), true), service);
8 consistencyService
9 .listen(KeyBuilder.buildInstanceListKey(service.getNamespaceId(), service.getName(), false), service);
10 Loggers.SRV_LOG.info("[NEW-SERVICE] {}", service.toJson());
11 }
// Service initialisation: schedules clientBeatCheckTask, then points every cluster in clusterMap back
// at this service and initialises it.
1 public void init() {
2 // The heartbeat-check task is scheduled here. Per the article (the task itself is not shown): it marks timed-out instances unhealthy and calls an API to remove them
3 HealthCheckReactor.scheduleCheck(clientBeatCheckTask);
4 for (Map.Entry<String, Cluster> entry : clusterMap.entrySet()) {
5 entry.getValue().setService(this);
6 entry.getValue().init();
7 }
8 }
// Adds the given instances to a service: builds the instance-list key, computes the resulting instance
// list while holding the service's monitor, and writes it through consistencyService.put.
// NOTE(review): getService can return null, and synchronized (service) would then throw a
// NullPointerException; registerInstance checks for null before calling this — confirm for other callers.
1 public void addInstance(String namespaceId, String serviceName, boolean ephemeral, Instance... ips)
2 throws NacosException {
3
4 String key = KeyBuilder.buildInstanceListKey(namespaceId, serviceName, ephemeral);
5
6 Service service = getService(namespaceId, serviceName);
7
8 synchronized (service) {
9 // Get the full instance list (existing instances plus the ones being added)
10 List<Instance> instanceList = addIpAddresses(service, ephemeral, ips);
11
12 Instances instances = new Instances();
13 instances.setInstanceList(instanceList);
14 // Per the article: instances are ephemeral by default and kept in memory, so the implementation to follow from here is DistroConsistencyServiceImpl
15 consistencyService.put(key, instances);
16 }
17 }
1 // addIpAddresses ends up calling this method. It applies `action` (add or remove) for `ips` on top of the stored instance list and returns the resulting list; an add that leaves the list empty throws IllegalArgumentException.
2 public List<Instance> updateIpAddresses(Service service, String action, boolean ephemeral, Instance... ips)
3 throws NacosException {
4
5 Datum datum = consistencyService
6 .get(KeyBuilder.buildInstanceListKey(service.getNamespaceId(), service.getName(), ephemeral));
7
8 List<Instance> currentIPs = service.allIPs(ephemeral);
9 Map<String, Instance> currentInstances = new HashMap<>(currentIPs.size());
10 Set<String> currentInstanceIds = Sets.newHashSet();
11
12 for (Instance instance : currentIPs) {
13 currentInstances.put(instance.toIpAddr(), instance);
14 currentInstanceIds.add(instance.getInstanceId());
15 }
16
17 Map<String, Instance> instanceMap;
18 if (datum != null && null != datum.value) {
19 instanceMap = setValid(((Instances) datum.value).getInstanceList(), currentInstances);
20 } else {
21 instanceMap = new HashMap<>(ips.length);
22 }
23
24 for (Instance instance : ips) {
25 if (!service.getClusterMap().containsKey(instance.getClusterName())) {
26 Cluster cluster = new Cluster(instance.getClusterName(), service);
27 // Same idea as for the service: per the article this sets up the cluster's health-check task (Cluster.init() is not shown here)
28 cluster.init();
29 service.getClusterMap().put(instance.getClusterName(), cluster);
30 Loggers.SRV_LOG
31 .warn("cluster: {} not found, ip: {}, will create new cluster with default configuration.",
32 instance.getClusterName(), instance.toJson());
33 }
34 // In the registration flow the action is add, not remove, so this branch is not taken
35 if (UtilsAndCommons.UPDATE_INSTANCE_ACTION_REMOVE.equals(action)) {
36 instanceMap.remove(instance.getDatumKey());
37 } else {
38 // Finally the instance goes into the map: key is getDatumKey() (per the article: ip, port and similar fields), value is the instance itself
39 instance.setInstanceId(instance.generateInstanceId(currentInstanceIds));
40 instanceMap.put(instance.getDatumKey(), instance);
41 }
42
43 }
44
45 if (instanceMap.size() <= 0 && UtilsAndCommons.UPDATE_INSTANCE_ACTION_ADD.equals(action)) {
46 throw new IllegalArgumentException(
47 "ip list can not be empty, service: " + service.getName() + ", ip list: " + JacksonUtils
48 .toJson(instanceMap.values()));
49 }
50
51 return new ArrayList<>(instanceMap.values());
52 }
53
服务也创建完了,健康定时任务也创建完成了,那最后的一步put操作consistencyService.put(key, instances);这个方法具体做了哪些工作呢?我们来看一看。
// Delegating put: forwards to whichever ConsistencyService mapConsistencyService selects for the key.
1 @Override
2 public void put(String key, Record value) throws NacosException {
3 // Per the article, DistroConsistencyServiceImpl is the only implementation class for ephemeral instances
4 mapConsistencyService(key).put(key, value);
5 }
6
7 private ConsistencyService mapConsistencyService(String key) {
8 // Selects the implementation whose put method gets called: the ephemeral one when the key matches an ephemeral key, otherwise the persistent one
9 return KeyBuilder.matchEphemeralKey(key) ? ephemeralConsistencyService : persistentConsistencyService;
10 }
这里根据key(由前面buildInstanceListKey的ephemeral参数生成)是否匹配临时实例来选择实现类。所以我们主要看的就是临时实例是如何实现put方法的
该方法具体只做了两件事情:1、添加任务以及添加到内存中;2、同步数据到所有其他节点
// Ephemeral put: first applies the value locally via onPut, then asks distroProtocol to sync the change,
// passing half of globalConfig.getTaskDispatchPeriod() as the last argument (the `delay` parameter of sync).
1 public void put(String key, Record value) throws NacosException {
2 onPut(key, value);
3 distroProtocol.sync(new DistroKey(key, KeyBuilder.INSTANCE_LIST_KEY_PREFIX), DataOperation.CHANGE,
4 globalConfig.getTaskDispatchPeriod() / 2);
5 }
我们先看一下onPut操作:
// Stores the value in dataStore (only for ephemeral instance-list keys) and, if a listener exists for
// the key, queues a CHANGE notification on the notifier.
1 public void onPut(String key, Record value) {
2
3 if (KeyBuilder.matchEphemeralInstanceListKey(key)) {
4 Datum<Instances> datum = new Datum<>();
5 // All instances of the current service
6 datum.value = (Instances) value;
7 datum.key = key;
8 datum.timestamp.incrementAndGet();
9 dataStore.put(key, datum);
10 }
11 // Remember the listen calls mentioned earlier? This is where they matter: with no listener for the key there is nothing to notify
12 if (!listeners.containsKey(key)) {
13 return;
14 }
15 // Queue an asynchronous task
16 notifier.addTask(key, DataOperation.CHANGE);
17 }
这里我们来说一下notifier这个内部类,它是一个实现了Runnable的任务类,内部定义了一个名为tasks的数组阻塞队列(ArrayBlockingQueue),容量固定为1024*1024,比较大:
// Notifier: a Runnable that takes (key, operation) pairs from a blocking queue and notifies the
// listeners registered for each key. The listing is abridged (see the elided part in the middle).
1 public class Notifier implements Runnable {
2
3 private ConcurrentHashMap<String, String> services = new ConcurrentHashMap<>(10 * 1024);
4 // The task queue lives here
5 private BlockingQueue<Pair<String, DataOperation>> tasks = new ArrayBlockingQueue<>(1024 * 1024);
6
7 /**
8 * Add new notify task to queue. A CHANGE for a key already recorded in services is skipped.
9 *
10 * @param datumKey data key
11 * @param action action for data
12 */
13 public void addTask(String datumKey, DataOperation action) {
14
15 if (services.containsKey(datumKey) && action == DataOperation.CHANGE) {
16 return;
17 }
18 if (action == DataOperation.CHANGE) {
19 services.put(datumKey, StringUtils.EMPTY);
20 }
21 // Append to the tail of the queue; Pair is a fixed two-element tuple. offer does not block and its boolean result is ignored here
22 tasks.offer(Pair.with(datumKey, action));
23 }
24
25 .......
26
27 @Override
28 public void run() {
29 Loggers.DISTRO.info("distro notifier started");
30
31 for (; ; ) {
32 try {
33 Pair<String, DataOperation> pair = tasks.take();
34 handle(pair);
35 } catch (Throwable e) {
36 Loggers.DISTRO.error("[NACOS-DISTRO] Error while handling notifying task", e);
37 }
38 }
39 }
40
41 private void handle(Pair<String, DataOperation> pair) {
42 try {
43 String datumKey = pair.getValue0();
44 DataOperation action = pair.getValue1();
45
46 services.remove(datumKey);
47
48 int count = 0;
49
50 if (!listeners.containsKey(datumKey)) {
51 return;
52 }
53
54 for (RecordListener listener : listeners.get(datumKey)) {
55
56 count++;
57
58 try {
59 // The CHANGE operation from registration is handled here; onChange is the method to look at next
60 if (action == DataOperation.CHANGE) {
61 listener.onChange(datumKey, dataStore.get(datumKey).value);
62 continue;
63 }
64
65 if (action == DataOperation.DELETE) {
66 listener.onDelete(datumKey);
67 continue;
68 }
69 } catch (Throwable e) {
70 Loggers.DISTRO.error("[NACOS-DISTRO] error while notifying listener of key: {}", datumKey, e);
71 }
72 }
73
74 if (Loggers.DISTRO.isDebugEnabled()) {
75 Loggers.DISTRO
76 .debug("[NACOS-DISTRO] datum change notified, key: {}, listener count: {}, action: {}",
77 datumKey, count, action.name());
78 }
79 } catch (Throwable e) {
80 Loggers.DISTRO.error("[NACOS-DISTRO] Error while handling notifying task", e);
81 }
82 }
// Handles a changed instance list: throws on a null entry, caps weights above 10000 at 10000, raises
// weights strictly between 0 and 0.01 to 0.01, then updates the IPs and recalculates the checksum.
1 public void onChange(String key, Instances value) throws Exception {
2
3 Loggers.SRV_LOG.info("[NACOS-RAFT] datum is changed, key: {}, value: {}", key, value);
4
5 for (Instance instance : value.getInstanceList()) {
6
7 if (instance == null) {
8 // Reject this abnormal instance list:
9 throw new RuntimeException("got null instance " + key);
10 }
11
12 if (instance.getWeight() > 10000.0D) {
13 instance.setWeight(10000.0D);
14 }
15
16 if (instance.getWeight() < 0.01D && instance.getWeight() > 0.0D) {
17 instance.setWeight(0.01D);
18 }
19 }
20 // Step into this method next
21 updateIPs(value.getInstanceList(), KeyBuilder.matchEphemeralInstanceListKey(key));
22
23 recalculateChecksum();
24 }
上面onChange调用的updateIPs方法如下:
// Groups the given instances by cluster name (creating clusters that do not exist yet), hands each
// group to its cluster's updateIps, then stamps the modification time, notifies the push service and
// logs the resulting ip list.
1 public void updateIPs(Collection<Instance> instances, boolean ephemeral) {
2 Map<String, List<Instance>> ipMap = new HashMap<>(clusterMap.size());
3 for (String clusterName : clusterMap.keySet()) {
4 ipMap.put(clusterName, new ArrayList<>());
5 }
6
7 for (Instance instance : instances) {
8 try {
9 if (instance == null) {
10 Loggers.SRV_LOG.error("[NACOS-DOM] received malformed ip: null");
11 continue;
12 }
13
14 if (StringUtils.isEmpty(instance.getClusterName())) {
15 // When no cluster name was set, fall back to the default name
16 instance.setClusterName(UtilsAndCommons.DEFAULT_CLUSTER_NAME);
17 }
18
19 if (!clusterMap.containsKey(instance.getClusterName())) {
20 Loggers.SRV_LOG
21 .warn("cluster: {} not found, ip: {}, will create new cluster with default configuration.",
22 instance.getClusterName(), instance.toJson());
23 Cluster cluster = new Cluster(instance.getClusterName(), this);
24 cluster.init();
25 getClusterMap().put(instance.getClusterName(), cluster);
26 }
27 // Collect the instance into its cluster's list in ipMap
28 List<Instance> clusterIPs = ipMap.get(instance.getClusterName());
29 if (clusterIPs == null) {
30 clusterIPs = new LinkedList<>();
31 ipMap.put(instance.getClusterName(), clusterIPs);
32 }
33
34 clusterIPs.add(instance);
35 } catch (Exception e) {
36 Loggers.SRV_LOG.error("[NACOS-DOM] failed to process ip: " + instance, e);
37 }
38 }
39
40 for (Map.Entry<String, List<Instance>> entry : ipMap.entrySet()) {
41 //make every ip mine
42 List<Instance> entryIPs = entry.getValue();
43 // The key step: with the instances grouped above, update the IPs of each cluster
44 clusterMap.get(entry.getKey()).updateIps(entryIPs, ephemeral);
45 }
46
47 setLastModifiedMillis(System.currentTimeMillis());
48 getPushService().serviceChanged(this);
49 StringBuilder stringBuilder = new StringBuilder();
50
51 for (Instance instance : allIPs()) {
52 stringBuilder.append(instance.toIpAddr()).append("_").append(instance.isHealthy()).append(",");
53 }
54
55 Loggers.EVT_LOG.info("[IP-UPDATED] namespace: {}, service: {}, ips: {}", getNamespaceId(), getName(),
56 stringBuilder.toString());
57
58 }
那么updateIps到底做了哪些事情呢?我们追踪一下:
// Replaces this cluster's ephemeral or persistent instance set with `ips`. Along the way it logs the
// updated, new and dead instances, resets the health-check status of new ones and removes it for dead ones.
1 public void updateIps(List<Instance> ips, boolean ephemeral) {
2
3 Set<Instance> toUpdateInstances = ephemeral ? ephemeralInstances : persistentInstances;
4
5 HashMap<String, Instance> oldIpMap = new HashMap<>(toUpdateInstances.size());
6
7 for (Instance ip : toUpdateInstances) {
8 oldIpMap.put(ip.getDatumKey(), ip);
9 }
10 // Per the article, updatedIps returns the instances that are also present in oldIpMap (helper not shown here)
11 List<Instance> updatedIPs = updatedIps(ips, oldIpMap.values());
12 if (updatedIPs.size() > 0) {
13 for (Instance ip : updatedIPs) {
14 Instance oldIP = oldIpMap.get(ip.getDatumKey());
15
16 // do not update the ip validation status of updated ips
17 // because the checker has the most precise result
18 // Only when ip is not marked, don't we update the health status of IP:
19 if (!ip.isMarked()) {
20 ip.setHealthy(oldIP.isHealthy());
21 }
22
23 if (ip.isHealthy() != oldIP.isHealthy()) {
24 // ip validation status updated
25 Loggers.EVT_LOG.info("{} {SYNC} IP-{} {}:{}@{}", getService().getName(),
26 (ip.isHealthy() ? "ENABLED" : "DISABLED"), ip.getIp(), ip.getPort(), getName());
27 }
28
29 if (ip.getWeight() != oldIP.getWeight()) {
30 // ip validation status updated
31 Loggers.EVT_LOG.info("{} {SYNC} {IP-UPDATED} {}->{}", getService().getName(), oldIP.toString(),
32 ip.toString());
33 }
34 }
35 }
36 // Instances in ips that are not in oldIpMap, i.e. the new ones (logged as IP-NEW below)
37 List<Instance> newIPs = subtract(ips, oldIpMap.values());
38 if (newIPs.size() > 0) {
39 Loggers.EVT_LOG
40 .info("{} {SYNC} {IP-NEW} cluster: {}, new ips size: {}, content: {}", getService().getName(),
41 getName(), newIPs.size(), newIPs.toString());
42
43 for (Instance ip : newIPs) {
44 HealthCheckStatus.reset(ip);
45 }
46 }
47 // Instances in oldIpMap that are no longer in ips, i.e. the dead ones (logged as IP-DEAD below)
48 List<Instance> deadIPs = subtract(oldIpMap.values(), ips);
49
50 if (deadIPs.size() > 0) {
51 Loggers.EVT_LOG
52 .info("{} {SYNC} {IP-DEAD} cluster: {}, dead ips size: {}, content: {}", getService().getName(),
53 getName(), deadIPs.size(), deadIPs.toString());
54
55 for (Instance ip : deadIPs) {
56 HealthCheckStatus.remv(ip);
57 }
58 }
59
60 toUpdateInstances = new HashSet<>(ips);
61 // Replace the stored instance set
62 if (ephemeral) {
63 ephemeralInstances = toUpdateInstances;
64 } else {
65 persistentInstances = toUpdateInstances;
66 }
67 }
其实总的来说就是,task的目的就是删除没有用的ip,更新ip列表。
我们再来看一下第二步做的什么事情:也就是onPut之后的下一个方法sync,通过名字也能猜到应该是同步的工作
// Schedules one DistroDelayTask per other cluster member for the given key, action and delay.
1 public void sync(DistroKey distroKey, DataOperation action, long delay) {
2 // allMembersWithoutSelf() leaves out the local server
3 for (Member each : memberManager.allMembersWithoutSelf()) {
4 // Go through every other member and schedule the data sync, targeting that member's address
5 DistroKey distroKeyWithTarget = new DistroKey(distroKey.getResourceKey(), distroKey.getResourceType(),
6 each.getAddress());
7 DistroDelayTask distroDelayTask = new DistroDelayTask(distroKeyWithTarget, action, delay);
8 // Per the article, this is in fact also a scheduled task
9 distroTaskEngineHolder.getDelayTaskExecuteEngine().addTask(distroKeyWithTarget, distroDelayTask);
10 if (Loggers.DISTRO.isDebugEnabled()) {
11 Loggers.DISTRO.debug("[DISTRO-SCHEDULE] {} to {}", distroKey, each.getAddress());
12 }
13 }
14 }
其实getDelayTaskExecuteEngine()返回的类在创建的时候就已经初始化了一个定时任务类,如下:
// Runnable wrapper around processTasks(); any Throwable is logged to the engine log rather than propagated.
1 private class ProcessRunnable implements Runnable {
2
3 @Override
4 public void run() {
5 try {
6 processTasks();
7 } catch (Throwable e) {
8 getEngineLog().error(e.toString(), e);
9 }
10 }
11 }
12
13
// Walks all current task keys: removes each task, looks up its processor, runs it, and hands the task
// to retryFailedTask when processing returns false or throws. Tasks without a processor are dropped.
14 protected void processTasks() {
15 Collection<Object> keys = getAllTaskKeys();
16 for (Object taskKey : keys) {
17 AbstractDelayTask task = removeTask(taskKey);
18 if (null == task) {
19 continue;
20 }
21 NacosTaskProcessor processor = getProcessor(taskKey);
22 if (null == processor) {
23 getEngineLog().error("processor not found for task, so discarded. " + task);
24 continue;
25 }
26 try {
27 // ReAdd task if process failed
28 if (!processor.process(task)) {
29 retryFailedTask(taskKey, task);
30 }
31 } catch (Throwable e) {
32 getEngineLog().error("Nacos task execute error : " + e.toString(), e);
33 retryFailedTask(taskKey, task);
34 }
35 }
36 }
37
38
// Processes a delay task: anything that is not a DistroDelayTask is treated as done (true); a CHANGE
// action is turned into a DistroSyncChangeTask; every other action returns false.
// NOTE(review): if this is the processor invoked by processTasks, a false return sends the task to
// retryFailedTask — confirm that this is intended for non-CHANGE actions.
39 @Override
40 public boolean process(NacosTask task) {
41 if (!(task instanceof DistroDelayTask)) {
42 return true;
43 }
44 DistroDelayTask distroDelayTask = (DistroDelayTask) task;
45 DistroKey distroKey = distroDelayTask.getDistroKey();
46 if (DataOperation.CHANGE.equals(distroDelayTask.getAction())) {
47 // Create a DistroSyncChangeTask and hand it to the execute workers (the article describes this as yet another scheduled task)
48 DistroSyncChangeTask syncChangeTask = new DistroSyncChangeTask(distroKey, distroComponentHolder);
49 distroTaskEngineHolder.getExecuteWorkersManager().addTask(distroKey, syncChangeTask);
50 return true;
51 }
52 return false;
53 }
54
55 // This is the run method of DistroSyncChangeTask: it loads the data for the key, marks it as CHANGE, syncs it to the target server, and calls handleFailedTask when the sync reports failure or throws.
56 @Override
57 public void run() {
58 Loggers.DISTRO.info("[DISTRO-START] {}", toString());
59 try {
60 String type = getDistroKey().getResourceType();
61 DistroData distroData = distroComponentHolder.findDataStorage(type).getDistroData(getDistroKey());
62 distroData.setType(DataOperation.CHANGE);
63 // Not followed further here; per the article this calls the target server's API endpoint — readers can look at it themselves
64 boolean result = distroComponentHolder.findTransportAgent(type).syncData(distroData, getDistroKey().getTargetServer());
65 if (!result) {
66 handleFailedTask();
67 }
68 Loggers.DISTRO.info("[DISTRO-END] {} result: {}", toString(), result);
69 } catch (Exception e) {
70 Loggers.DISTRO.warn("[DISTRO] Sync data change failed.", e);
71 handleFailedTask();
72 }
73 }
好了,到这里已经讲解完nacos服务器端注册的流程了,大家可以自己去看看源码,另外,这是阿里开源的,但是有些地方也没有特别遵守阿里开发规范,比如全局定时任务类:
GlobalExecutor,具体哪里没有遵守,大家可以看看,反正也没几条关于线程池的规范。