文章目录
- AlarmModuleProvider负责初始化alarmRulesWatcher和notifyHandler
- alarmRulesWatcher包含告警规则配置和上报的数据[滑动窗口存储]
- notifyHandler在agent进行metrics上报时被调用,并将数据写入alarmRulesWatcher的滑动窗口
- notifyHandler内部的AlarmCore会开启异步线程每10秒执行一次告警任务
- prepare中读取alarm-settings.yml告警规则配置
- prepare中初始化alarmRulesWatcher[包含告警规则和数据]
- prepare中初始化(数据进入告警模块处理的入口)notifyHandler
- prepare中为notifyHandler添加告警消息的持久化回调以及启动异步线程开始定时执行告警检测
- start注册alarmRulesWatcher到配置中心,alarmRulesWatcher.notify可以看出其支持告警规则的动态修改
public class AlarmModuleProvider extends ModuleProvider {
private NotifyHandler notifyHandler;
private AlarmRulesWatcher alarmRulesWatcher;
@Override public void prepare() throws ServiceNotProvidedException, ModuleStartException {
Reader applicationReader;
try {
读取告警规则配置
applicationReader = ResourceUtils.read("alarm-settings.yml");
} catch (FileNotFoundException e) {
throw new ModuleStartException("can't load alarm-settings.yml", e);
}
RulesReader reader = new RulesReader(applicationReader);
包含告警规则AlarmRule以及 告警通知地址webhooks【这里通知比较简单,生产中可以支持短信,电话,工作群,邮件,第三方软件等】
Rules rules = reader.readRules();
alarmRulesWatcher[包含告警规则和数据]
alarmRulesWatcher = new AlarmRulesWatcher(rules, this);
数据进入告警模块处理的入口
notifyHandler = new NotifyHandler(alarmRulesWatcher);
添加告警消息的持久化回调 以及启动异步线程开始工作
notifyHandler.init(new AlarmStandardPersistence());
this.registerServiceImplementation(MetricsNotify.class, notifyHandler);
}
@Override public void start() throws ServiceNotProvidedException, ModuleStartException {
DynamicConfigurationService dynamicConfigurationService = getManager().find(ConfigurationModule.NAME).provider().getService(DynamicConfigurationService.class);
/注册alarmRulesWatcher到配置中心,alarmRulesWatcher.notify可以看出其支持告警规则的动态修改
dynamicConfigurationService.registerConfigChangeWatcher(alarmRulesWatcher);
}
}
源码分析—NotifyHandler
- 由MetricsStreamProcessor触发NotifyHandler调用完成指标数据写入
- AlarmCore负责定时调度数据
public class NotifyHandler implements MetricsNotify {
private ServiceInventoryCache serviceInventoryCache;
private ServiceInstanceInventoryCache serviceInstanceInventoryCache;
private EndpointInventoryCache endpointInventoryCache;
private final AlarmCore core;
private final AlarmRulesWatcher alarmRulesWatcher;
public NotifyHandler(AlarmRulesWatcher alarmRulesWatcher) {
this.alarmRulesWatcher = alarmRulesWatcher;
core = new AlarmCore(alarmRulesWatcher);
}
检测指标是否告警的入口minutPersistentWorker 执行day month hour三个 PersistentWorker不处理告警
由MetricsStreamProcessor.AlarmNotifyWorker.AlarmEntrance=>NotifyHandler调用
@Override public void notify(Metrics metrics) {
WithMetadata withMetadata = (WithMetadata)metrics;
MetricsMetaInfo meta = withMetadata.getMeta();
int scope = meta.getScope();
if (!DefaultScopeDefine.inServiceCatalog(scope)
&& !DefaultScopeDefine.inServiceInstanceCatalog(scope)
&& !DefaultScopeDefine.inEndpointCatalog(scope)) {
return;
}
MetaInAlarm metaInAlarm;
if (DefaultScopeDefine.inServiceCatalog(scope)) {
int serviceId = Integer.parseInt(meta.getId());
ServiceInventory serviceInventory = serviceInventoryCache.get(serviceId);
ServiceMetaInAlarm serviceMetaInAlarm = new ServiceMetaInAlarm();
serviceMetaInAlarm.setMetricsName(meta.getMetricsName());
serviceMetaInAlarm.setId(serviceId);
serviceMetaInAlarm.setName(serviceInventory.getName());
metaInAlarm = serviceMetaInAlarm;
} else if (DefaultScopeDefine.inServiceInstanceCatalog(scope)) {
int serviceInstanceId = Integer.parseInt(meta.getId());
ServiceInstanceInventory serviceInstanceInventory = serviceInstanceInventoryCache.get(serviceInstanceId);
ServiceInstanceMetaInAlarm instanceMetaInAlarm = new ServiceInstanceMetaInAlarm();
instanceMetaInAlarm.setMetricsName(meta.getMetricsName());
instanceMetaInAlarm.setId(serviceInstanceId);
instanceMetaInAlarm.setName(serviceInstanceInventory.getName());
metaInAlarm = instanceMetaInAlarm;
} else if (DefaultScopeDefine.inEndpointCatalog(scope)) {
int endpointId = Integer.parseInt(meta.getId());
EndpointInventory endpointInventory = endpointInventoryCache.get(endpointId);
EndpointMetaInAlarm endpointMetaInAlarm = new EndpointMetaInAlarm();
endpointMetaInAlarm.setMetricsName(meta.getMetricsName());
endpointMetaInAlarm.setId(endpointId);
int serviceId = endpointInventory.getServiceId();
ServiceInventory serviceInventory = serviceInventoryCache.get(serviceId);
String textName = endpointInventory.getName() + " in " + serviceInventory.getName();
endpointMetaInAlarm.setName(textName);
metaInAlarm = endpointMetaInAlarm;
} else {
return;
}
查询指标对应的规则
List<RunningRule> runningRules = core.findRunningRule(meta.getMetricsName());
if (runningRules == null) {
return;
}
运行时规则添加metrics数据
runningRules.forEach(rule -> rule.in(metaInAlarm, metrics));
}
public void init(AlarmCallback... callbacks) {
告警回调是一个链表模式 先持久化 在WebhookCallback进行通知
List<AlarmCallback> allCallbacks = new ArrayList<>(Arrays.asList(callbacks));
allCallbacks.add(new WebhookCallback(alarmRulesWatcher/* 提供web_hook 地址*/));
启动异步告警线程每10秒进行一次告警检测
core.start(allCallbacks);
}
public void initCache(ModuleManager moduleManager) {
serviceInventoryCache = moduleManager.find(CoreModule.NAME).provider().getService(ServiceInventoryCache.class);
serviceInstanceInventoryCache = moduleManager.find(CoreModule.NAME).provider().getService(ServiceInstanceInventoryCache.class);
endpointInventoryCache = moduleManager.find(CoreModule.NAME).provider().getService(EndpointInventoryCache.class);
}
}
源码分析—AlarmCore.start
静态规则包含规则配置,运行时规则包含规则和数据
- 通过alarmRulesWatcher获取所有静态规则[AlarmRule]的运行时规则[RunningRule]
- 通过runningRule.moveTo更新时间窗口
- 通过runningRule.check进行告警检测
- 存在告警则调用callback发送webhook通知
public void start(List<AlarmCallback> allCallbacks) {
LocalDateTime now = LocalDateTime.now();
lastExecuteTime = now;
Executors.newSingleThreadScheduledExecutor().scheduleAtFixedRate(() -> {
try {
List<AlarmMessage> alarmMessageList = new ArrayList<>(30);
LocalDateTime checkTime = LocalDateTime.now();
计算当前时间与上次检测时间的窗口大小
int minutes = Minutes.minutesBetween(lastExecuteTime, checkTime).getMinutes();
boolean[] hasExecute = new boolean[] {false};
alarmRulesWatcher.getRunningContext().values().forEach(ruleList -> ruleList.forEach(runningRule -> {
if (minutes > 0) { 窗口在同一分钟,不在进行告警
runningRule.moveTo(checkTime);
当前分钟的前15秒避免检测,防止上一分钟数据比例过高误报
if (checkTime.getSecondOfMinute() > 15) {
hasExecute[0] = true;
检测所有的应该报警的信息
alarmMessageList.addAll(runningRule.check());
}
}
}));
if (hasExecute[0]) {
lastExecuteTime = checkTime.minusSeconds(checkTime.getSecondOfMinute());
}
存在报警信息则执行回调
if (alarmMessageList.size() > 0) {
allCallbacks.forEach(callback -> callback.doAlarm(alarmMessageList));
}
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
}, 10, 10, TimeUnit.SECONDS);
}
RunningRule.in数据写入
- agent上报的metrics数据上报至RunningRule.windows的Window中
- RunningRule不仅仅有配置规则信息,最重要的是有当前的窗口数据
- 告警依赖窗口数据与规则进行比较
一个指标在period时间内,满足超过threshold的数量大于countThreshold次数,并且在silencePeriod周期内没有发生过则进行报警
/**
 * A rule at runtime: the rule's configuration plus the per-entity windows that hold the reported
 * metrics the rule is evaluated against. (Constructor and methods are omitted in this excerpt.)
 */
public class RunningRule {
    private static final Logger logger = LoggerFactory.getLogger(RunningRule.class);
    /** Parses a time bucket written as {@code yyyyMMddHHmm}; a shared constant, hence final. */
    private static final DateTimeFormatter TIME_BUCKET_FORMATTER = DateTimeFormat.forPattern("yyyyMMddHHmm");
    private String ruleName;
    private int period; // window size, in minutes
    private String metricsName;
    private final Threshold threshold; // alarm threshold
    private final OP op;
    private final int countThreshold; // how many threshold hits within one time window trigger an alarm
    private final int silencePeriod; // silence (suppression) period
    // One window per alarmed entity; reported metrics are added to the window of their entity.
    private Map<MetaInAlarm /* metadata of the metric's entity */, Window /* the rule's time window, holding multiple sample slots */> windows;
    private volatile MetricsValueType valueType;
    private int targetScopeId;
    private List<String> includeNames; // typically application names: which names this rule handles
    private List<String> excludeNames;
    private AlarmMessageFormatter formatter;
}
- 获取指标元信息(MetaInAlarm)对应的窗口,写入metrics
public void in(MetaInAlarm meta, Metrics metrics) {
...... 删除大量代码
Window window = windows.get(meta);
if (window == null) {
window = new Window(period);
LocalDateTime timebucket = TIME_BUCKET_FORMATTER.parseLocalDateTime(metrics.getTimeBucket() + "");
window.moveTo(timebucket);
windows.put(meta, window);
}
写入窗口
window.add(metrics);
}
RunningRule.check告警检查
- 遍历所有的告警规则元信息对应的窗口
- 调用窗口window.checkAlarm进行检查
public List<AlarmMessage> check() {
List<AlarmMessage> alarmMessageList = new ArrayList<>(30);
windows.entrySet().forEach(entry -> {
一个指标元信息
MetaInAlarm meta = entry.getKey();
一个指标窗口
Window window = entry.getValue();
窗口内部告警检测
AlarmMessage alarmMessage = window.checkAlarm();
存在告警消息
if (alarmMessage != AlarmMessage.NONE) {
alarmMessage.setScopeId(meta.getScopeId());
alarmMessage.setScope(meta.getScope());
alarmMessage.setName(meta.getName());
alarmMessage.setId0(meta.getId0());
alarmMessage.setId1(meta.getId1());
alarmMessage.setRuleName(this.ruleName);
alarmMessage.setAlarmMessage(formatter.format(meta));
alarmMessage.setStartTime(System.currentTimeMillis());
alarmMessageList.add(alarmMessage);
}
});
return alarmMessageList;
}
源码分析—Window[核心]
- 核心,提供属性values存储agent上报的数据
- 核心,提供checkAlarm检测告警
- 核心,提供isMatch执行告警规则匹配
- 核心,提供moveTo滑动固定窗口
moveTo
原理图
- 当current大于endtime 则需要移动窗口
- 并更新endTime = current
- 滑动的实质是修改values链表,删除头节点,添加尾节点
- 窗口内部是一个链表,每个节点表示[1分钟]的上报数据[Metrics]
- 如下图,原来的窗口数据完全被丢弃,新窗口根据current时间直接滑动出原来的窗口,然后更新endTime到current位置
源码
根据传入的时间和endTime的差值更新values 简单说就是固定窗口随时间滑动
/**
 * Slides this fixed-size window so that it ends at {@code current}.
 *
 * <p>Per the original notes, each element of {@code values} stands for one minute. The number of
 * whole minutes between the present {@code endTime} and {@code current} decides what happens:
 * <ul>
 *   <li>no end time yet: initialise the window and end it at {@code current};</li>
 *   <li>zero or negative: {@code current} is not past the window end, nothing changes;</li>
 *   <li>more than {@code values.size()}: the whole window is stale, so it is re-initialised;</li>
 *   <li>otherwise: that many of the oldest slots are dropped and as many empty slots appended.</li>
 * </ul>
 *
 * @param current the time the window should end at
 */
public void moveTo(LocalDateTime current) {
    lock.lock();
    try {
        // First use: there is no previous end time to measure from.
        if (endTime == null) {
            init();
            endTime = current;
            return;
        }

        int elapsedMinutes = Minutes.minutesBetween(endTime, current).getMinutes();
        // current lies at or before the window end: the end time does not move.
        if (elapsedMinutes <= 0) {
            return;
        }

        if (elapsedMinutes > values.size()) {
            // current is beyond the whole window: every existing slot is too old, start over.
            init();
        } else {
            // Only part of the window has expired: drop the oldest slot and append an empty one,
            // once per elapsed minute; the newer slots are kept.
            int remaining = elapsedMinutes;
            while (remaining > 0) {
                values.removeFirst();
                values.addLast(null);
                remaining--;
            }
        }
        endTime = current;
    } finally {
        lock.unlock();
    }
}
checkAlarm
- isMatch匹配告警规则成功还需要检测counter是否达到countThreshold次数
- 同时还需要判断是否不在静默抑制期内
- 只有满足告警规则匹配,匹配次数,以及大于静默时间才会触发通知
public AlarmMessage checkAlarm() {
存在告警匹配
if (isMatch()) { // 产生阈值超出
经历多次检测 counter才会满足应告警次数大于countThreshold,才会触发告警通知
counter++;
满足告警
if (counter >= countThreshold && silenceCountdown < 1) {
silenceCountdown = silencePeriod;
返回告警信息
AlarmMessage message = new AlarmMessage();
return message;
} else {
减少抑制周期
silenceCountdown--;
}
} else {
// 抑制周期减少
silenceCountdown--;
if (counter > 0) {
counter--;
}
}
return AlarmMessage.NONE;
}
总结
- alarm是分钟级别,所以告警为准实时,发生问题到告警会存在滞后性