----alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:465'        # 163 mail SMTP server and port
  smtp_from: 'xxxxxxxxxx@163.com'           # sender address
  smtp_auth_username: 'xxxxxxxxxx@163.com'  # SMTP auth username
  smtp_auth_password: 'xxxxxxxxxxx'         # mailbox authorization code, not the login password
  smtp_require_tls: false                   # whether to enable TLS
route:
  group_by: ['alertname']   # group alerts by alert name
  group_wait: 5s            # alerts for the same group arriving within this window are merged into one notification to the receiver
  group_interval: 5s        # interval between notifications for the same group
  repeat_interval: 5m       # resend interval; if the alert is not resolved within this time, it is sent again
  receiver: 'email'         # default receiver
receivers:
  - name: 'email'           # receiver definition; the name must match the receiver referenced in the route
    email_configs:
      - to: 'xxxxxxxxxxxxx@163.com'  # recipient address; list multiple recipients one per line
        send_resolved: true          # whether to send a notification when an alert is resolved
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
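Before restarting Alertmanager, the file can be checked for syntax errors with amtool, which ships with Alertmanager. A minimal check, assuming amtool is on the PATH, alertmanager.yml is in the current directory, and Alertmanager listens on the placeholder address used below:

# validate the Alertmanager configuration
amtool check-config alertmanager.yml
# apply it without a full restart; Alertmanager also reloads on SIGHUP
curl -X POST http://xxx.xxx.xxx:9093/-/reload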
----prometheus.yml
# my global config
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - xxx.xxx.xxx:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - /opt/prometheus-2.37.0.linux-amd64/rules/*.yml
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["xxx.xxx.xxx:9090"]
  - job_name: "node_export"
    static_configs:
      - targets: ["xxx.xxx.xxx:9100", "xxx.xxx.xxx:9100", "xxx.xxx.xxx:9100"]
----rules.yml
# placed under /opt/prometheus-2.37.0.linux-amd64/rules/ so it is picked up by rule_files above
groups:
  - name: node-alert
    rules:
      - alert: node status is DOWN
        expr: up{job="node_export"} == 0
        for: 2m
        labels:
          severity: emergency
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "node: {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 2 minutes"
          value: "{{ $value }}"
  - name: mem-used
    rules:
      - alert: node mem_used > 50%
        expr: (((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100) > 50
        for: 2m
        labels:
          severity: emergency
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "node: {{ $labels.instance }} mem_used > 50%"
          description: "memory usage on {{ $labels.instance }} has been above 50% for more than 2 minutes"
          value: "{{ $value }}"
  - name: disk-used
    rules:
      - alert: node disk_used > 50%
        expr: ((node_filesystem_size_bytes - node_filesystem_free_bytes) * 100 / (node_filesystem_avail_bytes + (node_filesystem_size_bytes - node_filesystem_free_bytes))) > 50
        for: 2m
        labels:
          severity: emergency
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "node: {{ $labels.instance }} disk_used > 50%"
          description: "disk usage on {{ $labels.instance }} has been above 50% for more than 2 minutes"
          value: "{{ $value }}"
Configuring multiple routes and multiple receivers
----alertmanager.yml
route:
  group_by: ['alertname']    # group notifications by alert name
  group_wait: 30s
  group_interval: 31m
  repeat_interval: 1h
  receiver: 'service'        # default receiver
  routes:
    - receiver: "kafka_consumergroup_lag"
      match_re:
        service: AA|BB       # label set in the rule files; alerts matching it go to this receiver
    - receiver: "disk_usage"
      match_re:
        machine_room: aa|bb|cc
receivers:
  - name: 'disk_usage'       # receiver name
    email_configs:           # email notifications
      - to: 'xxx.com'        # recipient address
        send_resolved: false # whether to send a notification when the alert is resolved
        headers: { Subject: "elk_server_disk_high_usage" }  # different rules go to different receivers, each with its own mail subject
  - name: 'kafka_consumergroup_lag'
    email_configs:
      - to: 'xxx.com'
        send_resolved: false
        headers: { Subject: "elk_kafka_consumergroup_lag too high" }
  - name: 'service'
    email_configs:
      - to: 'xxx.com'
        send_resolved: false
        headers: { Subject: "elk_service_down" }
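To confirm that an alert carrying a given label actually reaches the intended receiver, amtool can print and exercise the routing tree. A sketch, assuming amtool is installed and the file above is saved as alertmanager.yml (flag names may vary slightly between amtool versions):

# show the routing tree
amtool config routes show --config.file=alertmanager.yml
# test routing for an alert labelled service=AA; expected receiver: kafka_consumergroup_lag
amtool config routes test --config.file=alertmanager.yml service=AA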