1:安装alertmanager
# Download the Alertmanager release tarball into /usr/local/src, unpack it,
# and install the extracted directory as /usr/local/alertmanager.
# The version is defined once so that an upgrade only needs a single edit.
AM_VERSION=0.25.0
AM_PKG="alertmanager-${AM_VERSION}.linux-amd64"
cd /usr/local/src
wget "https://github.com/prometheus/alertmanager/releases/download/v${AM_VERSION}/${AM_PKG}.tar.gz"
tar -zxvf "${AM_PKG}.tar.gz"
mv "${AM_PKG}" /usr/local/alertmanager
2:编辑alertmanager配置文件
vim /usr/local/alertmanager/alertmanager.yml
# Alertmanager configuration: every alert is routed to one e-mail receiver.
global:
  resolve_timeout: 2m
  # NOTE(review): TLS is not enforced here (port 25, smtp_require_tls: false),
  # so the credentials below may travel unencrypted — confirm this is acceptable.
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '2xxx@qq.com'
  smtp_auth_username: '2xxx@qq.com'
  smtp_auth_password: 'xxxxxxxxxx'  # mailbox authorization code, NOT the mailbox login password
  smtp_require_tls: false
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 4m  # interval at which a still-firing alert is sent again
  receiver: 'mail'  # must match one of the names listed under receivers
receivers:
  - name: 'mail'  # the receiver referenced by route.receiver
    email_configs:
      - to: '2xxxxxx@qq.com'  # address that receives the alert e-mails
        headers:
          Subject: "[WARN] alertmanager 报警邮件"
3:编辑prometheus配置文件
vim /usr/local/prometheus/prometheus.yml
........
........
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093  # change to the Alertmanager host and port

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/usr/local/prometheus/rules/*.yml"
........
........
4:启动alertmanager
# Start Alertmanager in the background from its install directory, pointing it
# at the configuration file edited in step 2. The binary is invoked by a
# relative path, so the preceding cd is required.
cd /usr/local/alertmanager
nohup ./alertmanager --config.file="/usr/local/alertmanager/alertmanager.yml" &
5:设置报警规则
检测节点是否在线
vim /usr/local/prometheus/rules/instance.yml
# Alerting rule: fires when a scrape target has reported up == 0 for 1 minute.
groups:
  - name: example
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "实例 {{ $labels.instance }} 宕机"
          description: "{{ $labels.instance }} 任务 {{ $labels.job }} 已宕机 1 分钟"
检测cpu、内存、磁盘使用情况
vim /usr/local/prometheus/rules/cpu.rules.yml
# Alerting rules for filesystem, memory and CPU usage; each fires as a warning
# once its expression has stayed above 80% for 2 minutes.
groups:
  - name: node.rules
    rules:
      # Used percentage of each ext4/xfs filesystem.
      - alert: NodeFilesystemUsage
        expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: {{$labels.mountpoint }} 分区使用过高"
          description: "{{$labels.instance}}: {{$labels.mountpoint }} 分区使用大于 80% (当前值: {{ $value }})"
      # Used memory percentage; free, cached and buffer memory count as unused.
      - alert: NodeMemoryUsage
        expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: 内存使用过高"
          description: "{{$labels.instance}}: 内存使用大于 80% (当前值: {{ $value }})"
      # Non-idle CPU percentage averaged per instance. rate() is used instead
      # of irate() because irate() only looks at the last two samples, so a
      # brief dip can reset the "for" clause and keep the alert from firing.
      - alert: NodeCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: CPU使用过高"
          description: "{{$labels.instance}}: CPU使用大于 80% (当前值: {{ $value }})"
6:查看报警
设置完成并重启(或重新加载)Prometheus 使新规则生效后,大约等待 1-2 分钟就会触发报警(我截图的时候报警已经恢复了)。
同时也会收到报警邮件。