prometheus监控es,同样采用exporter的方案。
项目地址:
elasticsearch_exporter:https://github.com/justwatchcom/elasticsearch_exporter
默认端口 9114
1、安装部署
【1.0】封装成系统服务、一键部署
前提:把二进制包 elasticsearch_exporter-1.1.0.linux-amd64.tar.gz 复制到脚本所在的当前目录
vim es_exporter_install.sh
#!/bin/bash
#######################################
# Detect the local Elasticsearch HTTP endpoint and install the exporter files.
# Globals:
#   es_path_config, configfile (written)
#   ip, port (written; used later to build the exporter's --es.uri)
# Outputs:
#   Error messages on stderr.
# Exits:
#   1 when ip/port cannot be determined or the archive cannot be installed.
#######################################
init(){
  local conf_dir conf_file

  # Elasticsearch is started with -Des.path.conf=<dir>; take <dir> from the
  # command line of the running node (first match only).
  conf_dir="$(ps -ef | grep elastic | grep "Des.path.conf" | tr ' ' '\n' \
    | grep "Des.path.conf" | head -n 1 | awk -F'=' '{print $2}')"
  es_path_config="${conf_dir}/"
  configfile=elasticsearch.yml
  conf_file="${es_path_config}${configfile}"

  # Only active (uncommented) keys are considered; a commented
  # "#network.host: ..." line would otherwise be picked up as a second value.
  ip="$(grep -E '^[[:space:]]*network\.host[[:space:]]*:' "${conf_file}" 2>/dev/null \
    | head -n 1 | awk -F":" '{print $2}' | sed 's/[[:space:]]//g')"
  port="$(grep -E '^[[:space:]]*http\.port[[:space:]]*:' "${conf_file}" 2>/dev/null \
    | head -n 1 | awk -F":" '{print $2}' | sed 's/[[:space:]]//g')"

  if [[ -z "${ip}" || -z "${port}" ]]; then
    echo "init is error,can't get the es's ip and port!" >&2
    exit 1
  fi

  # A wildcard bind address is not a usable destination; talk to loopback.
  if [[ "${ip}" == '0.0.0.0' ]]; then
    ip=127.0.0.1
  fi

  # Unpack only when the exporter is not installed yet, so a re-run does not
  # nest a second copy inside the existing directory.
  if [[ ! -x /usr/local/elasticsearch_exporter/elasticsearch_exporter ]]; then
    tar -zxf elasticsearch_exporter-1.1.0.linux-amd64.tar.gz || {
      echo "init is error,can't unpack elasticsearch_exporter-1.1.0.linux-amd64.tar.gz!" >&2
      exit 1
    }
    mv elasticsearch_exporter-1.1.0.linux-amd64 /usr/local/elasticsearch_exporter || {
      echo "init is error,can't move the exporter to /usr/local/elasticsearch_exporter!" >&2
      exit 1
    }
  fi

  # Create the service account only if it does not exist yet.
  getent group prometheus >/dev/null 2>&1 || groupadd prometheus
  id -u prometheus >/dev/null 2>&1 \
    || useradd -g prometheus -m -d /var/lib/prometheus -s /sbin/nologin prometheus
  chown -R prometheus:prometheus /usr/local/elasticsearch_exporter
}
#######################################
# Register the exporter as a system service and start it.
# systemd is used when systemctl is available (el7 and later); otherwise a
# SysV init script managed by chkconfig is generated (el6).
# Globals:
#   ip, port (read) - Elasticsearch HTTP endpoint baked into the service
# Arguments:
#   $1 - optional staging root prepended to the files written. When given,
#        the files are only written and the service is not activated.
#        Default: empty (install into the live system and start the service).
# Returns:
#   0 on success, 1 when no supported service manager is found.
#######################################
run(){
  local root="${1:-}"
  local unit_file="${root}/lib/systemd/system/es_exporter.service"
  local init_file="${root}/etc/init.d/es_exporter"

  if command -v systemctl >/dev/null 2>&1; then
    # \$MAINPID is escaped so that systemd, not this shell, expands it;
    # ${ip} and ${port} are meant to be expanded now.
    cat << eof >"${unit_file}"
[Unit]
Description=The es_exporter
After=network.target
[Service]
PrivateTmp=true
Type=simple
User=prometheus
ExecStart=/usr/local/elasticsearch_exporter/elasticsearch_exporter --es.uri=http://${ip}:${port}
Restart=on-failure
ExecStop=/bin/kill -s QUIT \$MAINPID
[Install]
WantedBy=multi-user.target
eof
    [[ -n "${root}" ]] && return 0
    systemctl daemon-reload
    systemctl start es_exporter
    systemctl enable es_exporter
  elif command -v chkconfig >/dev/null 2>&1; then
    # The endpoint is written into the init script at install time: at boot
    # the script may run before Elasticsearch is up, so it cannot be looked
    # up from the process list then. Every \$ below is evaluated when the
    # init script runs, not now.
    cat << eof >"${init_file}"
#!/bin/bash
# chkconfig: 2345 10 90
# description: es's exporter
exporter=/usr/local/elasticsearch_exporter/elasticsearch_exporter
logfile=/var/log/es_exporter.log
es_uri=http://${ip}:${port}

start(){
  touch "\$logfile"
  chown prometheus:prometheus "\$logfile"
  su prometheus -s /bin/bash -c "\$exporter --es.uri=\$es_uri >> \$logfile 2>&1 &"
}

stop(){
  pkill -u prometheus -f "\$exporter"
}

case "\$1" in
  start)   start ;;
  stop)    stop ;;
  restart) stop; sleep 1; start ;;
  status)  pgrep -u prometheus -f "\$exporter" >/dev/null ;;
  *)       echo "Usage: \$0 {start|stop|restart|status}" >&2; exit 2 ;;
esac
eof
    # The init script is executed by root, so it stays owned by root; the
    # unprivileged service account must not be able to rewrite it.
    chmod +x "${init_file}"
    [[ -n "${root}" ]] && return 0
    chkconfig --add es_exporter
    chkconfig es_exporter on
    service es_exporter start
  else
    echo "your os has neither systemd nor chkconfig (not rel7/rel6?),operator fail!" >&2
    return 1
  fi
}
# Entry point of the installer: calls init first, then run, in that order.
main() {
  init; run
}
main
# Run the installer with bash (the script uses bash-only syntax), then check
# that the exporter process is running.
bash es_exporter_install.sh
ps -ef|grep elasticsearch_exporter
【1.2】简便一键部署、脚本方式
前提:把 elasticsearch_exporter 可执行文件放到脚本所在的当前目录
vim install_es.sh
#!/bin/bash
# Quick installer for elasticsearch_exporter.
# Copies the exporter binary from the current directory to /bin, reads the
# local Elasticsearch HTTP endpoint from the running node's elasticsearch.yml,
# starts the exporter in the background and registers it in /etc/rc.local.
# Prerequisite: the elasticsearch_exporter binary is in the current directory.

exporter_bin=/bin/elasticsearch_exporter
exporter_log=/var/log/es_exporter.log

# On a re-run the binary has already been moved, so only move it when present.
if [[ -f elasticsearch_exporter ]]; then
  mv elasticsearch_exporter "${exporter_bin}" || exit 1
fi
chmod +x "${exporter_bin}" || exit 1

# Elasticsearch is started with -Des.path.conf=<dir>; take <dir> from the
# command line of the running node (first match only).
es_path_config="$(ps -ef | grep elastic | grep "Des.path.conf" | tr ' ' '\n' \
  | grep "Des.path.conf" | head -n 1 | awk -F'=' '{print $2}')/"
configfile=elasticsearch.yml

# Only active (uncommented) keys are considered; a commented
# "#network.host: ..." line would otherwise be picked up as a second value.
ip="$(grep -E '^[[:space:]]*network\.host[[:space:]]*:' "${es_path_config}${configfile}" 2>/dev/null \
  | head -n 1 | awk -F":" '{print $2}' | sed 's/[[:space:]]//g')"
port="$(grep -E '^[[:space:]]*http\.port[[:space:]]*:' "${es_path_config}${configfile}" 2>/dev/null \
  | head -n 1 | awk -F":" '{print $2}' | sed 's/[[:space:]]//g')"

if [[ -z "${ip}" || -z "${port}" ]]; then
  # Double quotes: the apostrophes in the message would otherwise end a
  # single-quoted string and garble the text.
  echo "init is error,can't get the es's ip and port!" >&2
  exit 1
fi

# A wildcard bind address is not a usable destination; talk to loopback.
if [[ "${ip}" == '0.0.0.0' ]]; then
  ip=127.0.0.1
fi

start_cmd="nohup ${exporter_bin} --es.uri=http://${ip}:${port} --web.listen-address=0.0.0.0:9114 >>${exporter_log} 2>&1 &"
printf '%s\n' "${start_cmd}"
nohup "${exporter_bin}" --es.uri="http://${ip}:${port}" --web.listen-address="0.0.0.0:9114" >> "${exporter_log}" 2>&1 &

# Register the start command for boot only once, so repeated runs do not
# pile up duplicate lines (and duplicate exporters) in rc.local.
# NOTE(review): on systemd hosts rc.local may need to be executable to run at boot - confirm.
if ! grep -qF -- "${exporter_bin} --es.uri=" /etc/rc.local 2>/dev/null; then
  printf '%s\n' "${start_cmd}" >>/etc/rc.local
fi

# The [e] keeps grep from matching its own command line.
ps -ef | grep '[e]lasticsearch_exporter'
【1.3】详细步骤
接着在每台需要监控的 ES 主机上分别进行如下配置:
# Download the v1.1.0 linux-amd64 release tarball of the exporter.
wget https://github.com/justwatchcom/elasticsearch_exporter/releases/download/v1.1.0/elasticsearch_exporter-1.1.0.linux-amd64.tar.gz
# Unpack it in the current directory.
tar -zxf elasticsearch_exporter-1.1.0.linux-amd64.tar.gz
# Move the unpacked directory to its install location.
mv elasticsearch_exporter-1.1.0.linux-amd64 /usr/local/elasticsearch_exporter
创建用户等
# Create the prometheus group and user only when they do not exist yet, so
# the steps can be repeated without errors.
getent group prometheus >/dev/null 2>&1 || groupadd prometheus
id -u prometheus >/dev/null 2>&1 || useradd -g prometheus -m -d /var/lib/prometheus -s /sbin/nologin prometheus
# "user:group" is the supported separator; the dotted form is a legacy spelling.
chown -R prometheus:prometheus /usr/local/elasticsearch_exporter
启动监控客户端:
# Use the absolute path: the exporter was moved to /usr/local/elasticsearch_exporter
# above and the current directory has not been changed to it.
nohup /usr/local/elasticsearch_exporter/elasticsearch_exporter --web.listen-address ":9114" --es.uri http://192.168.75.21:9200 &
使用systemd管理:
# Write (not append) the unit file, so repeating this step does not duplicate
# its content. The quoted delimiter keeps $MAINPID literal for systemd to
# expand; unquoted, the shell would replace it with an empty string.
cat << 'eof' >/lib/systemd/system/es_exporter.service
[Unit]
Description=The es_exporter
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/usr/local/elasticsearch_exporter/elasticsearch_exporter --web.listen-address ":9114" --es.uri http://192.168.75.21:9200
Restart=on-failure
ExecStop=/bin/kill -s QUIT $MAINPID
[Install]
WantedBy=multi-user.target
eof
启动:
# Make systemd re-read its unit files so the es_exporter unit is picked up.
systemctl daemon-reload
# Start the exporter now.
systemctl start es_exporter
# Start it automatically at boot.
systemctl enable es_exporter
查看metrics:
curl 127.0.0.1:9114/metrics
2、配置 prometheus.yml 添加监控目标
vim /usr/local/prometheus/prometheus.yml
# Scrape job for elasticsearch_exporter; add it under scrape_configs.
# The target port is the exporter's listen port (9114 in the deployment
# steps above). Use 9308 only if the exporter was started with
# --web.listen-address ":9308".
- job_name: 'elasticsearch'
  scrape_interval: 60s
  scrape_timeout: 30s
  metrics_path: "/metrics"
  static_configs:
    - targets: ['192.168.75.21:9114']
      labels:
        service: elasticsearch
重启服务。
systemctl restart prometheus
或者通过命令热加载:
curl -XPOST localhost:9090/-/reload
3、配置 Grafana 的模板
模板通过json文件进行导入,文件就在解压的包内。
参考地址:https://shenshengkun.github.io/posts/550bdf86.html
或者通过如下ID进行导入:2322
以及其他。
4、开启认证的启动方式
如果es开启了认证,那么启动的时候需要将用户名密码加载进去:
elasticsearch_exporter --web.listen-address ":9308" --es.uri http://username:password@192.168.75.21:9200 &
其中使用的是 monitoring 用户的用户名和密码。
当然,除去这种命令行的启动方式之外,还可以像上边一样,基于systemd进行管理,只需将认证的参数信息写入到如下内容当中:
参考网址:https://github.com/justwatchcom/elasticsearch_exporter
cat /lib/systemd/system/es_exporter.service
# The credentials in --es.uri are stored in clear text in this unit file.
[Unit]
Description=The es_exporter
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/usr/local/elasticsearch_exporter/elasticsearch_exporter --web.listen-address ":9308" --es.uri=http://username:password@192.168.75.21:9200
Restart=on-failure
[Install]
WantedBy=multi-user.target
5、【最佳实践】es_alert.yml
# Prometheus alerting rules for Elasticsearch, built on elasticsearch_exporter metrics.
groups:
  - name: ES告警
    rules:
      - alert: ES-集群状态变红
        expr: elasticsearch_cluster_health_status{color="red"}==1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "主/副本分片分配有误,该问题发生在集群:{{ $labels.cluster }}"

      - alert: ES-集群状态变黄
        expr: elasticsearch_cluster_health_status{color="yellow"}==1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "主/副本分片分配有误,该问题发生在集群:{{ $labels.cluster }}."

      # Heap usage in percent, rounded to 0.01; the text matches the 85 threshold.
      - alert: ES-JVM堆内存使用过高
        expr: round(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}*100,0.01)>85
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "JVM堆内存使用率超过85%\n当前:{{ $value }}"

      - alert: ES-集群健康状态获取失败
        expr: elasticsearch_cluster_health_up!=1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "该ES节点,获取集群监控状态失败 in cluster:[ {{ $labels.cluster }} ]"

      # The threshold (5) and the total in the text (7) are specific to one
      # cluster; adjust both to the size of the cluster being monitored.
      - alert: ES-太少节点运行
        expr: elasticsearch_cluster_health_number_of_nodes < 5
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "ES集群运行的节点<5个(total 7) in cluster:[ {{ $labels.cluster }} ]\n当前运行节点个数:{{ $value }}"

      # GC runs per second, averaged over the 5m range of the rate().
      - alert: ES-GC平均执行次数过多
        expr: rate(elasticsearch_jvm_gc_collection_seconds_count{}[5m])>5
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "JVM GC 5m内平均执行次数>5/s in cluster:[ {{ $labels.cluster }} ]\n当前:{{ $value }}/s"

      # Seconds spent in GC per second, averaged over 5m. The expression that
      # used to be here was a node_exporter filesystem-usage query and had
      # nothing to do with GC time.
      - alert: ES-GC平均运行时间过长
        expr: rate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m])>0.3
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "ES GC 5m 内平均运行时间>0.3/s in cluster:[ {{ $labels.cluster }} ]\n当前:{{ $value }}/s"

      - alert: ES-JSON解析失败
        expr: elasticsearch_cluster_health_json_parse_failures>0
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "ES节点解析json失败数 > 0 in cluster:[ {{ $labels.cluster }} ]\n当前:{{ $value }}"

      - alert: ES-断路器触发
        expr: rate(elasticsearch_breakers_tripped{}[5m])>0
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "ES 断路器触发数 in cluster:[ {{ $labels.cluster }} ]> 0\n当前:{{ $value }}"

      - alert: ES-等待进程过多
        expr: elasticsearch_cluster_health_number_of_pending_tasks>10
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "ES pending_tasks in cluster:[ {{ $labels.cluster }} ] > 10\n当前:{{ $value }}"

      # The node count is a gauge, so increase() (meant for counters) is not
      # usable: it treats every drop as a counter reset. Compare the current
      # value with the value one minute ago instead; the difference is the
      # exact number of nodes added.
      - alert: ES-增加集群节点
        expr: (elasticsearch_cluster_health_number_of_nodes - elasticsearch_cluster_health_number_of_nodes offset 1m) > 0
        for: 1s
        labels:
          severity: warning
        annotations:
          description: "ES-增加集群节点 in cluster:[ {{ $labels.cluster }} ]\n增加个数:{{ $value }}"

      # Mirror of the rule above: value one minute ago minus current value,
      # so {{ $value }} is the (positive) number of nodes removed.
      - alert: ES-减少集群节点
        expr: (elasticsearch_cluster_health_number_of_nodes offset 1m - elasticsearch_cluster_health_number_of_nodes) > 0
        for: 1s
        labels:
          severity: warning
        annotations:
          description: "ES-减少集群节点 in cluster:[ {{ $labels.cluster }} ]\n减少个数:{{ $value }}"