相关参考地址:
https://github.com/prometheus-operator/prometheus-operator.git
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
1.概述
Kube Prometheus Stack是一个基于Prometheus和Grafana的监控解决方案,用于监控Kubernetes集群中的各种资源和服务.
Prometheus Stack,通常指的是 Prometheus、Grafana 以及相关集成组件的统称。在实际的业务场景中,Prometheus 和 Grafana 往往协同工作进行监控渲染: Prometheus 负责作为数据源获取数据,并将该数据提供给 Grafana,Grafana 则借助其直观的仪表板进行可视化数据展示.
2.prometheus-operator 解决问题
prometheus-operator使用K8s的CRD,简化了Prometheus、Alertmanager以及相关监控组件的部署和配置。
3.自定义CRD介绍
CustomResourceDefinitions
- Prometheus:定义了prometheus的部署
- Alertmanager:定义了Alertmanager的部署
- Probe:prometheus的采集规则,目标地址为静态,即地址写死到配置,用于采集集群外部服务
- ServiceMonitor:prometheus的采集规则,使用endpoints服务发现的方式找到目标地址,用于采集集群内部服务
- PodMonitor:prometheus的采集规则,使用pod服务发现的方式找到目标地址,用于采集集群内部服务
- PrometheusRule:prometheus的告警规则
- AlertmanagerConfig:alertmanager的配置
4.安装部署prometheus-stack
4.1 Prerequisites 环境依赖
Kubernetes 1.19+
Helm 3+
K8S实验环境: v1.26.0
4.2 添加helm 源
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update prometheus-community
4.3 Install Helm Chart (prometheus-stack)
注意:
1.域名配置依赖nginx-ingress 服务需要提前进行部署nginx-ingress (ingressClass不要修改)
部署nginx-ingress
kubectl create ns ingress-nginx
helm install ingress-nginx ingress-nginx/ingress-nginx \
--namespace ingress-nginx \
--set controller.metrics.enabled=true \
--set controller.metrics.serviceMonitor.enabled=true
简单demo版本
# 查看定制特殊配置项
helm show values prometheus-community/kube-prometheus-stack
#创建监控指定ns
kubectl create ns monitoring
#helm 默认安装
helm install prometheus-stack prometheus-community/kube-prometheus-stack -n monitoring
#携带定制 配置项安装 检查语法参数 (开启相关ingress 域名并配置定制Http域名)
可附加 --dry-run=server 或 --dry-run=client 参数检查语法
helm install test-stack -n monitoring prometheus-community/kube-prometheus-stack \
--set alertmanager.ingress.enabled=true \
--set alertmanager.ingress.ingressClassName="nginx" \
--set alertmanager.ingress.hosts[0]=alertmanager.k8s.local \
--set grafana.ingress.enabled=true \
--set grafana.ingress.ingressClassName="nginx" \
--set grafana.adminPassword="admin" \
--set grafana.ingress.hosts[0]=grafana.k8s.local \
--set prometheus.ingress.enabled=true \
--set prometheus.ingress.ingressClassName="nginx" \
--set prometheus.ingress.hosts[0]=prometheus.k8s.local
#更多参数请参考
helm show values prometheus-community/kube-prometheus-stack
日常使用组件版本
helm values 文件 prometheus-stack-values.yaml
alertmanager:
ingress:
enabled: true
ingressClassName: "nginx"
hosts:
- alertmanager.k8s.local
grafana:
adminPassword: "admin"
ingress:
enabled: true
ingressClassName: "nginx"
hosts:
- grafana.k8s.local
prometheus:
ingress:
enabled: true
ingressClassName: "nginx"
hosts:
- prometheus.k8s.local
thanosService:
enabled: true
prometheusSpec:
thanos:
objectStorageConfig:
name: thanos-objstore-config
key: thanos.yaml
serviceMonitorSelector:
matchLabels:
release: prometheus-stack
thanos:
create: true
storeGateway:
enabled: true
bucketweb:
enabled: true
compactor:
enabled: true
query:
enabled: true
ingress:
enabled: true
ingressClassName: "nginx"
hosts:
- thanos-query.k8s.local
queryFrontend:
enabled: true
ingress:
enabled: true
ingressClassName: "nginx"
hosts:
- thanos-query-frontend.k8s.local
ruler:
enabled: true
ingress:
enabled: true
ingressClassName: "nginx"
hosts:
- thanos-ruler.k8s.local
sidecar:
enabled: true
grpcServerTlsConfig:
enabled: true
#安装&升级
helm upgrade --install prometheus-stack prometheus-community/kube-prometheus-stack --namespace monitoring --values prometheus-stack-values.yaml
4.4 安装后服务-验证检查
#helm 检查安装版本信息
[root@k8s-master kube-prometheus-stack]# helm list -n monitoring
NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
prometheus-adapter monitoring 1 2024-08-25 20:16:39.579229609 +0800 CST deployed prometheus-adapter-4.11.0 v0.12.0
prometheus-blackbox-exporter monitoring 8 2024-08-26 01:18:29.585619494 +0800 CST deployed prometheus-blackbox-exporter-9.0.0 v0.25.0
prometheus-pushgateway monitoring 1 2024-08-25 20:16:37.056822102 +0800 CST deployed prometheus-pushgateway-2.14.0 v1.9.0
prometheus-stack monitoring 2 2024-08-26 00:23:53.115153858 +0800 CST deployed kube-prometheus-stack-62.3.0 v0.76.0
#检查安装服务启动
[root@k8s-master kube-prometheus-stack]# kubectl get all -n monitoring
NAME READY STATUS RESTARTS AGE
pod/alertmanager-prometheus-stack-kube-prom-alertmanager-0 2/2 Running 0 5h12m
pod/prometheus-adapter-55f67cd7d4-nll6h 1/1 Running 0 5h12m
pod/prometheus-blackbox-exporter-77bc56cbcb-wxcp9 1/1 Running 0 75m
pod/prometheus-prometheus-stack-kube-prom-prometheus-0 2/2 Running 0 75m
pod/prometheus-pushgateway-75cd446bc8-6sndk 1/1 Running 0 5h12m
pod/prometheus-stack-grafana-694c67f857-sr7dv 3/3 Running 0 5h12m
pod/prometheus-stack-kube-prom-operator-6d4c465dcd-wr4zq 1/1 Running 0 3h11m
pod/prometheus-stack-kube-state-metrics-5d46f7859d-57qcb 1/1 Running 0 5h12m
pod/prometheus-stack-prometheus-node-exporter-2brrr 1/1 Running 0 5h12m
pod/prometheus-stack-prometheus-node-exporter-bkmt6 1/1 Running 0 5h12m
pod/prometheus-stack-prometheus-node-exporter-jc298 1/1 Running 0 5h12m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/alertmanager-operated ClusterIP None <none> 9093/TCP,9094/TCP,9094/UDP 5h12m
service/prometheus-adapter ClusterIP 10.107.139.84 <none> 443/TCP 5h12m
service/prometheus-blackbox-exporter ClusterIP 10.109.3.237 <none> 9115/TCP 5h12m
service/prometheus-operated ClusterIP None <none> 9090/TCP 5h12m
service/prometheus-pushgateway ClusterIP 10.96.196.200 <none> 9091/TCP 5h12m
service/prometheus-stack-grafana ClusterIP 10.97.99.203 <none> 80/TCP 5h12m
service/prometheus-stack-kube-prom-alertmanager ClusterIP 10.111.179.254 <none> 9093/TCP,8080/TCP 5h12m
service/prometheus-stack-kube-prom-operator ClusterIP 10.98.95.58 <none> 443/TCP 5h12m
service/prometheus-stack-kube-prom-prometheus ClusterIP 10.98.217.181 <none> 9090/TCP,8080/TCP 5h12m
service/prometheus-stack-kube-prom-thanos-discovery ClusterIP None <none> 10901/TCP,10902/TCP 5h12m
service/prometheus-stack-kube-state-metrics ClusterIP 10.102.160.121 <none> 8080/TCP 5h12m
service/prometheus-stack-prometheus-node-exporter ClusterIP 10.100.228.133 <none> 9100/TCP 5h12m
NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
daemonset.apps/prometheus-stack-prometheus-node-exporter 3 3 3 3 3 kubernetes.io/os=linux 5h12m
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/prometheus-adapter 1/1 1 1 5h12m
deployment.apps/prometheus-blackbox-exporter 1/1 1 1 5h12m
deployment.apps/prometheus-pushgateway 1/1 1 1 5h12m
deployment.apps/prometheus-stack-grafana 1/1 1 1 5h12m
deployment.apps/prometheus-stack-kube-prom-operator 1/1 1 1 5h12m
deployment.apps/prometheus-stack-kube-state-metrics 1/1 1 1 5h12m
NAME DESIRED CURRENT READY AGE
replicaset.apps/prometheus-adapter-55f67cd7d4 1 1 1 5h12m
replicaset.apps/prometheus-blackbox-exporter-5567c558cf 0 0 0 138m
replicaset.apps/prometheus-blackbox-exporter-55c969f74f 0 0 0 5h12m
replicaset.apps/prometheus-blackbox-exporter-5766fc865 0 0 0 133m
replicaset.apps/prometheus-blackbox-exporter-77bc56cbcb 1 1 1 132m
replicaset.apps/prometheus-pushgateway-75cd446bc8 1 1 1 5h12m
replicaset.apps/prometheus-stack-grafana-694c67f857 1 1 1 5h12m
replicaset.apps/prometheus-stack-kube-prom-operator-6d4c465dcd 1 1 1 5h12m
replicaset.apps/prometheus-stack-kube-state-metrics-5d46f7859d 1 1 1 5h12m
NAME READY AGE
statefulset.apps/alertmanager-prometheus-stack-kube-prom-alertmanager 1/1 5h12m
statefulset.apps/prometheus-prometheus-stack-kube-prom-prometheus 1/1 5h12m
#域名检查
[root@k8s-master kube-prometheus-stack]# kubectl get ing -n monitoring |grep -E "*.k8s.local"
prometheus-blackbox-exporter nginx blackbox.k8s.local 80 139m
prometheus-stack-grafana nginx grafana.k8s.local 80 5h13m
prometheus-stack-kube-prom-alertmanager nginx alertmanager.k8s.local 80 5h13m
prometheus-stack-kube-prom-prometheus nginx prometheus.k8s.local 80 5h13m
#检查生成的CRD资源
[root@]# kubectl get crd |grep monitoring
[root@k8s-master kube-prometheus-stack]# kubectl get crd |grep monitoring
alertmanagerconfigs.monitoring.coreos.com 2024-07-29T14:57:43Z
alertmanagers.monitoring.coreos.com 2024-07-29T14:57:44Z
podmonitors.monitoring.coreos.com 2024-07-29T14:57:44Z
probes.monitoring.coreos.com 2024-07-29T14:57:44Z
prometheusagents.monitoring.coreos.com 2024-07-29T14:57:44Z
prometheuses.monitoring.coreos.com 2024-07-29T14:57:44Z
prometheusrules.monitoring.coreos.com 2024-07-29T14:57:44Z
scrapeconfigs.monitoring.coreos.com 2024-07-29T14:57:44Z
servicemonitors.monitoring.coreos.com 2024-07-29T14:57:45Z
thanosrulers.monitoring.coreos.com 2024-07-29T14:57:45Z
#检查集群生成的资源对象清单
[root@]# kubectl api-resources |grep -E "alertmanagerconfigs|alertmanagers|podmonitors|probes|prometheusagents|prometheuses|prometheusrules|scrapeconfigs|servicemonitors"
alertmanagerconfigs amcfg monitoring.coreos.com/v1alpha1 true AlertmanagerConfig
alertmanagers am monitoring.coreos.com/v1 true Alertmanager
podmonitors pmon monitoring.coreos.com/v1 true PodMonitor
probes prb monitoring.coreos.com/v1 true Probe
prometheusagents promagent monitoring.coreos.com/v1alpha1 true PrometheusAgent
prometheuses prom monitoring.coreos.com/v1 true Prometheus
prometheusrules promrule monitoring.coreos.com/v1 true PrometheusRule
scrapeconfigs scfg monitoring.coreos.com/v1alpha1 true ScrapeConfig
servicemonitors smon monitoring.coreos.com/v1 true ServiceMonitor
4.5 查看监控面板
5.配置常见监控方式
5.1 Probe 外部方式(选择1)
#外部方式
prometheus的采集规则,目标地址为静态,即地址写死到配置,用于采集集群外部服务
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
labels:
app: nginx-ingress
release: prometheus-stack
name: nginx-ingress
namespace: ingress-nginx
spec:
prober:
url: "10.0.0.12:10254"
scheme: http
path: "/metrics"
targets:
staticConfig:
static: ["10.0.0.12:10254", "10.0.0.13:10254"]
5.2 ServiceMonitor 采集集群内部服务(选择2)
#监控内部Service 服务
prometheus的采集规则,
使用endpoints服务发现的方式找到目标地址,用于采集集群内部服务.
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: nginx-ingress
namespace: monitoring
labels:
release: prometheus-stack
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.10.0
helm.sh/chart: ingress-nginx-4.10.0
spec:
jobLabel: app.kubernetes.io/component
endpoints:
- port: metrics
path: /metrics
interval: 10s
selector:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
namespaceSelector:
any: true
5.3 采集集群内部服务-验证
5.4 查看验证nginx-ingress 指标
#nginx 统计5分钟域名访问2xx状态码
sum(increase(nginx_ingress_controller_response_duration_seconds_count{status=~'2..'}[5m])) by (host)
#nginx 统计5分钟域名访问5xx状态码
sum(increase(nginx_ingress_controller_response_duration_seconds_count{status=~'5..'}[5m])) by (host)
#nginx/请求P 99/P95 P50延迟
histogram_quantile(0.99,sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{}[2m])) by (le,host))
histogram_quantile(0.95,sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{}[2m])) by (le,host))
histogram_quantile(0.50,sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{}[2m])) by (le,host))
5.5 配置PodMonitor示例
5.5.1 示例应用
---
apiVersion: apps/v1
kind: Deployment
metadata:
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '8080'
prometheus.io/path: '/metrics'
name: example-app
spec:
replicas: 3
selector:
matchLabels:
app: example-app
template:
metadata:
labels:
app: example-app
spec:
containers:
- name: example-app
image: quay.io/brancz/prometheus-example-app:v0.5.0
ports:
- name: web
containerPort: 8080
5.5.2 PodMonitor规则
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: example-app
labels:
release: prometheus-stack
team: frontend
spec:
selector:
matchLabels:
prometheus.io/scrape: "true"
app: example-app
podMetricsEndpoints:
- port: web
5.5.3 验证PodMonitor
5.6. 配置服务探测
Blackbox Exporter 介绍
服务探测 依赖于 Blackbox Exporter是Prometheus社区提供的官方黑盒监控解决方案,
其允许用户通过:HTTP、HTTPS、DNS、TCP以及ICMP的方式对网络进行探测。
Blackbox Exporter 部署
prometheus-blackbox-exporter.values.yaml #values 配置文件
releaseLabel: true
config:
modules:
http_2xx:
prober: http
timeout: 5s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
follow_redirects: true
preferred_ip_protocol: "ip4"
http_post_2xx:
http:
method: POST
preferred_ip_protocol: ip4
prober: http
timeout: 5s
icmp:
icmp:
preferred_ip_protocol: ip4
prober: icmp
timeout: 3s
tcp:
prober: tcp
tcp:
preferred_ip_protocol: ip4
timeout: 5s
service:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9115"
prometheus.io/path: "/metrics"
labels: {}
type: ClusterIP
port: 9115
ipDualStack:
enabled: false
ipFamilies: ["IPv6", "IPv4"]
ipFamilyPolicy: "PreferDualStack"
ingress:
enabled: true
className: "nginx"
annotations:
kubernetes.io/ingress.class: "nginx"
nginx.ingress.kubernetes.io/rewrite-target: /
hosts:
- host: blackbox.k8s.local
paths:
- path: /
pathType: ImplementationSpecific
serviceMonitor:
enabled: true
selector:
matchLabels:
app: "prometheus-blackbox-exporter"
endpoints:
- port: "http"
path: "/metrics"
interval: 30s
scrapeTimeout: 10s
defaults:
interval: 30s
scrapeTimeout: 30s
module: http_2xx
selfMonitor:
enabled: true
path: /metrics
scheme: http
interval: 30s
scrapeTimeout: 30s
targets:
- name: blackbox-exporter-target
url: https://www.baidu.com
labels:
group: example-group-https
#helm 安装
helm upgrade --install prometheus-blackbox-exporter prometheus-community/prometheus-blackbox-exporter
-f prometheus-blackbox-exporter.values.yaml -n monitoring
Blackbox Exporter 服务注册
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: prometheus-blackbox-exporter-job
namespace: monitoring
labels:
release: prometheus-stack
spec:
jobLabel: prometheus-blackbox-exporter-job
endpoints:
- port: http
path: /metrics
interval: 10s
selector:
matchLabels:
app.kubernetes.io/instance: prometheus-blackbox-exporter
app.kubernetes.io/name: prometheus-blackbox-exporter
namespaceSelector:
any: true
5.6.1 探测一部分(第三方域名)TCP检查
参考文档: https://github.com/prometheus/blackbox_exporter/blob/master/example.yml
#外部方式
apiVersion: monitoring.coreos.com/v1 #API版本
kind: Probe #资源类型
metadata:
labels:
app: nginx-ingress
release: prometheus-stack #内部标识
name: external-api-tcp-check #名称
namespace: default
spec:
interval: 15s
module: tcp #使用模块 与blackbox-exporter values中定义的tcp模块名保持一致,用于TCP检查
prober:
path: /probe #Blackbox Exporter 探测路径
url: prometheus-blackbox-exporter.monitoring.svc.cluster.local:9115 #Blackbox Exporter 探测URL
targets:
staticConfig:
static:
- www.baidu.com:443 #目标列表第一个目标
- www.51cto.com:443 #目标列表第二个目标
- www.google.com:443 #目标列表第三个目标
5.6.2 探测一部分(第三方域名)HTTP检查
#外部方式
apiVersion: monitoring.coreos.com/v1 #API版本
kind: Probe #资源类型
metadata:
labels:
app: external-api-http-check
release: prometheus-stack #内部标识
name: external-api-http-check #名称
namespace: default
spec:
interval: 15s
module: http_2xx #使用模块 用于HTTP 2xx检查
prober:
path: /probe #Blackbox Exporter 探测路径
scheme: http
url: prometheus-blackbox-exporter.monitoring.svc.cluster.local:9115 #Blackbox Exporter 探测URL
targets:
staticConfig:
static:
- https://www.baidu.com #目标列表第一个目标
- https://www.51cto.com #目标列表第二个目标
- https://www.google.com #目标列表第三个目标
6.配置prometheus告警规则
6.1 配置pod 告警规则
6.1.1 pod 非运行状态/重启次数告警
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
release: prometheus-stack
name: prometheus-kube-podnotrunning-rele
namespace: monitoring
spec:
groups:
- name: PodNotRunning
rules:
- alert: PodNotRunningDown
annotations:
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} is not running: {{ $value }}.'
summary: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} not running.'
expr: |-
max_over_time(kube_pod_status_phase{phase!="Running"}[5m]) > 0
for: 1m
labels:
severity: critical
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
release: prometheus-stack
name: prometheus-pod-restart-exceeds-threshold
namespace: monitoring
spec:
groups:
- name: PodRestartExceedsThreshold
rules:
- alert: PodRestartExceedsThreshold
annotations:
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has exceeded the threshold of restart: {{ $value }}"
summary: "Pod restarts exceeded threshold"
expr: |-
kube_pod_container_status_restarts_total
> 0
for: 1m
labels:
severity: critical
6.1.2 验证规则是否生效
7.配置Alertmanager 告警联动
7.1 告警全局配置
---
apiVersion: v1
kind: Secret
metadata:
name: alertmanager-global-config
namespace: monitoring
type: Opaque
stringData:
alertmanager.yaml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
timezone: CST
route:
group_by: ['instance']
group_wait: 10m
group_interval: 10s
repeat_interval: 10m
receiver: 'default-receiver'
receivers:
- name: default-receiver
webhook_configs:
- send_resolved: true
url: "http://alert-center.k8s.local/prometheusalert?type=fs&tpl=prometheus-fs&fsurl=xxxx"
注意
url: "XXXX" #对接配置PrometheusAlert 调用地址
7.2 引用全局配置
kubectl patch alertmanager prometheus-stack-kube-prom-alertmanager \
-n monitoring --type='merge' \
-p '{"spec":{"configSecret":"alertmanager-global-config"}}'
注意
configSecret:引用上一步创建的全局配置
7.3 增加新告警通知规则
---
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
name: alert1
spec:
route:
receiver: receiver1
receivers:
- webhookConfigs:
- url: "xxx" #webhook 另外一个通道
name: receiver1
#需要执行全局配置文件和新通知相关yaml 文件.
7.4 验证alertmanager触发告警
需要提前触发监控
8.联动PrometheusAlert
8.1 K8S部署PrometheusAlert
https://github.com/feiyu563/PrometheusAlert #官网
#ingress PrometheusAlert访问域名
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
annotations:
meta.helm.sh/release-name: prometheus-stack
meta.helm.sh/release-namespace: monitoring
name: prometheus-alert-center
namespace: monitoring
spec:
ingressClassName: nginx
rules:
- host: alert-center.k8s.local
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: prometheus-alert-center
port:
number: 8080
部署文件
kubectl apply -f https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/example/kubernetes/PrometheusAlert-Deployment.yaml
配置变更
kubectl get cm prometheus-alert-center-conf -n monitoring -o yaml
必须开启飞书告警
open-feishu=1
fsurl="飞书webhook地址"
#是否开启feishuapp告警通道,可同时开启多个通道,0为关闭,1为开启
8.2 访问验证(PrometheusAlert)
9.验证webhook告警
9.1 模拟pod 运行失败
9.2 告警群验证
需要自行配置webhook 地址 申请webhook自行配置申请.
9.3 查看grafana (黑盒监控-面板)
https://grafana.com/grafana/dashboards/7587-prometheus-blackbox-exporter/ #id 7587
黑盒监控面板