使用prometheus来监控ingress-nginx

ingress-nginx配置了metrics

通过标签去查看ingress的pod

root@guoguo-M5-Pro:/apps/k8s/prometheus# kubectl get pods -n kube-system -l app=ingress-nginx -owide
NAME                             READY   STATUS    RESTARTS   AGE   IP              NODE            
nginx-ingress-controller-ds6mq   1/1     Running   7          69d   172.17.20.115   172.17.20.115   
nginx-ingress-controller-lxt8s   1/1     Running   8          69d   172.17.20.114   172.17.20.114   
#分别部署在114 和115机器上 我们要监控这两个pod 的ingress-nginx

过滤一下暴露的 metrics 端口号

root@guoguo-M5-Pro:/apps/k8s/prometheus# kubectl get pods -n kube-system nginx-ingress-controller-ds6mq -o yaml | egrep -A2 "port|metrics"
    prometheus.io/port: "10254"
    prometheus.io/scrape: "true"
  creationTimestamp: "2024-06-16T10:46:41Z"
--
    - --enable-metrics=false
    - --v=2
    env:
--
        port: 10254
        scheme: HTTP
      initialDelaySeconds: 10
--
    ports:
    - containerPort: 80
      hostPort: 80
--
        port: 10254
        scheme: HTTP
      initialDelaySeconds: 10
--
      sysctl -w net.ipv4.ip_local_port_range="1024 65535"
      sysctl -w kernel.core_uses_pid=0
      fi
[root@k8s-master1 ~]# kubectl  get svc -n kube-system nginx-ingress-lb
NAME               TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)                    AGE
nginx-ingress-lb   ClusterIP   10.101.79.97   <none>        80/TCP,443/TCP,10254/TCP   69d

看下metrics指标

root@guoguo-M5-Pro:/apps/k8s/prometheus# curl 172.17.20.114:10254/metrics | tail -5f
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  6024    0  6024    0     0  2609k      0 --:--:-- --:--:-- --:--:-- 2941k
# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code.
# TYPE promhttp_metric_handler_requests_total counter
promhttp_metric_handler_requests_total{code="200"} 10
promhttp_metric_handler_requests_total{code="500"} 0
promhttp_metric_handler_requests_total{code="503"} 0
root@guoguo-M5-Pro:/apps/k8s/prometheus# curl 172.17.20.115:10254/metrics | tail -5f
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  6029    0  6029    0     0  3401k      0 --:--:-- --:--:-- --:--:-- 5887k
# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code.
# TYPE promhttp_metric_handler_requests_total counter
promhttp_metric_handler_requests_total{code="200"} 1
promhttp_metric_handler_requests_total{code="500"} 0
promhttp_metric_handler_requests_total{code="503"} 0

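可以看到数据(注意:上面 pod 的启动参数里是 `--enable-metrics=false`,该参数控制是否采集 NGINX 指标;如果需要 nginx_ingress_controller_* 这类指标,要把它改为 true,否则 /metrics 里只有 promhttp 等基础指标)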

创建个ServiceMonitor(编辑好下面的 yaml 之后,执行 kubectl apply -f ingress-nginx-prometheus.yaml 使其生效)

ServiceMonitor 是Prometheus Operator提供的一种自定义资源(Custom Resource, CR),用于定义Prometheus监控服务发现的目标。它允许用户指定哪些Kubernetes服务(Service)和Pod的监控数据应该被Prometheus抓取,以及抓取数据的频率、路径等配置。

root@guoguo-M5-Pro:/apps/k8s/prometheus# vim ingress-nginx-prometheus.yaml
---
# ServiceMonitor telling the Prometheus Operator which Service endpoints to scrape
# for ingress-nginx.
# NOTE(review): the kubectl commands in this write-up query the kube-system namespace,
# while this manifest lives in and selects the ingress-nginx namespace — confirm which
# namespace the target Service actually runs in.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app: ingress-nginx
  name: ingress-nginx-monitor   # monitor name; it appears in the target name on the Prometheus UI
  namespace: ingress-nginx
spec:
  # Namespaces in which this ServiceMonitor looks for Services.
  namespaceSelector:
    matchNames: [ingress-nginx]   # only the ingress-nginx namespace
  # Must match the labels of the Service that is to be monitored.
  selector:
    matchLabels: {app: ingress-nginx}
  endpoints:
    - interval: 30s     # scrape every 30 seconds
      path: /metrics    # HTTP path on which the target exposes metrics
      port: metrics     # NAME of the Service port that exposes metrics (not the number)
  # jobLabel: app   # optional: label used to group targets into one Prometheus job
root@guoguo-M5-Pro:/apps/k8s/prometheus/servicemonitor# kubectl get svc  -n kube-system nginx-ingress-lb -oyaml
.....
.....
  ports:
  - name: http
    port: 80
    protocol: TCP
    targetPort: 80
  - name: https
    port: 443
    protocol: TCP
    targetPort: 443
  - name: metrics    #这里就是给这个端口号取个名字,上面 ServiceMonitor 的 spec.endpoints[].port 引用的就是这个名字
    port: 10254
    protocol: TCP
    targetPort: 10254
  selector:
    app: ingress-nginx
  sessionAffinity: None
  type: ClusterIP
status:
  loadBalancer: {}

新创建的prometheus 都会遇到一个权限的报错

现在 Prometheus UI 页面上是看不到监控项的

查看报错

root@guoguo-M5-Pro:/apps/k8s/prometheus/servicemonitor# kubectl -n monitoring logs prometheus-k8s-0 -c prometheus 
......
......
level=error ts=2024-08-25T07:07:41.107Z caller=klog.go:96 component=k8s_client_runtime func=ErrorDepth msg="pkg/mod/k8s.io/client-go@v0.20.5/tools/cache/reflector.go:167: Failed to watch *v1.Endpoints: failed to list *v1.Endpoints: endpoints is forbidden: User \"system:serviceaccount:monitoring:prometheus-k8s\" cannot list resource \"endpoints\" in API group \"\" in the namespace \"ingress-nginx\""

# 当看到forbidden 就是权限问题
root@guoguo-M5-Pro:/apps/k8s/prometheus/servicemonitor# kubectl -n monitoring logs prometheus-k8s-1 -c prometheus 
......
......
level=error ts=2024-08-25T07:09:33.891Z caller=klog.go:96 component=k8s_client_runtime func=ErrorDepth msg="pkg/mod/k8s.io/client-go@v0.20.5/tools/cache/reflector.go:167: Failed to watch *v1.Endpoints: failed to list *v1.Endpoints: endpoints is forbidden: User \"system:serviceaccount:monitoring:prometheus-k8s\" cannot list resource \"endpoints\" in API group \"\" in the namespace \"ingress-nginx\""

去修改prometheus 的集群角色clusterrole

root@guoguo-M5-Pro:/apps/k8s/prometheus/servicemonitor# kubectl edit clusterrole prometheus-k8s
......
......  #rules 部分改为下面  权限
rules:
  # Read access to the core-API ("") objects, cluster-wide. Listing/watching
  # endpoints is exactly what the "endpoints is forbidden" error complained about.
  - apiGroups: [""]
    resources: [nodes, services, endpoints, pods, nodes/proxy]
    verbs: [get, list, watch]
  # Read-only (get) access to configmaps and node metrics.
  - apiGroups: [""]
    resources: [configmaps, nodes/metrics]
    verbs: [get]
  # Permission to GET the non-resource /metrics URL.
  - nonResourceURLs: ["/metrics"]
    verbs: [get]

登录prometheus ui网站 就可以看到了

serviceMonitor/ingress-nginx/ingress-nginx-monitor/0 (2/2 up)

yaml配置文件也修改下

vim kube-prometheus/manifests/prometheus-clusterRole.yaml

改为

# ClusterRole prometheus-k8s: cluster-wide read permissions for Prometheus, so that
# targets in namespaces such as ingress-nginx can be listed and watched.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus-k8s
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 2.26.0
rules:
  # Read access to the core-API ("") objects in every namespace.
  - apiGroups: [""]
    resources: [nodes, services, endpoints, pods, nodes/proxy]
    verbs: [get, list, watch]
  # Read-only (get) access to configmaps and node metrics.
  - apiGroups: [""]
    resources: [configmaps, nodes/metrics]
    verbs: [get]
  # Permission to GET the non-resource /metrics URL.
  - nonResourceURLs: ["/metrics"]
    verbs: [get]

这样就完成了