代码复制过来可能有问题, 详情看码云仓库: https://gitee.com/love-docker/k8s/tree/master/v1.11/monitor/%E5%91%8A%E8%AD%A6/ #(1)修改prometheus的configmap文件, 添加告警规则

1)修改prometheus的configmap, 配置告警规则

  # NOTE: this excerpt spans two nesting levels of the ConfigMap (see section 3):
  # "rule_files" goes inside the prometheus.yml block scalar, while "rules.yml"
  # is a sibling key of prometheus.yml under the ConfigMap's data: section.
  rule_files:
  - /etc/prometheus/rules.yml
  rules.yml: |
    groups:
    - name: noah_pod.rules
      rules:
      - alert: PodMemUsage
        # "!= +Inf" drops pods with no memory limit (a 0 limit makes the ratio +Inf)
        expr: container_memory_usage_bytes{container_name!=""} / container_spec_memory_limit_bytes{container_name!=""} * 100 != +Inf > 60
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.name}}: Pod High Mem usage detected"
          description: "{{$labels.name}}: Pod Mem is above 60% ,(current value is: {{ $value }})"
      - alert: PodCpuUsage
        expr: sum by (pod_name) (rate(container_cpu_usage_seconds_total{image!=""}[1m])) * 100 > 70
        for: 2m
        labels:
          severity: warning
        annotations:
          # only pod_name survives the "sum by (pod_name)" aggregation,
          # so $labels.name would render empty here
          summary: "{{$labels.pod_name}}: Pod High CPU usage detected"
          description: "{{$labels.pod_name}}: Pod CPU is above 70% ,(current value is: {{ $value }})"
      # Alert names must match [a-zA-Z_:][a-zA-Z0-9_:]* — "NetI/O_RX" is rejected
      # by Prometheus, hence NetIO_RX / NetIO_TX.
      - alert: NetIO_RX
        # rate() already yields bytes/second; /1024/1024*8 converts to Mbit/s.
        # (The original extra "/60" under-reported the rate by 60x, and
        # sort_desc() has no effect on alert evaluation.)
        expr: sum by (kubernetes_pod_name) (rate(container_network_receive_bytes_total{name!=""}[1m])) / 1024 / 1024 * 8 > 500
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.kubernetes_pod_name}}: Pod High NetI/O_RX detected"
          description: "{{$labels.kubernetes_pod_name}}: Pod NetI/O_RX is more than 500Mbps ,(current value is: {{ $value }})"
      - alert: NetIO_TX
        expr: sum by (kubernetes_pod_name) (rate(container_network_transmit_bytes_total{name!=""}[1m])) / 1024 / 1024 * 8 > 500
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.kubernetes_pod_name}}: Pod High NetI/O_TX detected"
          description: "{{$labels.kubernetes_pod_name}}: Pod NetI/O_TX is more than 500Mbps ,(current value is: {{ $value }})"

2)指定alertmanagers地址信息

		alerting:
			alertmanagers:
				- static_configs:
					- targets: ["alertmanager:9093"]

3)完整的configmap配置如下

	apiVersion: v1
kind: ConfigMap
metadata:
	name: prometheus-config
	namespace: kube-system
data:
	prometheus.yml: |
		global:
			scrape_interval:     15s
			evaluation_interval: 15s
		rule_files:
		- /etc/prometheus/rules.yml
		alerting:
			alertmanagers:
				- static_configs:
					#- targets: ["alertmanager:9093"]
					- targets: ["192.168.1.82:9093"]
					#- targets: ["192.168.1.51:9093"]
		scrape_configs:

		- job_name: 'kubernetes-apiservers'
			kubernetes_sd_configs:
			- role: endpoints
			scheme: https
			tls_config:
				ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
			bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
			relabel_configs:
			- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
				action: keep
				regex: default;kubernetes;https

		- job_name: 'kubernetes-nodes'
			kubernetes_sd_configs:
			- role: node
			scheme: https
			tls_config:
				ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
			bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
			relabel_configs:
			- action: labelmap
				regex: __meta_kubernetes_node_label_(.+)
			- target_label: __address__
				replacement: kubernetes.default.svc:443
			- source_labels: [__meta_kubernetes_node_name]
				regex: (.+)
				target_label: __metrics_path__
				replacement: /api/v1/nodes/${1}/proxy/metrics

		- job_name: 'kubernetes-cadvisor'
			kubernetes_sd_configs:
			- role: node
			scheme: https
			tls_config:
				ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
			bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
			relabel_configs:
			- action: labelmap
				regex: __meta_kubernetes_node_label_(.+)
			- target_label: __address__
				replacement: kubernetes.default.svc:443
			- source_labels: [__meta_kubernetes_node_name]
				regex: (.+)
				target_label: __metrics_path__
				replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

		- job_name: 'kubernetes-service-endpoints'
			kubernetes_sd_configs:
			- role: endpoints
			relabel_configs:
			- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
				action: keep
				regex: true
			- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
				action: replace
				target_label: __scheme__
				regex: (https?)
			- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
				action: replace
				target_label: __metrics_path__
				regex: (.+)
			- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
				action: replace
				target_label: __address__
				regex: ([^:]+)(?::\d+)?;(\d+)
				replacement: $1:$2
			- action: labelmap
				regex: __meta_kubernetes_service_label_(.+)
			- source_labels: [__meta_kubernetes_namespace]
				action: replace
				target_label: kubernetes_namespace
			- source_labels: [__meta_kubernetes_service_name]
				action: replace
				target_label: kubernetes_name

		- job_name: 'kubernetes-services'
			kubernetes_sd_configs:
			- role: service
			metrics_path: /probe
			params:
				module: [http_2xx]
			relabel_configs:
			- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
				action: keep
				regex: true
			- source_labels: [__address__]
				target_label: __param_target
			- target_label: __address__
				replacement: blackbox-exporter.example.com:9115
			- source_labels: [__param_target]
				target_label: instance
			- action: labelmap
				regex: __meta_kubernetes_service_label_(.+)
			- source_labels: [__meta_kubernetes_namespace]
				target_label: kubernetes_namespace
			- source_labels: [__meta_kubernetes_service_name]
				target_label: kubernetes_name

		- job_name: 'kubernetes-ingresses'
			kubernetes_sd_configs:
			- role: ingress
			relabel_configs:
			- source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
				action: keep
				regex: true
			- source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
				regex: (.+);(.+);(.+)
				replacement: ${1}://${2}${3}
				target_label: __param_target
			- target_label: __address__
				replacement: blackbox-exporter.example.com:9115
			- source_labels: [__param_target]
				target_label: instance
			- action: labelmap
				regex: __meta_kubernetes_ingress_label_(.+)
			- source_labels: [__meta_kubernetes_namespace]
				target_label: kubernetes_namespace
			- source_labels: [__meta_kubernetes_ingress_name]
				target_label: kubernetes_name

		- job_name: 'kubernetes-pods'
			kubernetes_sd_configs:
			- role: pod
			relabel_configs:
			- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
				action: keep
				regex: true
			- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
				action: replace
				target_label: __metrics_path__
				regex: (.+)
			- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
				action: replace
				regex: ([^:]+)(?::\d+)?;(\d+)
				replacement: $1:$2
				target_label: __address__
			- action: labelmap
				regex: __meta_kubernetes_pod_label_(.+)
			- source_labels: [__meta_kubernetes_namespace]
				action: replace
				target_label: kubernetes_namespace
			- source_labels: [__meta_kubernetes_pod_name]
				action: replace
				target_label: kubernetes_pod_name


	rules.yml: |+
		groups:
		- name: noah_pod.rules
			rules:
			- alert: PodMemUsage
				expr: container_memory_usage_bytes{container_name!=""} / container_spec_memory_limit_bytes{container_name!=""}  *100 != +Inf > 60
				for: 2m
				labels:
					severity: warning
				annotations:
					summary: "{{$labels.name}}: Pod High Mem usage detected"
					description: "{{$labels.name}}: Pod Mem is above 80% ,(current value is: {{ $value }})"
			- alert: PodCpuUsage
				expr: sum by (pod_name)( rate(container_cpu_usage_seconds_total{image!=""}[1m] ) ) * 100 > 70
				for: 2m
				labels:
					severity: warning
				annotations:
					summary: "{{$labels.name}}: Pod High CPU usage detected"
					description: "{{$labels.name}}: Pod CPU is above 80% ,(current value is: {{ $value }})"
			- alert: NetI/O_RX
				expr: sort_desc(sum by (kubernetes_pod_name) (rate (container_network_receive_bytes_total{name!=""}[1m]) )) /1024 /1024 /60 * 8 > 500
				for: 2m
				labels:
					severity: warning
				annotations:
					summary: "{{$labels.name}}: Pod High NetI/O_RX detected"
					description: "{{$labels.name}}: Pod NetI/O_RX is more than 500Mbps ,(current value is: {{ $value }})"       
			- alert: NetI/O_TX
				expr: sort_desc(sum by (kubernetes_pod_name) (rate (container_network_transmit_bytes_total{name!=""}[1m]) )) /1024 /1024 /60 * 8 > 500
				for: 2m
				labels:
					severity: warning
				annotations:
					summary: "{{$labels.name}}: Pod High NetI/O_TX detected"
					description: "{{$labels.name}}: Pod NetI/O_TX is more than 100Mbps ,(current value is: {{ $value }})" 

4)登录prometheus后台验证

#(2)创建alertmanager组件, 用于发送告警

1)创建告警模板文件

# cat alertmanager-templates.yaml 
# ConfigMap carrying the Go-template used by Alertmanager's WeChat receiver;
# mounted at /etc/alertmanager-templates by the Deployment below.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-templates
  namespace: kube-system
data:
  wechat.tmpl: |
    {{ define "wechat.default.message" }}
    {{ range .Alerts }}
    ========start==========
    告警程序:prometheus_alert
    告警级别:{{ .Labels.severity }}
    告警类型:{{ .Labels.alertname }}
    故障主机: {{ .Labels.instance }}
    告警主题: {{ .Annotations.summary }}
    告警详情: {{ .Annotations.description }}
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
    ========end==========
    {{ end }}
    {{ end }}

2)创建配置文件, 注意需要提前注册好微信企业公众号

# cat configmap.yaml 
# Alertmanager configuration: routes every alert (grouped by alertname) to the
# WeChat enterprise-account receiver. Fill in the corp/agent credentials before use.
kind: ConfigMap
apiVersion: v1
metadata:
  name: alertmanager
  namespace: kube-system
data:
  config.yml: |-
    global:
      resolve_timeout: 5m

    templates:
    - '/etc/alertmanager-templates/wechat.tmpl'

    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'wechat'

    receivers:
    - name: 'wechat'
      wechat_configs:
      - corp_id: ''
        to_party: ''
        agent_id: ''
        api_secret: ''
        send_resolved: true

3)创建deployment资源

# cat deployment.yaml 
# apps/v1 replaces the deprecated extensions/v1beta1 Deployment API
# (apps/v1 is available since Kubernetes 1.9, so it works on the 1.11
# cluster this guide targets).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      name: alertmanager
      labels:
        app: alertmanager
    spec:
      containers:
      - name: alertmanager
        image: registry.cn-hangzhou.aliyuncs.com/wangfang-k8s/prometheus-alertmanager:v0.15.2
        args:
        - '--config.file=/etc/alertmanager/config.yml'
        - '--storage.path=/alertmanager'
        ports:
        - name: alertmanager
          containerPort: 9093
        volumeMounts:
        - name: config-volume
          mountPath: /etc/alertmanager
        - name: templates-volume
          mountPath: /etc/alertmanager-templates
        - name: alertmanager
          mountPath: /alertmanager
      # reuses the ServiceAccount created for the Prometheus deployment
      serviceAccountName: prometheus
      volumes:
      - name: config-volume
        configMap:
          name: alertmanager
      - name: templates-volume
        configMap:
          name: alertmanager-templates
      - name: alertmanager
        emptyDir: {}

4)创建service

# cat service.yaml 
# NodePort Service exposing Alertmanager (also scraped by Prometheus via the
# prometheus.io/scrape annotation).
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/path: '/metrics'
  labels:
    name: alertmanager
  name: alertmanager
  namespace: kube-system
spec:
  selector:
    app: alertmanager
  type: NodePort
  ports:
  - name: alertmanager
    protocol: TCP
    port: 9093
    targetPort: 9093
    # The original 39093 is outside the default NodePort range (30000-32767)
    # and is rejected by the apiserver unless --service-node-port-range was
    # widened; 30093 is in-range.
    nodePort: 30093

5)登录到alertmanager后台

收到了prometheus发送过来的告警

6)微信公众号收到了报警信息

7)演示告警 创建一个pod, 使用压测工具对其不停的压测, 使其cpu使用率过高;

kubectl run hpa-test --image=registry.cn-hangzhou.aliyuncs.com/wangfang-k8s/hpa-example:latest --requests=cpu=200m --expose --port=80 
kubectl run -i --tty load-generator --image=busybox /bin/sh
#while true; do wget -q -O- http://hpa-test.default.svc.cluster.local; done

参考文档:https://blog.csdn.net/ywq935/article/details/80818982