文章目录

一、健康检查(服务探针)

对线上业务来说,保证服务的正常稳定是重中之重,对故障服务的及时处理避免影响业务以及快速恢复一直是开发运维的难点。Kubernetes提供了健康检查服务,对于检测到故障服务会被及时自动下线,以及通过重启服务的方式使服务自动恢复。

1、检查pod中容器是否能够正常启动

2、检查pod中容器是否能够正常对外提供服务

1、怎样保证pod中的容器正常启动?
2、怎样保证pod中容器能够正常对外提供服务?
3、只有容器启动了并且能够正常对外提供服务了,才能放到负载均衡上供给用户访问
4、Kubernetes提供了健康检查服务,对于检测到故障服务会被及时自动下线,以及通过重启服务的方式使服务自动恢复。

1、存活性探测 (LivenessProbe)

1、用于判断容器是否存活

2、处理的方式:如果判断失败,则由kubelet杀掉该容器,并按照重启策略重启容器

# 1、pod中所有容器的status=Running时,Pod的状态才会是Running状态。
# 2、判断容器是否存活,即Pod是否为running状态,如果LivenessProbe探针探测到容器不健康,则kubelet将kill掉容器,并根据容器的重启策略判断按照哪种方式重启,如果一个容器不包含LivenessProbe探针,则Kubelet认为容器的LivenessProbe探针的返回值永远成功。
# 3、当存活性检查检测失败的时候,kubelet会删除容器,重新启动一个新的容器,继续检查。

存活性探测支持的方法有三种:ExecAction,TCPSocketAction,HTTPGetAction。

1、存活性探测实例
# 1.ExecAction
[root@k8s-m-01 k8s]# vim livenessProbe.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
name: test-deployment
spec:
replicas: 1
selector:
matchLabels:
app: deployment
template:
metadata:
labels:
app: deployment
spec:
containers:
- name: nginx
image: alvinos/django:v1
livenessProbe: #存活性检查
exec:
command:
- "/bin/sh"
- "-c"
- "cat /root/test/manage.py" #修改访问不存在的文件,即探测失败
---
kind: Service
apiVersion: v1
metadata:
name: test-svc
namespace: default
spec:
ports:
- port: 80
targetPort: 80
name: http
selector:
app: deployment
# 查看存活性探测成功
[root@k8s-m-01 k8s]# kubectl get pod
NAME READY STATUS RESTARTS AGE
test-deployment-cdc445dd8-vjfsh 1/1 Running 0 58s
# 查看存活性探测失败
[root@k8s-m-01 k8s]# kubectl get pod -w
NAME READY STATUS RESTARTS AGE
test-deployment-cdc445dd8-vjfsh 1/1 Running 1 76s

# 2.HTTPGetAction
[root@k8s-m-01 k8s]# vim livenessProbe.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
name: test-deployment
spec:
replicas: 1
selector:
matchLabels:
app: deployment
template:
metadata:
labels:
app: deployment
spec:
containers:
- name: nginx
image: alvinos/django:v1
livenessProbe: #存活性检查
httpGet:
port: 80 #修改访问不存在的端口,即探测失败
path: /index
---
kind: Service
apiVersion: v1
metadata:
name: test-svc
namespace: default
spec:
ports:
- port: 80
targetPort: 80
name: http
selector:
app: deployment
# 查看存活性探测成功
[root@k8s-m-01 k8s]# kubectl get pod
NAME READY STATUS RESTARTS AGE
test-deployment-cdc445dd8-vjfsh 1/1 Running 0 58s
# 查看存活性探测失败
[root@k8s-m-01 k8s]# kubectl get pod -w
NAME READY STATUS RESTARTS AGE
test-deployment-cdc445dd8-vjfsh 1/1 Running 1 76s

# 3.TcpSocket 相当于 telnet(尝试与指定端口建立TCP连接,能建立连接即探测成功)
[root@k8s-m-01 k8s]# vim livenessProbe.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
name: test-deployment
spec:
replicas: 1
selector:
matchLabels:
app: deployment
template:
metadata:
labels:
app: deployment
spec:
containers:
- name: nginx
image: alvinos/django:v1
livenessProbe: #存活性检查
tcpSocket:
port: 80 #修改访问不存在的端口,即探测失败
---
kind: Service
apiVersion: v1
metadata:
name: test-svc
namespace: default
spec:
ports:
- port: 80
targetPort: 80
name: http
selector:
app: deployment
# 查看存活性探测成功
[root@k8s-m-01 k8s]# kubectl get pod
NAME READY STATUS RESTARTS AGE
test-deployment-cdc445dd8-vjfsh 1/1 Running 0 58s
# 查看存活性探测失败
[root@k8s-m-01 k8s]# kubectl get pod -w
NAME READY STATUS RESTARTS AGE
test-deployment-cdc445dd8-vjfsh 1/1 Running 1 76s

2.健康检查参数 (检查探测参数)

# 1.检查失败最少次数,默认:3次
delay=10s : 探测延时时间initialDelaySeconds
timeout=1s :探测的超时时间
period=10s :探测的频率
success=1 :成功多少次才算成功
failure=3 :失败多少次才算失败

failureThreshold: 最少连续几次探测失败的次数,满足该次数则认为fail
initialDelaySeconds:容器启动之后开始进行存活性探测的秒数。不填立即进行
periodSeconds: 执行探测的频率(秒)。默认为10秒。最小值为1。
successThreshold: 探测失败后,最少连续探测成功多少次才被认定为成功,满足该次数则认为success。(但是如果是liveness则必须是 1。最小值是 1。)
timeoutSeconds: 每次执行探测的超时时间,默认1秒,最小1秒。

3、就绪性探测

1、用于判断容器是否正常提供服务

2、处理方式:探测失败,下线负载均衡(endpoints —> NotReadyAddresses)

3、存活性检查和就绪性检查是否可以同时存在呢?可以

# 如果ReadinessProbe探测失败,则容器的Ready将设置为False,控制器将此Pod的Endpoint从对应的service的Endpoint列表中移除,从此不再将任何请求调度到此Pod上,直到下次探测成功。
1、就绪性探测案例
# 1、编写就绪性探测文件
[root@k8s-m-01 k8s]# vim livenessProbe.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
name: test-deployment
spec:
replicas: 1
selector:
matchLabels:
app: deployment
template:
metadata:
labels:
app: deployment
spec:
containers:
- name: nginx
image: alvinos/django:v1
livenessProbe:
exec:
command:
- "/bin/sh"
- "-c"
- "cat /root/test/manage.py"
readinessProbe: #就绪性探测
tcpSocket:
port: 80 #修改访问不存在的端口,即探测失败
# 2、查看svc
[root@k8s-m-01 k8s]# kubectl describe svc test-svc
Name: test-svc
Namespace: default
Labels: <none>
Annotations: <none>
Selector: app=deployment
Type: ClusterIP
IP Family Policy: SingleStack
IP Families: IPv4
IP: 10.97.199.150
IPs: 10.97.199.150
Port: http 80/TCP
TargetPort: 80/TCP
Endpoints: 10.244.1.68:80,10.244.1.69:80,10.244.2.66:80
Session Affinity: None
Events: <none>
2、存活性、就绪性探测结合案例
# 1、编写配置清单
[root@k8s-m-01 k8s]# vim livenessProbe.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
name: test-deployment
spec:
replicas: 1
selector:
matchLabels:
app: deployment
template:
metadata:
labels:
app: deployment
spec:
containers:
- name: nginx
image: alvinos/django:v1
livenessProbe:
exec:
command:
- "/bin/sh"
- "-c"
- "cat /root/test/manage.py"
initialDelaySeconds: 0
periodSeconds: 3 #每3秒探测一次;可能因网络原因偶发失败,连续失败次数由failureThreshold控制
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
readinessProbe: #就绪性探测
tcpSocket:
port: 80 #修改访问不存在的端口,即探测失败
initialDelaySeconds: 30 # java大项目建议60s
periodSeconds: 1 # 如果失败,立即踢出负载均衡;为了避免影响用户体验,所以设置成1s
timeoutSeconds: 1
successThreshold: 3
failureThreshold: 1
# 2、查看pod
[root@k8s-m-01 k8s]# kubectl get pod
NAME READY STATUS RESTARTS AGE
test-deployment-5d765fb67d-xksgc 1/1 Running 0 48s
# 3、具体查看健康检查的信息
[root@k8s-m-01 k8s]# kubectl describe pod test-deployment-5d765fb67d-xksgc
...
Restart Count: 0
Liveness: exec [/bin/sh -c cat /root/test/manage.py] delay=0s timeout=1s period=3s #success=1 #failure=3
Readiness: tcp-socket :80 delay=30s timeout=1s period=1s #success=3 #failure=1
# 4、查看负载均衡
[root@k8s-m-01 k8s]# while true;do curl 10.97.199.150/index;sleep 1;echo;done
主机名:test-deployment-5d765fb67d-xksgc,版本:v1
主机名:test-deployment-5d765fb67d-48slj,版本:v1
主机名:test-deployment-5d765fb67d-729r9,版本:v1
主机名:test-deployment-5d765fb67d-xksgc,版本:v1
主机名:test-deployment-5d765fb67d-48slj,版本:v1
主机名:test-deployment-5d765fb67d-729r9,版本:v1
# 5、修改版本与service类型
[root@k8s-m-01 k8s]# kubectl edit deployments.apps test-deployment
- image: alvinos/django:v2
replicas: 1
[root@k8s-m-01 k8s]# kubectl edit svc test-svc
type: NodePort
# 6、重新查看结果与svc
[root@k8s-m-01 k8s]# while true;do curl 10.97.199.150/index;sleep 1;echo;done
主机名:test-deployment-5d765fb67d-729r9,版本:v1
主机名:test-deployment-d75444fb5-gpwk4,版本:v2
主机名:test-deployment-d75444fb5-gpwk4,版本:v2
[root@k8s-m-01 k8s]# kubectl get svc
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kubernetes ClusterIP 10.96.0.1 <none> 443/TCP 8d
test-svc NodePort 10.97.199.150 <none> 80:31881/TCP 54m
# 7、IP访问
192.168.15.111:31881
# 注 虽然版本和副本数改变了,但是没有报错

linux12k8s -->11健康检查和回调钩子_django

4.总结

1、存活性检查 : 容器是否正常启动,探测失败,立即删除容器
2、就绪性检查 : 容器是否能够正常提供服务,探测失败,立即移除负载均衡

# 存活性检查使用exec,就绪性检查使用tcpSocket,httpGet

二、回调HOOK

Pod Hook是由kubelet发起的,当容器创建(启动)之后或者容器中的进程终止之前运行,这是包含在容器的生命周期之中。
  • PostStart:这个钩子在容器创建后立即执行。但是,并不能保证钩子将在容器ENTRYPOINT之前运行,因为没有参数传递给处理程序。主要用于资源部署、环境准备等。不过需要注意的是如果钩子花费太长时间以至于不能运行或者挂起, 容器将不能达到running状态。
  • PreStop:这个钩子在容器终止之前立即被调用。它是阻塞的,意味着它是同步的, 所以它必须在删除容器的调用发出之前完成。主要用于优雅关闭应用程序、通知其他系统等。如果钩子在执行期间挂起, Pod阶段将停留在running状态并且永不会达到failed状态。
如果PostStart或者PreStop钩子失败,它会杀死容器。
所以我们应该让钩子函数尽可能的轻量。当然有些情况下,长时间运行命令是合理的, 比如在停止容器之前预先保存状态 。

1、回调钩子

1、实例一
# 1、编写yaml文件
[root@k8s-m-01 k8s]# vim livenessProbe.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
name: test-deployment
spec:
replicas: 1
selector:
matchLabels:
app: deployment
template:
metadata:
labels:
app: deployment
spec:
containers:
- name: nginx
image: alvinos/django:v1
lifecycle:
postStart:
exec:
command:
- "/bin/sh"
- "-c"
- "touch /root/1.txt"
preStop:
exec:
command:
- "/bin/sh"
- "-c"
- "echo '123' > /root/1.txt"
livenessProbe:
exec:
command:
- "/bin/sh"
- "-c"
- "cat /root/test/manage.py"
initialDelaySeconds: 0
periodSeconds: 3 #每3秒探测一次;可能因网络原因偶发失败,连续失败次数由failureThreshold控制
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
readinessProbe: #就绪性探测
tcpSocket:
port: 80 #修改访问不存在的端口,即探测失败
initialDelaySeconds: 30 # java大项目建议60s
periodSeconds: 1 # 如果失败,立即踢出负载均衡;为了避免影响用户体验,所以设置成1s
timeoutSeconds: 1
successThreshold: 3
failureThreshold: 1
# 2、查看pod
[root@k8s-m-01 k8s]# kubectl get pod -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
test-deployment-9db95f48d-p7c5d 1/1 Running 0 40s 10.244.2.70 k8s-n-02 <none> <none>
# 3、进入容器查看
[root@k8s-m-01 k8s]# kubectl exec -it test-deployment-9db95f48d-p7c5d -- bash
[root@test-deployment-9db95f48d-p7c5d test]# cd /root/
[root@test-deployment-9db95f48d-p7c5d ~]# ls
1.txt anaconda-ks.cfg test #默认创建1.txt
# 4、退出
[root@k8s-m-01 k8s]# kubectl delete -f livenessProbe.yaml
deployment.apps "test-deployment" deleted
[root@test-deployment-9db95f48d-p7c5d ~]# ls
1.txt anaconda-ks.cfg test # 默认在1.txt打印123
2、数据卷存储挂载 (案例二)
# 1.poststart:启动回调钩子,是在容器启动之后立即执行
apiVersion: apps/v1
kind: Deployment
metadata:
name: lifecycle
spec:
selector:
matchLabels:
app: cycle
template:
metadata:
labels:
app: cycle
spec:
nodeName: gdx2
containers:
- name: nginx
image: nginx
volumeMounts: #定义存储卷
- mountPath: /usr/share/nginx/html #容器内挂载路径
name: lifecycle-data
lifecycle: #生命周期
postStart: #启动回调钩子
exec:
command: #执行命令
- "/bin/bash"
- "-c"
- "echo 'This is Lifecycle' > /usr/share/nginx/html/index.html"
preStop: #结束回调钩子
exec:
command: #执行命令
- "/bin/bash"
- "-c"
- "echo 'This is Lifecycle preStop' > /usr/share/nginx/html/index.html"
volumes: #定义宿主机存储卷
- name: lifecycle-data
hostPath: #存储卷方式
path: /opt/discuz/data #宿主机挂载路径

# 2.验证启动回调钩子
[root@k8s-master1 ~]# kubectl get pods -w -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
lifecycle-94b467df6-2grpm 1/1 Running 0 10m 10.244.1.35 gdx2 <none> <none>
[root@k8s-master1 ~]# curl 10.244.1.35
This is Lifecycle

# 3.验证结束回调钩子
[root@k8s-master1 ~]# kubectl delete deployments.apps lifecycle
deployment.apps "lifecycle" deleted
[root@k8s-node1 discuz]# cat data/index.html
This is Lifecycle preStop