一、k8s上部署Redis集群

  • 本文采用nfs作为k8s动态存储
  • nfs环境可参考官网demo

1.1、部署sc

# StorageClass referenced by the Redis StatefulSet's volumeClaimTemplates.
# A dynamic provisioner is required here: `kubernetes.io/no-provisioner`
# cannot create volumes, so claims using it stay Pending until PVs are
# created by hand.
# NOTE(review): the provisioner string must equal the PROVISIONER_NAME env of
# the NFS provisioner Deployment running in the cluster. The value below is the
# upstream nfs-subdir-external-provisioner default — confirm before applying.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: infra-nfs-redis
provisioner: k8s-sigs.io/nfs-subdir-external-provisioner
# Delay volume binding/provisioning until a Pod using the claim is scheduled.
volumeBindingMode: WaitForFirstConsumer

1.2、创建redis配置文件,redis 配置文件使用 configmap 方式进行挂载

  • redis-cluster-cm.yml
  • fix-ip.sh 脚本的作用用于当 redis 集群某 pod 重建后 Pod IP 发生变化,在 /data/nodes.conf 中将新的 Pod IP 替换原 Pod IP。不然集群会出问题
apiVersion: v1
kind: ConfigMap
metadata:
  name: redis-cluster
  namespace: infra
data:
  fix-ip.sh: |
    #!/bin/sh
    # Entrypoint wrapper for the Redis container.
    # If the cluster state file already exists, replace the IPv4 address on
    # its "myself" line with the current Pod IP (taken from $POD_IP), then
    # exec the command passed as arguments.
    # Exits with status 1 if the state file exists but POD_IP is empty.
    CLUSTER_CONFIG="/data/nodes.conf"
    if [ -f "${CLUSTER_CONFIG}" ]; then
      if [ -z "${POD_IP}" ]; then
        echo "Unable to determine Pod IP address!" >&2
        exit 1
      fi
      echo "Updating my IP to ${POD_IP} in ${CLUSTER_CONFIG}"
      # -i.bak edits in place and keeps the previous content as nodes.conf.bak.
      # Only the first IPv4 address on the line containing "myself" is replaced.
      sed -i.bak -e "/myself/ s/[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}/${POD_IP}/" "${CLUSTER_CONFIG}"
    fi
    # Replace this shell with the real server process.
    exec "$@"
  redis.conf: |
    # Redis server configuration mounted at /etc/redis/redis.conf.
    bind 0.0.0.0
    protected-mode yes
    port 6379
    tcp-backlog 2048
    timeout 0
    tcp-keepalive 300
    daemonize no
    supervised no
    pidfile /var/run/redis.pid
    loglevel notice
    logfile /data/redis.log
    databases 16
    always-show-logo yes
    stop-writes-on-bgsave-error yes
    rdbcompression yes
    rdbchecksum yes
    dbfilename dump.rdb
    dir /data
    # masterauth is the password a replica sends to its master, so it must be
    # identical to requirepass below; otherwise replication cannot authenticate.
    masterauth 123456
    replica-serve-stale-data yes
    replica-read-only no
    repl-diskless-sync no
    repl-diskless-sync-delay 5
    repl-disable-tcp-nodelay no
    replica-priority 100
    # Sample password only: replace it (and masterauth) with a strong secret.
    requirepass 123456
    maxclients 32768
    #maxmemory 6g
    maxmemory-policy allkeys-lru
    lazyfree-lazy-eviction no
    lazyfree-lazy-expire no
    lazyfree-lazy-server-del no
    replica-lazy-flush no
    appendonly yes
    appendfilename "appendonly.aof"
    appendfsync everysec
    no-appendfsync-on-rewrite no
    auto-aof-rewrite-percentage 100
    auto-aof-rewrite-min-size 64mb
    aof-load-truncated yes
    aof-use-rdb-preamble yes
    lua-time-limit 5000
    cluster-enabled yes
    # Cluster state file; fix-ip.sh rewrites the "myself" address in it.
    cluster-config-file /data/nodes.conf
    cluster-node-timeout 15000
    slowlog-log-slower-than 10000
    slowlog-max-len 128
    latency-monitor-threshold 0
    notify-keyspace-events ""
    hash-max-ziplist-entries 512
    hash-max-ziplist-value 64
    list-max-ziplist-size -2
    list-compress-depth 0
    set-max-intset-entries 512
    zset-max-ziplist-entries 128
    zset-max-ziplist-value 64
    hll-sparse-max-bytes 3000
    stream-node-max-bytes 4096
    stream-node-max-entries 100
    activerehashing yes
    client-output-buffer-limit normal 0 0 0
    client-output-buffer-limit replica 256mb 64mb 60
    client-output-buffer-limit pubsub 32mb 8mb 60
    hz 10
    dynamic-hz yes
    aof-rewrite-incremental-fsync yes
    rdb-save-incremental-fsync yes

1.3、redis-cluster-sts.yml

  • 也是复用zk的标签,调度到node01、node02、master03上
# Six-replica Redis StatefulSet (intended as 3 masters + 3 replicas once the
# cluster is created with --cluster-replicas 1).
apiVersion: apps/v1
kind: StatefulSet
metadata:
  namespace: infra
  name: redis-cluster
spec:
  # Headless Service that gives each Pod a stable DNS name.
  serviceName: redis-cluster
  replicas: 6
  selector:
    matchLabels:
      app: redis-cluster
  template:
    metadata:
      labels:
        app: redis-cluster
    spec:
      affinity:
        nodeAffinity:                                      # node affinity
          requiredDuringSchedulingIgnoredDuringExecution:  # hard rule: only nodes labelled app.kubernetes.io/component=zookeeper
            nodeSelectorTerms:
            - matchExpressions:
              - key: app.kubernetes.io/component
                operator: In
                values:
                  - zookeeper
        podAntiAffinity:                                    # Pod anti-affinity
          preferredDuringSchedulingIgnoredDuringExecution:  # soft rule: prefer spreading the Redis Pods across nodes
          - weight: 1                                       # weight used when several preferred terms compete
            podAffinityTerm:
              topologyKey: kubernetes.io/hostname           # one topology domain per node
              labelSelector:
                # Must select this StatefulSet's own Pods (label app=redis-cluster),
                # otherwise the rule does not spread the Redis Pods at all.
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - redis-cluster
      containers:
      - name: redis
        image: redis:5.0.13
        ports:
        - containerPort: 6379
          name: client
        - containerPort: 16379
          name: gossip
        # fix-ip.sh patches the Pod IP in /data/nodes.conf, then execs redis-server.
        command: ["/etc/redis/fix-ip.sh", "redis-server", "/etc/redis/redis.conf"]
        env:
        # Exposes the Pod's current IP to fix-ip.sh.
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        volumeMounts:
        - name: conf
          mountPath: /etc/redis/
          readOnly: false
        - name: redis-data
          mountPath: /data
          readOnly: false
      volumes:
      - name: conf
        configMap:
          name: redis-cluster
          # Executable bit is needed because fix-ip.sh is run directly.
          defaultMode: 0755
  volumeClaimTemplates:
  # One PVC per Pod, named redis-data-<pod name> (e.g. redis-data-redis-cluster-0).
  - metadata:
      name: redis-data
    spec:
      storageClassName: infra-nfs-redis
      accessModes:
        - ReadWriteMany
      resources:
        requests:
          storage: 5Gi

1.4、创建svc

# Headless Service backing the redis-cluster StatefulSet.
apiVersion: v1
kind: Service
metadata:
  name: redis-cluster
  namespace: infra
spec:
  # No cluster IP: DNS resolves directly to the Pod addresses.
  clusterIP: None
  selector:
    app: redis-cluster
  ports:
  # Client connections.
  - name: client
    port: 6379
    targetPort: 6379
  # Cluster bus (node-to-node gossip).
  - name: gossip
    port: 16379
    targetPort: 16379

1.5、初始化集群

  • 注意: 必须使用 ip 进行初始化 redis 集群,使用域名会报如下错误
Node redis-cluster-1.redis-cluster.redis-cluster.svc.cluster.local:6379 replied with error:
ERR Invalid node address specified: redis-cluster-0.redis-cluster.redis-cluster.svc.cluster.local:6379
  • 获取 Redis 集群 6 个节点 Pod 的 ip 地址
  • 应用连接 redis 集群时使用 pod 的域名
  • svc名字.ns名字.svc.cluster.local
  • nslookup redis-cluster.infra.svc.cluster.local
# 方式一:
[root@k8s-master01 集群]# kubectl get po -n infra  -owide
NAME                                      READY   STATUS    RESTARTS   AGE     IP              NODE           NOMINATED NODE   READINESS GATES
nfs-client-provisioner-6b57f44cb6-vrhwx   1/1     Running   1          19h     10.244.195.63   k8s-master03   <none>           <none>
redis-cluster-0                           1/1     Running   0          8m26s   10.244.58.221   k8s-node02     <none>           <none>
redis-cluster-1                           1/1     Running   0          7m59s   10.244.195.13   k8s-master03   <none>           <none>
redis-cluster-2                           1/1     Running   0          7m49s   10.244.85.214   k8s-node01     <none>           <none>
redis-cluster-3                           1/1     Running   0          7m23s   10.244.58.222   k8s-node02     <none>           <none>
redis-cluster-4                           1/1     Running   0          7m19s   10.244.195.11   k8s-master03   <none>           <none>
redis-cluster-5                           1/1     Running   0          7m15s   10.244.85.212   k8s-node01     <none>           <none>

# 方式二:
kubectl run -i --tty --image busybox:1.28.4 dns-test --restart=Never --rm /bin/sh
# 应用连接 redis 集群时使用下面 pod 的域名
nslookup redis-cluster.infra.svc.cluster.local

svc名字.ns名字.svc.cluster.local
/ # nslookup redis-cluster.infra.svc.cluster.local 
Server:    10.96.0.10
Address 1: 10.96.0.10 kube-dns.kube-system.svc.cluster.local
Name:      redis-cluster.infra.svc.cluster.local
Address 1: 10.244.85.212 redis-cluster-5.redis-cluster.infra.svc.cluster.local
Address 2: 10.244.85.214 redis-cluster-2.redis-cluster.infra.svc.cluster.local
Address 3: 10.244.195.11 redis-cluster-4.redis-cluster.infra.svc.cluster.local
Address 4: 10.244.58.221 redis-cluster-0.redis-cluster.infra.svc.cluster.local
Address 5: 10.244.58.222 redis-cluster-3.redis-cluster.infra.svc.cluster.local
Address 6: 10.244.195.13 redis-cluster-1.redis-cluster.infra.svc.cluster.local
  • 创建集群
  • 替换成6个pod IP
[root@k8s-master01 集群]# kubectl exec -it pod/redis-cluster-0 -n infra -- bash
root@redis-cluster-0:/data# redis-cli -a 密码 --cluster create \
> 10.244.85.212:6379 \
> 10.244.85.214:6379 \
> 10.244.195.11:6379 \
> 10.244.58.221:6379 \
> 10.244.58.222:6379 \
> 10.244.195.13:6379 \
> --cluster-replicas 1
Warning: Using a password with '-a' or '-u' option on the command line interface may not be safe.
>>> Performing hash slots allocation on 6 nodes...
Master[0] -> Slots 0 - 5460
Master[1] -> Slots 5461 - 10922
Master[2] -> Slots 10923 - 16383
Adding replica 10.244.58.222:6379 to 10.244.85.212:6379
Adding replica 10.244.195.13:6379 to 10.244.85.214:6379
Adding replica 10.244.58.221:6379 to 10.244.195.11:6379
M: b18a9738c0f9b080c99563cc629e9d739408bc2e 10.244.85.212:6379
   slots:[0-5460] (5461 slots) master
M: dcfe4e84eb6d56c369fda3cea013e247f87f3a80 10.244.85.214:6379
   slots:[5461-10922] (5462 slots) master
M: 623e7b8734784b15d58f560e9224da8653f28789 10.244.195.11:6379
   slots:[10923-16383] (5461 slots) master
S: d5e437118b5dfadcf8884e8f71260afb580e8720 10.244.58.221:6379
   replicates 623e7b8734784b15d58f560e9224da8653f28789
S: 5392f77757ddc9b6459fc2c2ecc0f1e9adaebfb7 10.244.58.222:6379
   replicates b18a9738c0f9b080c99563cc629e9d739408bc2e
S: 4ec83ffa582159f54630be7e95033badd3f04579 10.244.195.13:6379
   replicates dcfe4e84eb6d56c369fda3cea013e247f87f3a80
Can I set the above configuration? (type 'yes' to accept): yes       # 此处输入yes
>>> Nodes configuration updated
>>> Assign a different config epoch to each node
>>> Sending CLUSTER MEET messages to join the cluster
Waiting for the cluster to join
.
>>> Performing Cluster Check (using node 10.244.85.212:6379)
M: b18a9738c0f9b080c99563cc629e9d739408bc2e 10.244.85.212:6379
   slots:[0-5460] (5461 slots) master
   1 additional replica(s)
M: dcfe4e84eb6d56c369fda3cea013e247f87f3a80 10.244.85.214:6379
   slots:[5461-10922] (5462 slots) master
   1 additional replica(s)
S: 4ec83ffa582159f54630be7e95033badd3f04579 10.244.195.13:6379
   slots: (0 slots) slave
   replicates dcfe4e84eb6d56c369fda3cea013e247f87f3a80
M: 623e7b8734784b15d58f560e9224da8653f28789 10.244.195.11:6379
   slots:[10923-16383] (5461 slots) master
   1 additional replica(s)
S: d5e437118b5dfadcf8884e8f71260afb580e8720 10.244.58.221:6379
   slots: (0 slots) slave
   replicates 623e7b8734784b15d58f560e9224da8653f28789
S: 5392f77757ddc9b6459fc2c2ecc0f1e9adaebfb7 10.244.58.222:6379
   slots: (0 slots) slave
   replicates b18a9738c0f9b080c99563cc629e9d739408bc2e
[OK] All nodes agree about slots configuration.
>>> Check for open slots...
>>> Check slots coverage...
[OK] All 16384 slots covered.

1.6、验证 Redis Cluster 集群

  • 看到以下信息说明集群部署成功
[root@k8s-master01 集群]# kubectl exec -it pod/redis-cluster-0 -n infra -- bash
root@redis-cluster-0:/data# redis-cli -h redis-cluster-1.redis-cluster.infra.svc.cluster.local -c -a '密码'

redis-cluster-1.redis-cluster.infra.svc.cluster.local:6379> cluster info
cluster_state:ok
cluster_slots_assigned:16384
cluster_slots_ok:16384
cluster_slots_pfail:0
cluster_slots_fail:0
cluster_known_nodes:6
cluster_size:3
cluster_current_epoch:6
cluster_my_epoch:2
cluster_stats_messages_ping_sent:240
cluster_stats_messages_pong_sent:231
cluster_stats_messages_meet_sent:1
cluster_stats_messages_sent:472
cluster_stats_messages_ping_received:231
cluster_stats_messages_pong_received:241
cluster_stats_messages_received:472

redis-cluster-1.redis-cluster.infra.svc.cluster.local:6379> cluster nodes
4ec83ffa582159f54630be7e95033badd3f04579 10.244.195.13:6379@16379 myself,slave dcfe4e84eb6d56c369fda3cea013e247f87f3a80 0 1664877624000 6 connected
623e7b8734784b15d58f560e9224da8653f28789 10.244.195.11:6379@16379 master - 0 1664877625000 3 connected 10923-16383
b18a9738c0f9b080c99563cc629e9d739408bc2e 10.244.85.212:6379@16379 master - 0 1664877626150 1 connected 0-5460
dcfe4e84eb6d56c369fda3cea013e247f87f3a80 10.244.85.214:6379@16379 master - 0 1664877624144 2 connected 5461-10922
5392f77757ddc9b6459fc2c2ecc0f1e9adaebfb7 10.244.58.222:6379@16379 slave b18a9738c0f9b080c99563cc629e9d739408bc2e 0 1664877626000 5 connected
d5e437118b5dfadcf8884e8f71260afb580e8720 10.244.58.221:6379@16379 slave 623e7b8734784b15d58f560e9224da8653f28789 0 1664877627157 4 connected

1.7、故障测试 【***】

  • 删除任意一个 pod(删除名称为 redis-cluster-3 的 pod)
kubectl delete po -n infra redis-cluster-3
  • pod 被重新拉起(还占用原来的pvc 和 pv)
[root@k8s-master01 集群]# kubectl get  po -n infra redis-cluster-3 -owide
NAME              READY   STATUS    RESTARTS   AGE   IP              NODE         NOMINATED NODE   READINESS GATES
redis-cluster-3   1/1     Running   0          7s    10.244.58.224   k8s-node02   <none>           <none>
# 可以看到名称为 redis-cluster-3 的 pod 启动时长 AGE 为 7s,IP 由原来的 10.244.58.222 变为 10.244.58.224,
# /data/nodes.conf 文件中 "myself" 对应的 ip 被 fix-ip.sh 脚本修改,redis 集群修复时会将该 ip 同步到其它 pod
# 节点的 /data/nodes.conf 文件中,从而保证整个 redis 集群的可用性。

# kubectl exec -it pod/redis-cluster-0 -n infra -- bash
root@redis-cluster-0:/data# cat /data/nodes.conf   | grep 10.244.58.224
5392f77757ddc9b6459fc2c2ecc0f1e9adaebfb7 10.244.58.224:6379@16379 slave b18a9738c0f9b080c99563cc629e9d739408bc2e 1664877856604 1664877855096 5 disconnected
  • 再次验证集群
[root@k8s-master01 集群]# kubectl exec -it pod/redis-cluster-0 -n infra -- bash
root@redis-cluster-0:/data# redis-cli -h redis-cluster-1.redis-cluster.infra.svc.cluster.local -c -a '密码'
redis-cluster-1.redis-cluster.infra.svc.cluster.local:6379> cluster info
cluster_state:ok
cluster_slots_assigned:16384
cluster_slots_ok:16384
cluster_slots_pfail:0
cluster_slots_fail:0
cluster_known_nodes:6
cluster_size:3
cluster_current_epoch:6
cluster_my_epoch:2
cluster_stats_messages_ping_sent:803
cluster_stats_messages_pong_sent:741
cluster_stats_messages_meet_sent:1
cluster_stats_messages_sent:1545
cluster_stats_messages_ping_received:741
cluster_stats_messages_pong_received:802
cluster_stats_messages_received:1543

1.8、补充

  • 如果整个 redis 集群的 pod 全部都挂掉了,pod自动拉起后,集群不可用,需要重建集群
重建集群的方法一: 重新开始【肯定不建议这样啊】
  • 删除 redis 集群所有的资源,然后重新创建 redis 集群
kubectl delete -f redis-cluster-sts.yml
  • 删除 redis 集群中所有的 pvc(pv)
kubectl delete pvc/redis-data-redis-cluster-0 -n infra
kubectl delete pvc/redis-data-redis-cluster-1 -n infra
kubectl delete pvc/redis-data-redis-cluster-2 -n infra
kubectl delete pvc/redis-data-redis-cluster-3 -n infra
kubectl delete pvc/redis-data-redis-cluster-4 -n infra
kubectl delete pvc/redis-data-redis-cluster-5 -n infra
  • 删除 redis 集群中 pod 对应的 nfs 持久化存储目录
rm -rf 对应目录
  • 重新创建 redis 集群的资源(待 6 个 pod 全部 Running 后,再按 1.5 节重新初始化集群)
kubectl apply -f redis-cluster-sts.yml
重建集群的方法二: 在原有 redis 集群的基础上进行修复
  • 删除 redis 集群中所有的 pod
kubectl delete -f redis-cluster-sts.yml
  • 找到 redis 集群中 pod 对应的 nfs 持久化存储目录后删除 nodes.conf
[root@k8s-node02 infra_data]# cd /admin/infra_data/
# 删除下面列出的这些文件;nodes.conf.bak 是 pod 重启时 fix-ip.sh(sed -i.bak)生成的备份
[root@k8s-node02 infra_data]# ls infra-redis-data-redis-cluster-*/nodes.conf*
infra-redis-data-redis-cluster-0-pvc-353cde34-443d-444b-8f63-a7b466f6e0b8/nodes.conf
infra-redis-data-redis-cluster-1-pvc-20b64aaf-2234-41cb-92f0-357df89ab05a/nodes.conf
infra-redis-data-redis-cluster-2-pvc-8a0607fb-e20c-44f3-858b-0efd5d8574c6/nodes.conf
infra-redis-data-redis-cluster-3-pvc-f12779be-f0dc-4058-aa61-21941de461fa/nodes.conf
infra-redis-data-redis-cluster-3-pvc-f12779be-f0dc-4058-aa61-21941de461fa/nodes.conf.bak
infra-redis-data-redis-cluster-4-pvc-883db134-fc4e-46e7-8f6f-555aefa085c0/nodes.conf
infra-redis-data-redis-cluster-5-pvc-a8335203-725c-4101-b7f9-c83b86e4cbda/nodes.conf
  • 重新创建 redis 集群
kubectl apply -f redis-cluster-sts.yml