Ceph Installation
【vm-01】Preparation
mkdir -p /home/ceph && mkdir -p /var/lib/rook
cd /home/ceph && git clone --single-branch --branch v1.10.8 https://github.com/rook/rook.git
cd /home/ceph/rook/deploy/examples
for i in `cat images.txt | grep csi- ` ; do docker pull registry.aliyuncs.com/google_containers/${i##*/} ; done
for i in `docker images | grep csi- | awk -F ' ' '{print $1":"$2}' ` ; do docker tag registry.aliyuncs.com/google_containers/${i##*/} registry.k8s.io/sig-storage/${i##*/} ; done
docker images | grep registry.aliyuncs.com/google_containers/csi- | awk -F ' ' '{print $1":"$2}' | xargs docker rmi
for i in `cat images.txt | grep -E "quay.io|rook" ` ; do docker pull $i ; done
【vm-01】Deploy crds, common, operator
kubectl create -f /home/ceph/rook/deploy/examples/crds.yaml -f /home/ceph/rook/deploy/examples/common.yaml -f /home/ceph/rook/deploy/examples/operator.yaml
## Rollback:
## kubectl delete -f /home/ceph/rook/deploy/examples/crds.yaml -f /home/ceph/rook/deploy/examples/common.yaml -f /home/ceph/rook/deploy/examples/operator.yaml
mkdir -p /home/k8s/ceph/images && cd /home/k8s/ceph/images
for i in `docker images | grep -E "csi-|rook|quay.io" | awk -F ' ' '{print $1":"$2}'` ; do docker save -o /home/k8s/ceph/images/${i##*/}.tar $i ; done
cd /home/k8s/ceph/images/ && for i in `ls | grep -E "ceph*|csi-|k8s-" ` ; do ctr -n=k8s.io image import $i ; done
for i in `docker images | grep -E "csi-|rook|quay.io" | awk -F ' ' '{print $1":"$2}'` ; do docker tag $i 192.168.100.7:5000/${i##*/} ; done
for i in `docker images | grep 192.168.100 | awk -F ' ' '{print $1":"$2}'`; do docker push $i; done
cp -r /home/ceph/rook/deploy/examples /home/ceph/rook/deploy/examples.bak
【vm-01】Sync images from vm-01 to vm-02 and vm-03
for i in {2..3} ; do ssh vm-0$i "mkdir -p /home/k8s/ceph/images/" && scp -r /home/k8s/ceph/images/* vm-0$i:/home/k8s/ceph/images/ ; done
for i in {2..3} ; do ssh vm-0$i 'cd /home/k8s/ceph/images/ && for j in `ls *.tar` ; do ctr -n=k8s.io image import $j; done ' ; done
【vm-01】Label the nodes
This step can probably be skipped.
kubectl label -n rook-ceph nodes {vm-01,vm-02,vm-03} ceph-osd=enabled
kubectl label -n rook-ceph nodes {vm-01,vm-02,vm-03} ceph-mon=enabled
kubectl label -n rook-ceph nodes vm-01 ceph-mgr=enabled
【vm-01】Deploy the cluster
grep -Ev '^$|#' /home/ceph/rook/deploy/examples/cluster.yaml
cd /home/ceph/rook/deploy/examples/
sed -i 's/# ROOK_CSI_REGISTRAR_IMAGE: \"/ROOK_CSI_REGISTRAR_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml
sed -i 's/# ROOK_CSI_REGISTRAR_IMAGE: \"/ROOK_CSI_REGISTRAR_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml
sed -i 's/# ROOK_CSI_RESIZER_IMAGE: \"/ROOK_CSI_RESIZER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml
sed -i 's/# ROOK_CSI_RESIZER_IMAGE: \"/ROOK_CSI_RESIZER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml
sed -i 's/# ROOK_CSI_PROVISIONER_IMAGE: \"/ROOK_CSI_PROVISIONER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml
sed -i 's/# ROOK_CSI_PROVISIONER_IMAGE: \"/ROOK_CSI_PROVISIONER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml
sed -i 's/# ROOK_CSI_SNAPSHOTTER_IMAGE: \"/ROOK_CSI_SNAPSHOTTER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml
sed -i 's/# ROOK_CSI_SNAPSHOTTER_IMAGE: \"/ROOK_CSI_SNAPSHOTTER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml
sed -i 's/# ROOK_CSI_ATTACHER_IMAGE: \"/ROOK_CSI_ATTACHER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml
sed -i 's/# ROOK_CSI_ATTACHER_IMAGE: \"/ROOK_CSI_ATTACHER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml
sed -i 's/registry.k8s.io\/sig-storage/192.168.100.7:5000/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml
sed -i 's/registry.k8s.io\/sig-storage/192.168.100.7:5000/g' /home/ceph/rook/deploy/examples/operator.yaml
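A quick sanity check (an optional sketch): confirm the five CSI image variables are now uncommented and point at the local registry before continuing.
grep -E 'ROOK_CSI_(REGISTRAR|RESIZER|PROVISIONER|SNAPSHOTTER|ATTACHER)_IMAGE' /home/ceph/rook/deploy/examples/operator.yaml
## every matching line should be uncommented and reference 192.168.100.7:5000/...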
## Uncomment the provider line (around line 92), otherwise Ceph cannot be reached from outside the Kubernetes cluster.
sed -i 's/#provider: host/provider: host/g' /home/ceph/rook/deploy/examples/cluster.yaml
sed -i 's/useAllNodes: true/useAllNodes: false/g' /home/ceph/rook/deploy/examples/cluster.yaml
sed -i 's/useAllDevices: true/useAllDevices: false/g' /home/ceph/rook/deploy/examples/cluster.yaml
sed -i 's/# osdsPerDevice: "1"/osdsPerDevice: "1"/g' /home/ceph/rook/deploy/examples/cluster.yaml
vi /home/ceph/rook/deploy/examples/cluster.yaml
  nodes:
    - name: "vm-01"
      devices:
        - name: "vdb"
    - name: "vm-02"
      devices:
        - name: "vdb"
    - name: "vm-03"
      devices:
        - name: "vdb"
Note: in the configuration above, vdb must be an unpartitioned disk.
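To confirm that vdb really is clean, a minimal check might look like this on each OSD node (the wipefs line is destructive and only needed if the disk held data before):
lsblk -f /dev/vdb   ## should show no partitions and an empty FSTYPE
## wipefs -a /dev/vdb   ## destructive: clears old filesystem signatures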
Remove the control-plane taint:
kubectl describe nodes vm-01 | grep Taints
kubectl taint nodes vm-01 node-role.kubernetes.io/control-plane:NoSchedule-
kubectl apply -f /home/ceph/rook/deploy/examples/cluster.yaml
## If image pulls fail with "http: server gave HTTP response to HTTPS client", add the registry mirror configuration below and restart containerd:
## Note: all three machines need this change.
sed -i 's/config_path = \"\"/config_path = \"\"\n [plugins.\"io.containerd.grpc.v1.cri\".registry.mirrors.\"192.168.100.7:5000\"]\n endpoint = [\"http:\/\/192.168.100.7:5000\"]/g' /etc/containerd/config.toml
for i in {1..3}; do ssh vm-0$i "systemctl daemon-reload && systemctl restart docker && systemctl restart containerd && systemctl restart kubelet" ; done
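To verify the mirror entry landed in /etc/containerd/config.toml (a minimal check; 192.168.100.7:5000 is this environment's local registry):
grep -A 2 'mirrors."192.168.100.7:5000"' /etc/containerd/config.toml
## expected:
##   [plugins."io.containerd.grpc.v1.cri".registry.mirrors."192.168.100.7:5000"]
##     endpoint = ["http://192.168.100.7:5000"]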
## Image pull failure: Back-off pulling image "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.5.1"
## vm-01 succeeds, but vm-02 and vm-03 fail.
The reason is that on vm-02 and vm-03, ctr -n k8s.io images list | grep csi-snapshotter really returns nothing.
Re-running step 1.5.3 (the image import above) fixes this.
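A quick way to compare the imported images on the three nodes (a sketch; adjust the grep pattern as needed):
for i in {1..3}; do echo vm-0$i; ssh vm-0$i 'ctr -n k8s.io images list | grep csi-' ; done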
【vm-01】Deploy toolbox and dashboard-external-https
kubectl create -f /home/ceph/rook/deploy/examples/toolbox.yaml
kubectl apply -f /home/ceph/rook/deploy/examples/dashboard-external-https.yaml
kubectl -n rook-ceph get secret rook-ceph-dashboard-password -o jsonpath="{['data']['password']}" | base64 -d
Admin login password: ,iBa`<LvX=<3X%]:)|=b
【vm-01】Ceph queries
kubectl exec -it `kubectl get pods -n rook-ceph|grep rook-ceph-tools|awk '{print $1}'` -n rook-ceph -- bash
## Dashboard port
kubectl -n rook-ceph get svc | grep rook-ceph-mgr-dashboard-external-https
iptables -S -t nat | grep 31518
Port 31518 can now be mapped to the internet via frp: vi /opt/frp_0.53.0_linux_amd64/frpc.ini
[jjjj-ceph-31518]
# connection type
type = tcp
# IP of the internal machine
local_ip = 127.0.0.1
# local port to forward (the dashboard NodePort)
local_port = 31518
remote_port = 23888
use_encryption = true
# if true, message will be compressed
use_compression = true
for i in `ps -aux | grep '[f]rp' | awk -F ' ' '{print $2}'` ; do kill -9 $i ; done
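After editing frpc.ini and killing the old process, restart frpc. The sketch below assumes the frpc binary sits next to frpc.ini in /opt/frp_0.53.0_linux_amd64:
cd /opt/frp_0.53.0_linux_amd64 && nohup ./frpc -c ./frpc.ini > frpc.log 2>&1 &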
Query the login password:
kubectl -n rook-ceph get secret rook-ceph-dashboard-password -o jsonpath="{['data']['password']}" | base64 -d
## Object storage port
kubectl -n rook-ceph get svc | grep rook-ceph-rgw-my-store-external
## Check filesystem status and the max_mds and standby_count_wanted values
ceph fs get kingcephfs
## Check a pool's PG count
ceph osd pool get replicapool pg_num
## Set a pool's PG count. Changing pg_num on a running cluster has a large impact in production: the whole cluster rebalances and migrates data, and the larger the data set, the longer I/O takes to respond. It is best to set pg_num correctly from the start.
ceph osd pool set replicapool pg_num 40
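For sizing, a commonly cited rule of thumb (not specific to this setup) is total PGs ≈ (number of OSDs × 100) / replica size, rounded to a power of two and then split across pools; with 3 OSDs and 3 replicas that gives 3 × 100 / 3 = 100, rounded to 128 PGs for the whole cluster. The replica size can be checked with:
ceph osd pool get replicapool size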
【vm-01】RBD block storage test
### Create an RBD pool named replicapool
kubectl apply -f /home/ceph/rook/deploy/examples/csi/rbd/storageclass.yaml
kubectl -n rook-ceph get storageclass.storage.k8s.io
## Deploy WordPress
kubectl apply -f /home/ceph/rook/deploy/examples/mysql.yaml
kubectl apply -f /home/ceph/rook/deploy/examples/wordpress.yaml
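A minimal check that the RBD-backed volumes were provisioned and the example pods started:
kubectl get pvc
kubectl get pods | grep -E 'wordpress|mysql'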
【vm-01】CephFS file storage test
kubectl apply -f /home/ceph/rook/deploy/examples/csi/cephfs/storageclass.yaml
kubectl apply -f /home/ceph/rook/deploy/examples/filesystem.yaml
kubectl get cephfilesystems.ceph.rook.io -n rook-ceph
kubectl -n rook-ceph exec -it $(kubectl -n rook-ceph get pod -l "app=rook-ceph-tools" -o jsonpath='{.items[0].metadata.name}') -- bash
ceph osd pool create cephfs-king-metadata 32 32
ceph osd pool create cephfs-king-data 32 32
ceph fs new kingcephfs cephfs-king-metadata cephfs-king-data
At this point there is an error: 1 filesystem is offline; 1 filesystem is online with fewer MDS than max_mds
Deleting the default myfs fixes it:
ceph fs fail myfs
ceph fs rm myfs --yes-i-really-mean-it
ceph fs fail kingcephfs
ceph fs rm kingcephfs --yes-i-really-mean-it
ceph fs new kingcephfs cephfs-king-metadata cephfs-king-data --force
## Generate a key
ceph auth add client.king mon 'allow r' mds 'allow rw' osd 'allow rwx pool=kingcephfs'
ceph auth get client.king
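To print only the secret (needed for the kernel mount on vm-04 below), ceph auth get-key can be run inside the toolbox:
ceph auth get-key client.king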
## Check the mon ports with netstat -tunlp | grep ceph-mon; use port 3300 or 6789 depending on the client version.
## Set the number of MDS daemons
ceph fs set kingcephfs max_mds 2
【vm-04】CephFS client
mkdir -p /mnt/mycephfs && yum install telnet net-tools ceph-common -y
mount -t ceph 192.168.100.7:6789,192.168.100.8:6789,192.168.100.9:6789:/ /mnt/mycephfs -o name=king,secret=AQCx4xdmqUF8ABAA834Svi2EhRiMMAqDUXSkFQ==
## mount may succeed here, yet text files created under /mnt/mycephfs cannot be edited. That is a pool permission problem: grant the client rwx on the data pool, as follows:
ceph auth caps client.king mon 'allow r' mds 'allow rw' osd 'allow rwx pool=cephfs-king-data'
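A quick read/write test after adjusting the caps (a sketch; if writes still fail, unmount and remount so the client picks up the new caps):
df -h /mnt/mycephfs
echo test > /mnt/mycephfs/test.txt && cat /mnt/mycephfs/test.txt && rm -f /mnt/mycephfs/test.txt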
【vm-01】RGW object storage test
kubectl create -f /home/ceph/rook/deploy/examples/object.yaml
kubectl -n rook-ceph get pod -l app=rook-ceph-rgw
## Create an object storage user
kubectl create -f /home/ceph/rook/deploy/examples/object-user.yaml
## Deploy the RGW NodePort service
kubectl apply -f /home/ceph/rook/deploy/examples/rgw-external.yaml
kubectl -n rook-ceph get service rook-ceph-rgw-my-store rook-ceph-rgw-my-store-external
【vm-04】RGW client
yum install -y epel-release && yum install -y net-tools telnet s3cmd python-pip
## In the dashboard, create a bucket, e.g. bucket-king, and select my-user as the owner
## Object storage port
kubectl -n rook-ceph get svc | grep rook-ceph-rgw-my-store-external
Map port 31964 to 23889 (via frp, as above).
## Get the AccessKey/SecretKey
kubectl -n rook-ceph get secret rook-ceph-object-user-my-store-my-user -o yaml | grep AccessKey | awk '{print $2}' | base64 --decode
kubectl -n rook-ceph get secret rook-ceph-object-user-my-store-my-user -o yaml | grep SecretKey | awk '{print $2}' | base64 --decode
## s3cfg configuration:
[default]
access_key = E9BJAC6QKLTOKVJR4TZC
secret_key = KBvm0YqeAFtVM0I1gUhX5wrnwpHZZ5SLhiUa56ss
host_base = zzjjftp.v6.navy:23889
host_bucket = zzjjftp.v6.navy:23889/bucket-king
use_https = False
signature_v2 = False
cat .s3cfg-inner-ip
[default]
access_key = E9BJAC6QKLTOKVJR4TZC
host_base = 192.168.100.8:80
host_bucket = 192.168.100.8:80/testkucket
secret_key = KBvm0YqeAFtVM0I1gUhX5wrnwpHZZ5SLhiUa56ss
signature_v2 = False
use_https = False
The IP 192.168.100.8 and port 80 come from the query above.
## s3cmd commands:
s3cmd ls s3://bucket-king
s3cmd put frpc.ini s3://bucket-king/
s3cmd rm s3://bucket-king/frpc.ini
s3cmd get s3://bucket-king/frpc.ini
Common Ceph Operations
【vm-01】Pod cannot be deleted
kubectl -n rook-ceph get deployment | grep mds | grep 0/1
kubectl -n rook-ceph delete deployment <name>   ## <name> is the deployment returned by the command above (or use the one-liner below)
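The two steps can also be combined into one line (a sketch):
kubectl -n rook-ceph get deployment | grep mds | grep 0/1 | awk '{print $1}' | xargs -r kubectl -n rook-ceph delete deployment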
【vm-01】Namespace cannot be deleted
### Delete the namespace rook-ceph
kubectl delete namespace rook-ceph
kubectl get ns rook-ceph -o json > tmp.json
Edit tmp.json and clear the spec section (the finalizers); alternatively, use the jq sketch after this block.
kubectl proxy --port=8081
curl -k -H "Content-Type: application/json" -X PUT --data-binary @tmp.json http://127.0.0.1:8081/api/v1/namespaces/rook-ceph/finalize
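As an alternative to editing tmp.json by hand, the finalizers can be stripped with jq (assuming jq is installed) before the PUT:
kubectl get ns rook-ceph -o json | jq '.spec.finalizers=[]' > tmp.json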
【vm-01】 filesystem is down
## Error 1: MDS_ALL_DOWN: 1 filesystem is offline / fs kingcephfs is offline because no MDS is active for it.
Exit the toolbox (Ctrl+D), then delete the MDS pods so they get recreated; see the sketch below.
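A sketch of the pod deletion, assuming Rook's default app=rook-ceph-mds label:
kubectl -n rook-ceph delete pod -l app=rook-ceph-mds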
## Error 2: MDS_INSUFFICIENT_STANDBY: insufficient standby MDS daemons available
Check the details: ceph health detail
Check the MDS name: ceph mds stat (outputs kingcephfs)
ceph fs set kingcephfs max_mds 3
ceph fs set kingcephfs standby_count_wanted 3
【vm-01】Common ceph commands
## Basic ceph commands
ceph status
ceph df
rados df
ceph osd status
ceph fs ls
ceph mds stat
## Restart the dashboard
ceph mgr module disable dashboard
ceph mgr module enable dashboard
## View the dashboard address
ceph mgr services
## Handle the alert: HEALTH_WARN 1 daemons have recently crashed; 5 mgr modules have recently crashed
ceph crash ls
ceph crash archive-all
ceph crash info xxx
ceph mgr module ls
【vm-01】 overall HEALTH_WARN insufficient standby MDS daemons available
sed -i "s/activeCount: 1/activeCount: 2/g" /home/ceph/rook/deploy/examples/filesystem.yaml
kubectl apply -f /home/ceph/rook/deploy/examples/filesystem.yaml
ceph fs set kingcephfs standby_count_wanted 0
【vm-01】Creating a bucket in the dashboard fails: RGW REST API cannot be reached: Connection refused
kubectl -n rook-ceph exec -it $(kubectl -n rook-ceph get pod -l "app=rook-ceph-tools" -o jsonpath='{.items[0].metadata.name}') -- bash
ceph mgr module disable dashboard
ceph mgr module enable dashboard
The error clears after running the above.
【vm-01】The RGW NodePort cannot be reached via telnet
for i in {1..3}; do ssh vm-0$i 'echo "net.ipv4.tcp_tw_recycle = 0" >> /etc/sysctl.conf && sysctl -p '; done
References:
https://blog.51cto.com/u_15181572/6172742
https://www.cnblogs.com/hahaha111122222/p/15716352.html