Ceph Installation

【vm-01】Preparation

mkdir -p /home/ceph && mkdir -p /var/lib/rook

cd /home/ceph && git clone --single-branch --branch v1.10.8 https://github.com/rook/rook.git

cd /home/ceph/rook/deploy/examples



for i in `cat images.txt | grep csi- ` ; do docker pull registry.aliyuncs.com/google_containers/${i##*/} ; done


for i in `docker images | grep csi- | awk -F ' ' '{print $1":"$2}' ` ; do docker tag registry.aliyuncs.com/google_containers/${i##*/} registry.k8s.io/sig-storage/${i##*/} ; done

docker images | grep registry.aliyuncs.com/google_containers/csi- | awk -F ' ' '{print $1":"$2}' | xargs docker rmi
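## Optional sanity check (not part of the original script): confirm the retagged csi images now exist under the new prefix

docker images | grep registry.k8s.io/sig-storage/csi-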



for i in `cat images.txt | grep -E "quay.io|rook" ` ; do docker pull $i ; done



【vm-01】Deploy crds, common, and operator

kubectl create -f /home/ceph/rook/deploy/examples/crds.yaml -f /home/ceph/rook/deploy/examples/common.yaml -f /home/ceph/rook/deploy/examples/operator.yaml

## Rollback:

## kubectl delete -f /home/ceph/rook/deploy/examples/crds.yaml -f /home/ceph/rook/deploy/examples/common.yaml -f /home/ceph/rook/deploy/examples/operator.yaml



mkdir -p /home/k8s/ceph/images && cd /home/k8s/ceph/images

for i in `docker images | grep -E "csi-|rook|quay.io" | awk -F ' ' '{print $1":"$2}'` ; do docker save -o /home/k8s/ceph/images/${i##*/}.tar $i ; done



cd /home/k8s/ceph/images/ && for i in `ls | grep -E "ceph*|csi-|k8s-" ` ; do ctr -n=k8s.io image import $i ; done



for i in `docker images | grep -E "csi-|rook|quay.io" | awk -F ' ' '{print $1":"$2}'` ; do docker tag $i 192.168.100.7:5000/${i##*/} ; done

for i in `docker images | grep 192.168.100 | awk -F ' ' '{print $1":"$2}'`; do docker push $i; done



cp -r /home/ceph/rook/deploy/examples /home/ceph/rook/deploy/examples.bak

【vm-01】Sync the images from vm-01 to vm-02 and vm-03

for i in {2..3} ; do ssh vm-0$i "mkdir -p /home/k8s/ceph/images/" && scp -r /home/k8s/ceph/images/* vm-0$i:/home/k8s/ceph/images/ ; done



for i in {2..3} ; do ssh vm-0$i 'cd /home/k8s/ceph/images/ && for j in `ls *.tar` ; do ctr -n=k8s.io image import $j; done ' ; done



【vm-01】Label the nodes

This step can probably be skipped.

kubectl label -n rook-ceph nodes {vm-01,vm-02,vm-03} ceph-osd=enabled

kubectl label -n rook-ceph nodes {vm-01,vm-02,vm-03} ceph-mon=enabled

kubectl label -n rook-ceph nodes vm-01 ceph-mgr=enabled
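## Optional: confirm the labels landed (not part of the original steps)

kubectl get nodes --show-labels | grep ceph-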


【vm-01】Deploy the cluster

grep -Ev '^$|#' /home/ceph/rook/deploy/examples/cluster.yaml



cd /home/ceph/rook/deploy/examples/

sed -i 's/# ROOK_CSI_REGISTRAR_IMAGE: \"/ROOK_CSI_REGISTRAR_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml

sed -i 's/# ROOK_CSI_REGISTRAR_IMAGE: \"/ROOK_CSI_REGISTRAR_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml


sed -i 's/# ROOK_CSI_RESIZER_IMAGE: \"/ROOK_CSI_RESIZER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml

sed -i 's/# ROOK_CSI_RESIZER_IMAGE: \"/ROOK_CSI_RESIZER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml


sed -i 's/# ROOK_CSI_PROVISIONER_IMAGE: \"/ROOK_CSI_PROVISIONER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml

sed -i 's/# ROOK_CSI_PROVISIONER_IMAGE: \"/ROOK_CSI_PROVISIONER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml


sed -i 's/# ROOK_CSI_SNAPSHOTTER_IMAGE: \"/ROOK_CSI_SNAPSHOTTER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml

sed -i 's/# ROOK_CSI_SNAPSHOTTER_IMAGE: \"/ROOK_CSI_SNAPSHOTTER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml


sed -i 's/# ROOK_CSI_ATTACHER_IMAGE: \"/ROOK_CSI_ATTACHER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml

sed -i 's/# ROOK_CSI_ATTACHER_IMAGE: \"/ROOK_CSI_ATTACHER_IMAGE: \"/g' /home/ceph/rook/deploy/examples/operator.yaml


sed -i 's/registry.k8s.io\/sig-storage/192.168.100.7:5000/g' /home/ceph/rook/deploy/examples/operator-openshift.yaml

sed -i 's/registry.k8s.io\/sig-storage/192.168.100.7:5000/g' /home/ceph/rook/deploy/examples/operator.yaml




## Uncomment provider (around line 92 of cluster.yaml); otherwise Ceph cannot be accessed from outside the k8s cluster.

sed -i 's/#provider: host/provider: host/g' /home/ceph/rook/deploy/examples/cluster.yaml

sed -i 's/useAllNodes: true/useAllNodes: false/g' /home/ceph/rook/deploy/examples/cluster.yaml

sed -i 's/useAllDevices: true/useAllDevices: false/g' /home/ceph/rook/deploy/examples/cluster.yaml

sed -i 's/# osdsPerDevice: "1"/osdsPerDevice: "1"/g' /home/ceph/rook/deploy/examples/cluster.yaml

vi /home/ceph/rook/deploy/examples/cluster.yaml

nodes:
  - name: "vm-01"
    devices:
      - name: "vdb"
  - name: "vm-02"
    devices:
      - name: "vdb"
  - name: "vm-03"
    devices:
      - name: "vdb"



Note: in the configuration above, vdb must be an unpartitioned disk.
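## Optional check (assumes passwordless ssh as used above): a blank disk shows no FSTYPE and no partitions

for i in {1..3}; do ssh vm-0$i "lsblk -f /dev/vdb"; done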



Remove the taint:

kubectl describe nodes vm-01 | grep Taints

kubectl taint nodes vm-01 node-role.kubernetes.io/control-plane:NoSchedule-
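## Optionally check the remaining taints on all three nodes:

for i in {1..3}; do kubectl describe node vm-0$i | grep Taints; done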



kubectl apply -f /home/ceph/rook/deploy/examples/cluster.yaml

## If image pulls fail with "http: server gave HTTP response to HTTPS client", add the registry mirror entries below (inserted by the sed) and restart containerd:

## Note: all 3 machines need this change

sed -i 's/config_path = \"\"/config_path = \"\"\n [plugins.\"io.containerd.grpc.v1.cri\".registry.mirrors.\"192.168.100.7:5000\"]\n endpoint = [\"http:\/\/192.168.100.7:5000\"]/g' /etc/containerd/config.toml

for i in {1..3}; do ssh vm-0$i "systemctl daemon-reload && systemctl restart docker && systemctl restart containerd && systemctl restart kubelet" ; done
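## To verify the sed took effect, the mirror block should now be present in config.toml (the two lines shown in the expected output are exactly what the sed above inserts):

grep -A 2 'registry.mirrors."192.168.100.7:5000"' /etc/containerd/config.toml

## Expected output:
##   [plugins."io.containerd.grpc.v1.cri".registry.mirrors."192.168.100.7:5000"]
##     endpoint = ["http://192.168.100.7:5000"]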


## Image pull failure: Back-off pulling image "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.5.1"

## vm-01 succeeded, but vm-02 and vm-03 failed



This is because ctr -n k8s.io images list | grep csi-snapshotter on vm-02 and vm-03 indeed returns nothing.



Simply repeat step 1.5.3 (the image import step).
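## After re-importing, confirm the images are now present on vm-02 and vm-03:

for i in {2..3}; do ssh vm-0$i 'ctr -n k8s.io images list | grep csi-'; done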


【vm-01】Deploy toolbox and dashboard-external-https

kubectl create -f /home/ceph/rook/deploy/examples/toolbox.yaml

kubectl apply -f /home/ceph/rook/deploy/examples/dashboard-external-https.yaml

kubectl -n rook-ceph get secret rook-ceph-dashboard-password -o jsonpath="{['data']['password']}" | base64 -d



Admin login password: ,iBa`<LvX=<3X%]:)|=b


【vm-01】Ceph queries

kubectl exec -it `kubectl get pods -n rook-ceph|grep rook-ceph-tools|awk '{print $1}'` -n rook-ceph -- bash



## Dashboard port

kubectl -n rook-ceph get svc | grep rook-ceph-mgr-dashboard-external-https

iptables -S -t nat | grep 31518




Port 31518 can now be exposed to the internet via frp: vi /opt/frp_0.53.0_linux_amd64/frpc.ini

[jjjj-ceph-31518]
# forward as plain TCP
type = tcp
# IP of the machine on the internal network
local_ip = 127.0.0.1
# local port to forward (the dashboard NodePort)
local_port = 31518
remote_port = 23888
use_encryption = true
# if true, traffic will be compressed
use_compression = true


for i in `ps -aux | grep frp | grep -v grep | awk -F ' ' '{print $2}'` ; do kill -9 $i ; done


http://zzjjftp.v6.navy:23888/

Retrieve the login password:

kubectl -n rook-ceph get secret rook-ceph-dashboard-password -o jsonpath="{['data']['password']}" | base64 -d


## Object storage port

kubectl -n rook-ceph get svc | grep rook-ceph-rgw-my-store-external

## Check the fs status and the max_mds and standby_count_wanted values

ceph fs get kingcephfs



## Check the pg count of a pool

ceph osd pool get replicapool pg_num



## Set the pg count of a pool. Changing pg_num later has a big impact on a production cluster: once the pg count changes, the whole cluster rebalances and migrates data, and the larger the data set, the longer IO will take to respond. So it is best to set the pg count correctly from the start.

ceph osd pool set replicapool pg_num 40
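## Optional (not in the original steps): if the pg autoscaler is enabled it may manage pg_num itself; check its view of the pool before changing pg_num by hand

ceph osd pool autoscale-status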




【vm-01】RBD block storage test

### Create an rbd pool named replicapool

kubectl apply -f /home/ceph/rook/deploy/examples/csi/rbd/storageclass.yaml

kubectl -n rook-ceph get storageclass.storage.k8s.io
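## A minimal PVC sketch to exercise the block StorageClass (assumes the example manifest keeps its default name rook-ceph-block; the PVC name rbd-test-pvc is just an example):

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: rbd-test-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: rook-ceph-block
EOF

kubectl get pvc rbd-test-pvc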


## Deploy WordPress

kubectl apply -f /home/ceph/rook/deploy/examples/mysql.yaml

kubectl apply -f /home/ceph/rook/deploy/examples/wordpress.yaml
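## The mysql and wordpress examples each create a PVC; optionally confirm they bind and the pods come up:

kubectl get pvc

kubectl get pods -l app=wordpress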

【vm-01】CephFS file storage test

kubectl apply -f /home/ceph/rook/deploy/examples/csi/cephfs/storageclass.yaml

kubectl apply -f /home/ceph/rook/deploy/examples/filesystem.yaml

kubectl get cephfilesystems.ceph.rook.io -n rook-ceph



kubectl -n rook-ceph exec -it $(kubectl -n rook-ceph get pod -l "app=rook-ceph-tools" -o jsonpath='{.items[0].metadata.name}') -- bash

ceph osd pool create cephfs-king-metadata 32 32

ceph osd pool create cephfs-king-data 32 32

ceph fs new kingcephfs cephfs-king-metadata cephfs-king-data



At this point there is an error: 1 filesystem is offline; 1 filesystem is online with fewer MDS than max_mds

Deleting the default myfs resolves it:

ceph fs fail myfs

ceph fs rm myfs --yes-i-really-mean-it

ceph fs fail kingcephfs

ceph fs rm kingcephfs --yes-i-really-mean-it

ceph fs new kingcephfs cephfs-king-metadata cephfs-king-data --force



## Generate a key

ceph auth add client.king mon 'allow r' mds 'allow rw' osd 'allow rwx pool=kingcephfs'

ceph auth get client.king
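## Convenience (not in the original steps): print only the secret, which is what the mount command below needs

ceph auth get-key client.king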



## Check the ceph-mon listening ports with netstat -tunlp | grep ceph-mon; use port 3300 or 6789 depending on the client version.



## Set the number of MDS daemons

ceph fs set kingcephfs max_mds 2


【vm-04】CephFS client

mkdir -p /mnt/mycephfs && yum install telnet net-tools ceph-common -y

mount -t ceph 192.168.100.7:6789,192.168.100.8:6789,192.168.100.9:6789:/ /mnt/mycephfs -o name=king,secret=AQCx4xdmqUF8ABAA834Svi2EhRiMMAqDUXSkFQ==

## mount may succeed here, yet a text file created under /mnt/mycephfs cannot be edited. This is a pool permission problem: grant the client rwx on the data pool of the fs, as follows:

ceph auth caps client.king mon 'allow r' mds 'allow rw' osd 'allow rwx pool=cephfs-king-data'
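## Alternative sketch: keep the key off the mount command line with a secretfile. The path /etc/ceph/king.secret is just an example; the value is the key shown by ceph auth get-key client.king.

mkdir -p /etc/ceph && echo 'AQCx4xdmqUF8ABAA834Svi2EhRiMMAqDUXSkFQ==' > /etc/ceph/king.secret

mount -t ceph 192.168.100.7:6789,192.168.100.8:6789,192.168.100.9:6789:/ /mnt/mycephfs -o name=king,secretfile=/etc/ceph/king.secret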




【vm-01】RGW object storage test

kubectl create -f /home/ceph/rook/deploy/examples/object.yaml

kubectl -n rook-ceph get pod -l app=rook-ceph-rgw



## Create an object storage user

kubectl create -f /home/ceph/rook/deploy/examples/object-user.yaml

## Deploy the rgw NodePort service

kubectl apply -f /home/ceph/rook/deploy/examples/rgw-external.yaml

kubectl -n rook-ceph get service rook-ceph-rgw-my-store rook-ceph-rgw-my-store-external




【vm-04】RGW client

yum install -y epel-release && yum install -y net-tools telnet s3cmd python-pip



## Create a bucket in the dashboard, e.g. bucket-king, and select my-user as the owner



## Object storage port

kubectl -n rook-ceph get svc | grep rook-ceph-rgw-my-store-external



Map port 31964 to 23889 (with frp, as above).



## Get the AccessKey / SecretKey

kubectl -n rook-ceph get secret rook-ceph-object-user-my-store-my-user -o yaml | grep AccessKey | awk '{print $2}' | base64 --decode

kubectl -n rook-ceph get secret rook-ceph-object-user-my-store-my-user -o yaml | grep SecretKey | awk '{print $2}' | base64 --decode
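## Convenience sketch (not in the original): capture both keys into variables and generate the s3cfg shown below in one go. The file name ~/.s3cfg and the host values must match your own frp/NodePort setup.

AK=$(kubectl -n rook-ceph get secret rook-ceph-object-user-my-store-my-user -o jsonpath='{.data.AccessKey}' | base64 --decode)

SK=$(kubectl -n rook-ceph get secret rook-ceph-object-user-my-store-my-user -o jsonpath='{.data.SecretKey}' | base64 --decode)

cat > ~/.s3cfg <<EOF
[default]
access_key = $AK
secret_key = $SK
host_base = zzjjftp.v6.navy:23889
host_bucket = zzjjftp.v6.navy:23889/bucket-king
use_https = False
signature_v2 = False
EOF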



## s3cfg configuration:

[default]

access_key = E9BJAC6QKLTOKVJR4TZC

secret_key = KBvm0YqeAFtVM0I1gUhX5wrnwpHZZ5SLhiUa56ss

host_base = zzjjftp.v6.navy:23889

host_bucket = zzjjftp.v6.navy:23889/bucket-king

use_https = False

signature_v2 = False




cat .s3cfg-inner-ip

[default]

access_key = E9BJAC6QKLTOKVJR4TZC

host_base = 192.168.100.8:80

host_bucket = 192.168.100.8:80/testkucket

secret_key = KBvm0YqeAFtVM0I1gUhX5wrnwpHZZ5SLhiUa56ss

signature_v2 = False

use_https = False


The 192.168.100.8 address and port 80 come from the query above.


## s3cmd commands:

s3cmd ls s3://bucket-king

s3cmd put frpc.ini s3://bucket-king/

s3cmd rm s3://bucket-king/frpc.ini

s3cmd get s3://bucket-king/frpc.ini

Common Ceph Operations

【vm-01】 Pod cannot be deleted

kubectl -n rook-ceph get deployment | grep mds | grep 0/1

kubectl -n rook-ceph delete deployment <name>   ## <name> is the deployment returned by the command above
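## Or as a one-liner sketch (deletes every mds deployment currently showing 0/1 ready):

kubectl -n rook-ceph delete deployment $(kubectl -n rook-ceph get deployment | grep mds | grep 0/1 | awk '{print $1}')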

【vm-01】 Namespace cannot be deleted

### Delete the rook-ceph namespace

kubectl delete namespace rook-ceph

kubectl get ns rook-ceph -o json > tmp.json

Clear the contents of spec (the finalizers) in tmp.json
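## If jq is available, clearing the spec can be scripted instead of hand-editing tmp.json:

kubectl get ns rook-ceph -o json | jq '.spec.finalizers = []' > tmp.json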

kubectl proxy --port=8081

curl -k -H "Content-Type: application/json" -X PUT --data-binary @tmp.json http://127.0.0.1:8081/api/v1/namespaces/rook-ceph/finalize


【vm-01】 filesystem is down

## Error handling 1: MDS_ALL_DOWN: 1 filesystem is offline / fs kingcephfs is offline because no MDS is active for it.

Exit the toolbox (Ctrl+D), then delete the mds pods so they are recreated.


## Error handling 2: MDS_INSUFFICIENT_STANDBY: insufficient standby MDS daemons available

Check the detailed error with ceph health detail

Check the MDS name with ceph mds stat (it outputs kingcephfs)

ceph fs set kingcephfs max_mds 3

ceph fs set kingcephfs standby_count_wanted 3

【vm-01】 Common ceph commands

## ceph commands

ceph status

ceph df

rados df

ceph osd status

ceph fs ls

ceph mds stat


## Restart the dashboard

ceph mgr module disable dashboard

ceph mgr module enable dashboard


## Check the dashboard address

ceph mgr services


## Handle the warning: HEALTH_WARN 1 daemons have recently crashed; 5 mgr modules have recently crashed

ceph crash ls

ceph crash archive-all

ceph crash info xxx


ceph mgr module ls


【vm-01】 overall HEALTH_WARN insufficient standby MDS daemons available

sed -i "s/activeCount: 1/activeCount: 2/g" /home/ceph/rook/deploy/examples/filesystem.yaml

kubectl apply -f /home/ceph/rook/deploy/examples/filesystem.yaml



ceph fs set kingcephfs standby_count_wanted 0






【vm-01】 Error when creating a bucket in the dashboard: RGW REST API cannot be reached: Connection refused

kubectl -n rook-ceph exec -it $(kubectl -n rook-ceph get pod -l "app=rook-ceph-tools" -o jsonpath='{.items[0].metadata.name}') -- bash

ceph mgr module disable dashboard

ceph mgr module enable dashboard

Running the commands above fixes it.


【vm-01】 The rgw NodePort cannot be reached via telnet



for i in {1..3}; do ssh vm-0$i 'echo "net.ipv4.tcp_tw_recycle = 0" >> /etc/sysctl.conf && sysctl -p '; done




References:

https://blog.51cto.com/u_15181572/6172742

https://www.cnblogs.com/hahaha111122222/p/15716352.html