1. Helm Deployment
1.1 Introduction
To deploy Ceph into a K8S cluster you can use the ceph-helm project. The project currently has some limitations:
- The public and cluster networks must be identical
- If the storage user is not admin, you have to create the user in the Ceph cluster manually and create the corresponding Secret in K8S
- ceph-mgr can only run with a single replica
1.2 Repository
Run the following commands to add ceph-helm to the local Helm repository:
# This project stores its chart in the local Helm repository; start the local repo server if it is not already running
nohup /usr/local/bin/helm serve --address 0.0.0.0:8879 > /dev/null 2>&1 &
git clone https://github.com/ceph/ceph-helm
pushd ceph-helm/ceph
make
popd
# After a successful build the chart archive is located at ./ceph-0.1.0.tgz
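If the chart does not show up afterwards, you can point Helm at the local repository and verify the build (a quick check, assuming the default URL of the helm serve instance started above):
helm repo add local http://localhost:8879/charts
helm search local/ceph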
1.3 Override Values
The available values are described below:
ceph.yaml
# Which components to deploy
deployment:
ceph: true
storage_secrets: true
client_secrets: true
rbd_provisioner: true
rgw_keystone_user_and_endpoints: false
# Change these values to use different images
images:
ks_user: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
ks_service: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
ks_endpoints: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
bootstrap: docker.io/ceph/daemon:tag-build-master-luminous-ubuntu-16.04
dep_check: docker.io/kolla/ubuntu-source-kubernetes-entrypoint:4.0.0
daemon: docker.io/ceph/daemon:tag-build-master-luminous-ubuntu-16.04
ceph_config_helper: docker.io/port/ceph-config-helper:v1.7.5
# If you use the officially provided StorageClass, you need to extend the kube-controller image, otherwise it reports "executable file not found in $PATH"
rbd_provisioner: quay.io/external_storage/rbd-provisioner:v0.1.1
minimal: docker.io/alpine:latest
pull_policy: "IfNotPresent"
# Node selectors used by the different Ceph components
labels:
jobs:
node_selector_key: ceph-mon
node_selector_value: enabled
mon:
node_selector_key: ceph-mon
node_selector_value: enabled
mds:
node_selector_key: ceph-mds
node_selector_value: enabled
osd:
node_selector_key: ceph-osd
node_selector_value: enabled
rgw:
node_selector_key: ceph-rgw
node_selector_value: enabled
mgr:
node_selector_key: ceph-mgr
node_selector_value: enabled
pod:
dns_policy: "ClusterFirstWithHostNet"
replicas:
rgw: 1
mon_check: 1
rbd_provisioner: 2
mgr: 1
affinity:
anti:
type:
default: preferredDuringSchedulingIgnoredDuringExecution
topologyKey:
default: kubernetes.io/hostname
# If cluster resources are scarce, the resource requests below can be adjusted
resources:
enabled: false
osd:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "1000m"
mds:
requests:
memory: "10Mi"
cpu: "100m"
limits:
memory: "50Mi"
cpu: "500m"
mon:
requests:
memory: "50Mi"
cpu: "100m"
limits:
memory: "100Mi"
cpu: "500m"
mon_check:
requests:
memory: "5Mi"
cpu: "100m"
limits:
memory: "50Mi"
cpu: "500m"
rgw:
requests:
memory: "5Mi"
cpu: "100m"
limits:
memory: "50Mi"
cpu: "500m"
rbd_provisioner:
requests:
memory: "5Mi"
cpu: "100m"
limits:
memory: "50Mi"
cpu: "500m"
mgr:
requests:
memory: "5Mi"
cpu: "100m"
limits:
memory: "50Mi"
cpu: "500m"
jobs:
bootstrap:
limits:
memory: "1024Mi"
cpu: "2000m"
requests:
memory: "128Mi"
cpu: "100m"
secret_provisioning:
limits:
memory: "1024Mi"
cpu: "2000m"
requests:
memory: "128Mi"
cpu: "100m"
ks_endpoints:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
ks_service:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
ks_user:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
secrets:
keyrings:
mon: ceph-mon-keyring
mds: ceph-bootstrap-mds-keyring
osd: ceph-bootstrap-osd-keyring
rgw: ceph-bootstrap-rgw-keyring
mgr: ceph-bootstrap-mgr-keyring
admin: ceph-client-admin-keyring
identity:
admin: ceph-keystone-admin
user: ceph-keystone-user
user_rgw: ceph-keystone-user-rgw
# !! Adjust the network configuration to your actual environment
network:
public: 10.0.0.0/16
cluster: 10.0.0.0/16
port:
mon: 6789
rgw: 8088
# !! Add any Ceph configuration options you need here
conf:
# Settings for the object storage gateway (RGW) service
rgw_ks:
config:
rgw_keystone_api_version: 3
rgw_keystone_accepted_roles: "admin, _member_"
rgw_keystone_implicit_tenants: true
rgw_s3_auth_use_keystone: true
ceph:
override:
append:
config:
global:
mon_host: null
osd:
ms_bind_port_max: 7100
ceph:
rgw_keystone_auth: false
enabled:
mds: true
rgw: true
mgr: true
storage:
# Host path used by directory-based OSDs
# /var/lib/ceph-helm/osd is mounted into the container at /var/lib/ceph/osd
osd_directory: /var/lib/ceph-helm
mon_directory: /var/lib/ceph-helm
# Collect logs under /var/log so that fluentd can pick them up
mon_log: /var/log/ceph/mon
osd_log: /var/log/ceph/osd
# !! Whether to enable directory-based OSDs; requires the node label ceph-osd=enabled
# The storage location is determined by storage.osd_directory above and reuses the existing filesystem
osd_directory:
enabled: false
# If enabled, Ceph is allowed to zap (format) disks, which will destroy existing data
enable_zap_and_potentially_lose_data: true
# !! Block-device-based OSDs; require the node label ceph-osd-device-dev-***=enabled
osd_devices:
- name: dev-vdb
# The block device to use
device: /dev/vdb
# The journal can be placed on a separate block device for better performance; if not specified it is stored on device
journal: /dev/vdc
# Whether to wipe its partition table
zap: "1"
bootstrap:
enabled: false
script: |
ceph -s
function ensure_pool () {
ceph osd pool stats $1 || ceph osd pool create $1 $2
}
ensure_pool volumes 8
# mgr modules to enable
ceph_mgr_enabled_modules:
- restful
- status
# Configuration for the mgr modules
ceph_mgr_modules_config:
dashboard:
port: 7000
localpool:
failure_domain: host
subtree: rack
pg_num: "128"
num_rep: "3"
min_size: "2"
# Commands to execute after deployment/upgrade
# They are run via kubectl
ceph_commands:
- ceph osd pool create pg_num
- ceph osd crush tunables
# Kubernetes StorageClass configuration
storageclass:
provision_storage_class: true
provisioner: ceph.com/rbd
# Name of the StorageClass
name: ceph-rbd
monitors: null
# Name of the RBD pool to use
pool: rbd
admin_id: admin
admin_secret_name: pvc-ceph-conf-combined-storageclass
admin_secret_namespace: ceph
user_id: admin
user_secret_name: pvc-ceph-client-key
# Image format and features for RBD devices
image_format: "2"
image_features: layering
endpoints:
# Cluster domain suffix
cluster_domain_suffix: k8s.gmem.cc
identity:
name: keystone
namespace: null
auth:
admin:
region_name: RegionOne
username: admin
password: password
project_name: admin
user_domain_name: default
project_domain_name: default
user:
role: admin
region_name: RegionOne
username: swift
password: password
project_name: service
user_domain_name: default
project_domain_name: default
hosts:
default: keystone-api
public: keystone
host_fqdn_override:
default: null
path:
default: /v3
scheme:
default: http
port:
admin:
default: 35357
api:
default: 80
object_store:
name: swift
namespace: null
hosts:
default: ceph-rgw
host_fqdn_override:
default: null
path:
default: /swift/v1
scheme:
default: http
port:
api:
default: 8088
ceph_mon:
namespace: null
hosts:
default: ceph-mon
host_fqdn_override:
default: null
port:
mon:
default: 6789
Example override values for directory-based OSDs on an ext4 filesystem:
network:
public: 10.0.0.0/8
cluster: 10.0.0.0/8
conf:
ceph:
config:
global:
# For ext4 filesystems
filestore_xattr_use_omap: true
osd:
ms_bind_port_max: 7100
# For ext4 filesystems
osd_max_object_name_len: 256
osd_max_object_namespace_len: 64
osd_crush_update_on_start : false
ceph:
storage:
osd_directory: /var/lib/ceph-helm
mon_directory: /var/lib/ceph-helm
mon_log: /var/log/ceph/mon
osd_log: /var/log/ceph/osd
# Directory-based OSDs sharing a partition with the operating system
osd_directory:
enabled: true
storageclass:
name: ceph-rbd
pool: rbd
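For device-based OSDs, a ceph-overrides.yaml might look roughly like the sketch below; the device names /dev/vdb and /dev/vdc are examples and must match the disks actually present on your nodes:
# Write the override file consumed by helm install -f ceph-overrides.yaml later on
cat > ceph-overrides.yaml <<EOF
network:
  public: 10.0.0.0/16
  cluster: 10.0.0.0/16
osd_devices:
  - name: dev-vdb
    device: /dev/vdb
    zap: "1"
  - name: dev-vdc
    device: /dev/vdc
    zap: "1"
storageclass:
  name: ceph-rbd
  pool: rbd
EOF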
1.4 Creating K8S Resources
Create a namespace for Ceph:
kubectl create namespace ceph
Create the RBAC resources:
kubectl create -f ceph-helm/ceph/rbac.yaml
To deploy the Ceph cluster, nodes in the K8S cluster must be labeled according to the role they play in the Ceph cluster:
- ceph-mon=enabled, added to nodes that run mon
- ceph-mgr=enabled, added to nodes that run mgr
- ceph-osd=enabled, added to nodes that run device-based or directory-based OSDs
- ceph-osd-device-NAME=enabled, added to nodes that run device-based OSDs, where NAME is replaced by the OSD device name from ceph-overrides.yaml above, i.e.:
- ceph-osd-device-dev-vdb=enabled
- ceph-osd-device-dev-vdc=enabled
The corresponding K8S commands:
# Nodes that run the Ceph monitor
kubectl label node xenial-100 ceph-mon=enabled ceph-mgr=enabled
# For each OSD node
kubectl label node xenial-100 ceph-osd=enabled ceph-osd-device-dev-vdb=enabled ceph-osd-device-dev-vdc=enabled
kubectl label node xenial-101 ceph-osd=enabled ceph-osd-device-dev-vdb=enabled ceph-osd-device-dev-vdc=enabled
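Before installing, it can be worth confirming that the labels landed on the intended nodes, for example:
kubectl get nodes -l ceph-mon=enabled
kubectl get nodes -l ceph-osd=enabled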
1.5 Release
helm install --name=ceph local/ceph --namespace=ceph -f ceph-overrides.yaml
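Once the command returns, the release can be inspected with the usual Helm 2 commands (assuming the release name ceph used above):
helm ls ceph
helm status ceph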
1.6 Checking Status
Make sure all pods are running normally:
# kubectl -n ceph get pods
NAME READY STATUS RESTARTS AGE
ceph-mds-7cb7c647c7-7w6pc 0/1 Pending 0 18h
ceph-mgr-66cb85cbc6-hsm65 1/1 Running 3 1h
ceph-mon-check-758b88d88b-2r975 1/1 Running 1 1h
ceph-mon-gvtq6 3/3 Running 3 1h
ceph-osd-dev-vdb-clj5f 1/1 Running 15 1h
ceph-osd-dev-vdb-hldw5 1/1 Running 15 1h
ceph-osd-dev-vdb-l4v6t 1/1 Running 15 1h
ceph-osd-dev-vdb-v5jmd 1/1 Running 15 1h
ceph-osd-dev-vdb-wm4v4 1/1 Running 15 1h
ceph-osd-dev-vdb-zwr65 1/1 Running 15 1h
ceph-osd-dev-vdc-27wfk 1/1 Running 15 1h
ceph-osd-dev-vdc-4w4fn 1/1 Running 15 1h
ceph-osd-dev-vdc-cpkxh 1/1 Running 15 1h
ceph-osd-dev-vdc-twmwq 1/1 Running 15 1h
ceph-osd-dev-vdc-x8tpb 1/1 Running 15 1h
ceph-osd-dev-vdc-zfrll 1/1 Running 15 1h
ceph-rbd-provisioner-5544dcbcf5-n846s 1/1 Running 4 18h
ceph-rbd-provisioner-5544dcbcf5-t84bz 1/1 Running 3 18h
ceph-rgw-7f97b5b85d-nc5fq 0/1 Pending 0 18h
The MDS and RGW pods are in Pending state because no node has been given these labels:
# rgw, the RADOS Gateway, is Ceph's object storage gateway service, a FastCGI service built on the librados interface
# that provides a REST API for storing and managing object data. Object storage suits files such as images and videos
# rgw is compatible with common object storage APIs, e.g. most of the Amazon S3 API and the OpenStack Swift API
ceph-rgw=enabled
# mds, the Metadata Server, is required for the Ceph filesystem
ceph-mds=enabled
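To get them scheduled, label suitable nodes accordingly; reusing the monitor node here is just an example:
kubectl label node xenial-100 ceph-rgw=enabled ceph-mds=enabled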
Now check the status of the Ceph cluster from the monitor node:
# kubectl -n ceph exec -ti ceph-mon-gvtq6 -c ceph-mon -- ceph -s
cluster:
# Cluster identifier
id: 08adecc5-72b1-4c57-b5b7-a543cd8295e7
health: HEALTH_OK
services:
# Monitor daemons
mon: 1 daemons, quorum xenial-100
# Manager daemons
mgr: xenial-100(active)
# OSDs (Ceph data storage daemons)
osd: 12 osds: 12 up, 12 in
data:
# Number of pools and PGs
pools: 0 pools, 0 pgs
# Number of objects
objects: 0 objects, 0 bytes
# Disk usage; for filesystem-based OSDs the operating system's usage is counted as well
usage: 1292 MB used, 322 GB / 323 GB avail
# No PGs are active yet, so the cluster is unusable
pgs: 100.000% pgs not active
# undersized is due to too few OSDs (replica count 3, only one OSD at this point); peered means the 128 PGs have been mapped to OSDs
128 undersized+peered
# After setting the replica count to 1, the output becomes
pgs: 100.000% pgs not active
128 creating+peering
# A short while later the output becomes
pgs: 128 active+clean
# Only now can PVCs be provisioned; otherwise the PVC stays in Provisioning and the provisioner log shows messages like the following:
# attempting to acquire leader lease...
# successfully acquired lease to provision for pvc ceph/ceph-pvc
# stopped trying to renew lease to provision for pvc ceph/ceph-pvc, timeout reached
If the K8S cluster has no default StorageClass, you can set one:
kubectl patch storageclass ceph-rbd -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
PVCs that do not explicitly specify a StorageClass will then be provisioned through ceph-rbd automatically.
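With a default class in place, even a PVC without storageClassName is served by ceph-rbd. A minimal sketch (the claim name test-default-pvc is made up for illustration):
cat <<EOF | kubectl -n ceph create -f -
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: test-default-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  # no storageClassName here, so the default class (ceph-rbd) is used
EOF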
1.7 Creating a Storage Pool
# Create a replicated pool named rbd with 384 PGs
ceph osd pool create rbd 384 replicated
ceph osd pool set rbd min_size 1
# In a development environment the replica count can be set to 1
ceph osd pool set rbd size 1
# min_size is automatically set lower than size
# After reducing size, the used column in ceph osd status shrinks immediately
# Initialize the pool; preferably do this after all nodes have joined and the CRUSH map has been tuned
rbd pool init rbd
# You can create an additional user, e.g. the one below, to be used together with the value storageclass.user_id=k8s
ceph auth get-or-create-key client.k8s mon 'allow r' osd 'allow rwx pool=rbd' | base64
# If you use the default admin user, the step above is not needed; admin's permissions are sufficient
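# A sketch of feeding that key into a K8S Secret for use with storageclass.user_id=k8s;
# the secret name ceph-client-k8s-key is hypothetical and must match storageclass.user_secret_name
ceph auth get-key client.k8s > /tmp/client.k8s.key
kubectl -n ceph create secret generic ceph-client-k8s-key \
  --type kubernetes.io/rbd --from-file=key=/tmp/client.k8s.key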
# Other commands
# Check block device (OSD) usage, requires the MGR
ceph osd status
+----+------------+-------+-------+--------+---------+--------+---------+-----------+
| id | host | used | avail | wr ops | wr data | rd ops | rd data | state |
+----+------------+-------+-------+--------+---------+--------+---------+-----------+
| 0 | xenial-100 | 231M | 26.7G | 0 | 3276 | 0 | 0 | exists,up |
| 1 | xenial-103 | 216M | 26.7G | 0 | 819 | 0 | 0 | exists,up |
| 2 | xenial-101 | 253M | 26.7G | 0 | 0 | 0 | 0 | exists,up |
| 3 | xenial-103 | 286M | 26.7G | 0 | 0 | 0 | 0 | exists,up |
| 4 | xenial-101 | 224M | 26.7G | 0 | 1638 | 0 | 0 | exists,up |
| 5 | xenial-105 | 211M | 26.7G | 0 | 0 | 0 | 0 | exists,up |
| 6 | xenial-100 | 243M | 26.7G | 0 | 0 | 0 | 0 | exists,up |
| 7 | xenial-102 | 224M | 26.7G | 0 | 2457 | 0 | 0 | exists,up |
| 8 | xenial-102 | 269M | 26.7G | 0 | 1638 | 0 | 0 | exists,up |
| 9 | xenial-104 | 252M | 26.7G | 0 | 2457 | 0 | 0 | exists,up |
| 10 | xenial-104 | 231M | 26.7G | 0 | 0 | 0 | 0 | exists,up |
| 11 | xenial-105 | 206M | 26.7G | 0 | 0 | 0 | 0 | exists,up |
+----+------------+-------+-------+--------+---------+--------+---------+-----------+
1.8 Using the Storage Pool
You can first try creating and mapping an RBD image with the ceph/rbd commands:
# The image format defaults to 2
# format 1 - compatible with all versions of librbd and the kernel module, but lacks newer features such as cloning. This format is deprecated
# format 2 - supported by librbd and kernel 3.11+ modules. It adds clone support and is easier to extend in the future
rbd create test --size 1G --image-format 2 --image-feature layering
# Map it as a local block device; if this hangs, something is probably wrong and a message appears after a while
rbd map test
# On CentOS 7 the following problem may occur:
# rbd: sysfs write failed
# In some cases useful info is found in syslog - try "dmesg | tail".
# rbd: map failed: (5) Input/output error
# dmesg | tail
# [1180891.928386] libceph: mon0 10.5.39.41:6789 feature set mismatch,
# my 2b84a042a42 < server's 40102b84a042a42, missing 401000000000000
# [1180891.934804] libceph: mon0 10.5.39.41:6789 socket error on read
# The fix is to change the CRUSH bucket algorithm from straw2 to straw
# Mount it as a directory
fdisk /dev/rbd0
mkfs.ext4 /dev/rbd0
mkdir /test
mount /dev/rbd0 /test
# Performance tests
# Sequential write with 1MB blocks
sync; dd if=/dev/zero of=/test/data bs=1M count=512; sync
# 512+0 records in
# 512+0 records out
# 536870912 bytes (537 MB) copied, 4.44723 s, 121 MB/s
# 16K random write
fio -filename=/dev/rbd0 -direct=1 -iodepth 1 -thread -rw=randwrite -ioengine=psync -bs=16k -size=512M -numjobs=30 -runtime=60 -name=test
# WRITE: bw=35.7MiB/s (37.5MB/s), 35.7MiB/s-35.7MiB/s (37.5MB/s-37.5MB/s), io=2148MiB (2252MB), run=60111-60111msec
# 16K random read
fio -filename=/dev/rbd0 -direct=1 -iodepth 1 -thread -rw=randread -ioengine=psync -bs=16k -size=512M -numjobs=30 -runtime=60 -name=test
# READ: bw=110MiB/s (116MB/s), 110MiB/s-110MiB/s (116MB/s-116MB/s), io=6622MiB (6943MB), run=60037-60037msec
# Remove the test image
umount /test
rbd unmap test
rbd remove test
After confirming that the Ceph RBD image can be mounted and read/written, create a PVC:
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: ceph-pvc
namespace: ceph
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
storageClassName: ceph-rbd
Check whether the PVC is bound to a PV:
kubectl -n ceph create -f ceph-pvc.yaml
kubectl -n ceph get pvc
# NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
# ceph-pvc Bound pvc-43caef06-46b4-11e8-bed8-deadbeef00a0 1Gi RWO ceph-rbd 3s
# Confirm on the monitor node that the RBD image has been created
rbd ls
# kubernetes-dynamic-pvc-fbddb77d-46b5-11e8-9204-8a12961e4b47
rbd info kubernetes-dynamic-pvc-fbddb77d-46b5-11e8-9204-8a12961e4b47
# rbd image 'kubernetes-dynamic-pvc-fbddb77d-46b5-11e8-9204-8a12961e4b47':
# size 128 MB in 32 objects
# order 22 (4096 kB objects)
# block_name_prefix: rbd_data.11412ae8944a
# format: 2
# features: layering
# flags:
# create_timestamp: Mon Apr 23 05:20:07 2018
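To confirm the claim is actually usable, you can mount it into a throwaway pod (a sketch; the pod name is arbitrary and the image simply reuses the alpine image from the values above):
cat <<EOF | kubectl -n ceph create -f -
apiVersion: v1
kind: Pod
metadata:
  name: pvc-test
spec:
  containers:
    - name: shell
      image: docker.io/alpine:latest
      command: ["sleep", "3600"]
      volumeMounts:
        - name: data
          mountPath: /data
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: ceph-pvc
EOF
# Once it is running: kubectl -n ceph exec -ti pvc-test -- df -h /data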
To use this storage pool from another namespace, copy the Secret:
kubectl -n ceph get secrets/pvc-ceph-client-key -o json --export | jq '.metadata.namespace = "default"' | kubectl create -f -
1.9 Uninstalling
helm delete ceph --purge
kubectl delete namespace ceph
Also, if you plan to reinstall, make sure to remove the following directories on every node:
rm -rf /var/lib/ceph-helm
rm -rf /var/lib/ceph
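If device-based OSDs were used, the partition tables and Ceph metadata on the OSD disks usually have to be wiped as well before reinstalling. A sketch, and obviously destructive, so double-check the device names:
# DANGER: destroys all data on the listed devices
wipefs --all /dev/vdb /dev/vdc
sgdisk --zap-all /dev/vdb
sgdisk --zap-all /dev/vdc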
2. Using an Existing Ceph Cluster
You only need to install the corresponding provisioner and configure an appropriate StorageClass. Examples:
- Provisioner: https://git.gmem.cc/alex/helm-charts/src/branch/master/ceph-provisioners
- Installation script: https://git.gmem.cc/alex/k8s-init/src/branch/master/4.infrastructure/0.ceph-external.sh
2.1 CephFS-Based Volumes
Dynamic provisioning of Kubernetes volumes currently relies on the external-storage project; K8S ships no built-in provisioner for CephFS. That project has quite a few issues, so static provisioning is worth considering for production.
The provisioner automatically creates "volumes" in the Ceph cluster's default CephFS. Ceph supports implementing a virtual volume backed by a CephFS directory on top of libcephfs and librados.
In the default CephFS you will see a volumes/kubernetes directory. The kubernetes directory corresponds to a virtual volume group, and each PV corresponds to one of its subdirectories.
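A StorageClass for such a provisioner might look roughly like the sketch below; the provisioner name ceph.com/cephfs and the parameter names follow the external-storage CephFS provisioner, and the monitor address and secret names are placeholders to adjust:
cat <<EOF | kubectl create -f -
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: cephfs
provisioner: ceph.com/cephfs
parameters:
  monitors: 10.0.0.1:6789
  adminId: admin
  adminSecretName: ceph-admin-secret
  adminSecretNamespace: ceph
  claimRoot: /volumes/kubernetes
EOF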