1. Helm Deployment

1.1 Introduction

To deploy Ceph into a K8S cluster, you can use the ceph-helm project. The project currently has some limitations:

  1. The public and cluster networks must be the same
  2. If the storage user is not admin, you need to manually create the user in the Ceph cluster and create the corresponding Secrets in K8S
  3. ceph-mgr can only run as a single replica

1.2 Repository

Run the following commands to add ceph-helm to the local Helm repository:

# This project stores its Chart in the local Helm repository; start the local repo server if it is not already running
nohup /usr/local/bin/helm serve  --address 0.0.0.0:8879 > /dev/null 2>&1 &
 
git clone https://github.com/ceph/ceph-helm
pushd ceph-helm/ceph
make
popd
# After a successful build, the Chart archive is located at ./ceph-0.1.0.tgz

1.3 Override Values

The available values are described below:

ceph.yaml

# Which components to deploy
deployment:
  ceph: true
  storage_secrets: true
  client_secrets: true
  rbd_provisioner: true
  rgw_keystone_user_and_endpoints: false
 
# Change these values to use different images
images:
  ks_user: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
  ks_service: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
  ks_endpoints: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
  bootstrap: docker.io/ceph/daemon:tag-build-master-luminous-ubuntu-16.04
  dep_check: docker.io/kolla/ubuntu-source-kubernetes-entrypoint:4.0.0
  daemon: docker.io/ceph/daemon:tag-build-master-luminous-ubuntu-16.04
  ceph_config_helper: docker.io/port/ceph-config-helper:v1.7.5
  # If you use the officially provided StorageClass, you need to extend the kube-controller image, otherwise it fails with: executable file not found in $PATH
  rbd_provisioner: quay.io/external_storage/rbd-provisioner:v0.1.1
  minimal: docker.io/alpine:latest
  pull_policy: "IfNotPresent"
 
# Node selectors used by the different Ceph components
labels:
  jobs:
    node_selector_key: ceph-mon
    node_selector_value: enabled
  mon:
    node_selector_key: ceph-mon
    node_selector_value: enabled
  mds:
    node_selector_key: ceph-mds
    node_selector_value: enabled
  osd:
    node_selector_key: ceph-osd
    node_selector_value: enabled
  rgw:
    node_selector_key: ceph-rgw
    node_selector_value: enabled
  mgr:
    node_selector_key: ceph-mgr
    node_selector_value: enabled
 
pod:
  dns_policy: "ClusterFirstWithHostNet"
  replicas:
    rgw: 1
    mon_check: 1
    rbd_provisioner: 2
    mgr: 1
  affinity:
      anti:
        type:
          default: preferredDuringSchedulingIgnoredDuringExecution
        topologyKey:
          default: kubernetes.io/hostname
  # If cluster resources are scarce, you can tune the resource requests below
  resources:
    enabled: false
    osd:
      requests:
        memory: "256Mi"
        cpu: "100m"
      limits:
        memory: "1024Mi"
        cpu: "1000m"
    mds:
      requests:
        memory: "10Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    mon:
      requests:
        memory: "50Mi"
        cpu: "100m"
      limits:
        memory: "100Mi"
        cpu: "500m"
    mon_check:
      requests:
        memory: "5Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    rgw:
      requests:
        memory: "5Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    rbd_provisioner:
      requests:
        memory: "5Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    mgr:
      requests:
        memory: "5Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    jobs:
      bootstrap:
        limits:
          memory: "1024Mi"
          cpu: "2000m"
        requests:
          memory: "128Mi"
          cpu: "100m"
      secret_provisioning:
        limits:
          memory: "1024Mi"
          cpu: "2000m"
        requests:
          memory: "128Mi"
          cpu: "100m"
      ks_endpoints:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"
      ks_service:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"
      ks_user:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"
 
secrets:
  keyrings:
    mon: ceph-mon-keyring
    mds: ceph-bootstrap-mds-keyring
    osd: ceph-bootstrap-osd-keyring
    rgw: ceph-bootstrap-rgw-keyring
    mgr: ceph-bootstrap-mgr-keyring
    admin: ceph-client-admin-keyring
  identity:
    admin: ceph-keystone-admin
    user: ceph-keystone-user
    user_rgw: ceph-keystone-user-rgw
 
# !! Adjust the network configuration to your actual environment
network:
  public:   10.0.0.0/16
  cluster:  10.0.0.0/16
  port:
    mon: 6789
    rgw: 8088
 
# !! Add any Ceph configuration options you need here
conf:
  # Object storage gateway (RGW) related settings
  rgw_ks:
    config:
      rgw_keystone_api_version: 3
      rgw_keystone_accepted_roles: "admin, _member_"
      rgw_keystone_implicit_tenants: true
      rgw_s3_auth_use_keystone: true
  ceph:
    override:
    append:
    config:
      global:
        mon_host: null
      osd:
        ms_bind_port_max: 7100
 
ceph:
  rgw_keystone_auth: false
  enabled:
    mds: true
    rgw: true
    mgr: true
  storage:
    # Directory-based OSDs: the storage path on the host
    # /var/lib/ceph-helm/osd is mounted into the container at /var/lib/ceph/osd
    osd_directory: /var/lib/ceph-helm
    mon_directory: /var/lib/ceph-helm
    # Collect logs under /var/log so that fluentd can pick them up
    mon_log: /var/log/ceph/mon
    osd_log: /var/log/ceph/osd
 
# !! Whether to enable directory-based OSDs; requires the node label ceph-osd=enabled
# The storage location is determined by storage.osd_directory above and reuses the existing filesystem
osd_directory:
  enabled: false
 
# If enabled, Ceph is allowed to zap (format) disks, which destroys existing data
enable_zap_and_potentially_lose_data: true
# !! Block-device-based OSDs; requires node labels of the form ceph-osd-device-dev-***=enabled
osd_devices:
  - name: dev-vdb
    # Block device to use
    device: /dev/vdb
    # The journal can be placed on a separate block device for better performance; if unset, it is stored on device
    journal: /dev/vdc
    # Whether to wipe its partition table
    zap: "1"
 
bootstrap:
  enabled: false
  script: |
    ceph -s
    function ensure_pool () {
      ceph osd pool stats $1 || ceph osd pool create $1 $2
    }
    ensure_pool volumes 8
 
# mgr modules to enable
ceph_mgr_enabled_modules:
  - restful
  - status
 
# mgr module configuration
ceph_mgr_modules_config:
  dashboard:
    port: 7000
  localpool:
    failure_domain: host
    subtree: rack
    pg_num: "128"
    num_rep: "3"
    min_size: "2"
 
# Commands to run after deployment/upgrade
# They are executed via kubectl
ceph_commands:
- ceph osd pool create  pg_num
- ceph osd crush tunables 
 
# Kubernetes StorageClass configuration
storageclass:
  provision_storage_class: true
  provisioner: ceph.com/rbd
  # Name of the StorageClass
  name: ceph-rbd
  monitors: null
  # Name of the RBD pool to use
  pool: rbd
  admin_id: admin
  admin_secret_name: pvc-ceph-conf-combined-storageclass
  admin_secret_namespace: ceph
  user_id: admin
  user_secret_name: pvc-ceph-client-key
  # Image format and features for RBD devices
  image_format: "2"
  image_features: layering
 
endpoints:
  # Cluster domain suffix
  cluster_domain_suffix: k8s.gmem.cc
  identity:
    name: keystone
    namespace: null
    auth:
      admin:
        region_name: RegionOne
        username: admin
        password: password
        project_name: admin
        user_domain_name: default
        project_domain_name: default
      user:
        role: admin
        region_name: RegionOne
        username: swift
        password: password
        project_name: service
        user_domain_name: default
        project_domain_name: default
    hosts:
      default: keystone-api
      public: keystone
    host_fqdn_override:
      default: null
    path:
      default: /v3
    scheme:
      default: http
    port:
      admin:
        default: 35357
      api:
        default: 80
  object_store:
    name: swift
    namespace: null
    hosts:
      default: ceph-rgw
    host_fqdn_override:
      default: null
    path:
      default: /swift/v1
    scheme:
      default: http
    port:
      api:
        default: 8088
  ceph_mon:
    namespace: null
    hosts:
      default: ceph-mon
    host_fqdn_override:
      default: null
    port:
      mon:
        default: 6789

Example override values for directory-based OSDs on an Ext4 filesystem:

network:
  public: 10.0.0.0/8
  cluster: 10.0.0.0/8
 
conf:
  ceph:
    config:
      global:
        # Ext4 filesystem
        filestore_xattr_use_omap: true
      osd:
        ms_bind_port_max: 7100
        # Ext4 filesystem
        osd_max_object_name_len: 256
        osd_max_object_namespace_len: 64
        osd_crush_update_on_start: false
 
ceph:
  storage:
    osd_directory: /var/lib/ceph-helm
    mon_directory: /var/lib/ceph-helm
    mon_log: /var/log/ceph/mon
    osd_log: /var/log/ceph/osd
 
# Directory-based OSDs sharing a partition with the operating system
osd_directory:
  enabled: true
 
storageclass:
  name: ceph-rbd
  pool: rbd

1.4 Creating K8S Resources

Create a namespace for Ceph:

kubectl create namespace ceph

Create the RBAC resources:

kubectl create -f ceph-helm/ceph/rbac.yaml

To deploy the Ceph cluster, the nodes in the K8S cluster need to be labeled according to the Ceph role they will play:

  1. ceph-mon=enabled, added to nodes that will run mon
  2. ceph-mgr=enabled, added to nodes that will run mgr
  3. ceph-osd=enabled, added to nodes that will run device-based or directory-based OSDs
  4. ceph-osd-device-NAME=enabled, added to nodes that will run device-based OSDs, where NAME is the OSD device name from ceph-overrides.yaml above, i.e.:
     1. ceph-osd-device-dev-vdb=enabled
     2. ceph-osd-device-dev-vdc=enabled

The corresponding kubectl commands:

# Nodes that run the Ceph Monitor
kubectl label node xenial-100 ceph-mon=enabled ceph-mgr=enabled
# For each OSD node
kubectl label node xenial-100 ceph-osd=enabled ceph-osd-device-dev-vdb=enabled ceph-osd-device-dev-vdc=enabled
kubectl label node xenial-101 ceph-osd=enabled ceph-osd-device-dev-vdb=enabled ceph-osd-device-dev-vdc=enabled

1.5 Release

helm install --name=ceph local/ceph --namespace=ceph -f ceph-overrides.yaml

1.6 Checking Status

Make sure all Pods are running properly:

# kubectl -n ceph get pods
NAME                                    READY     STATUS    RESTARTS   AGE
ceph-mds-7cb7c647c7-7w6pc               0/1       Pending   0          18h
ceph-mgr-66cb85cbc6-hsm65               1/1       Running   3          1h
ceph-mon-check-758b88d88b-2r975         1/1       Running   1          1h
ceph-mon-gvtq6                          3/3       Running   3          1h
ceph-osd-dev-vdb-clj5f                  1/1       Running   15         1h
ceph-osd-dev-vdb-hldw5                  1/1       Running   15         1h
ceph-osd-dev-vdb-l4v6t                  1/1       Running   15         1h
ceph-osd-dev-vdb-v5jmd                  1/1       Running   15         1h
ceph-osd-dev-vdb-wm4v4                  1/1       Running   15         1h
ceph-osd-dev-vdb-zwr65                  1/1       Running   15         1h
ceph-osd-dev-vdc-27wfk                  1/1       Running   15         1h
ceph-osd-dev-vdc-4w4fn                  1/1       Running   15         1h
ceph-osd-dev-vdc-cpkxh                  1/1       Running   15         1h
ceph-osd-dev-vdc-twmwq                  1/1       Running   15         1h
ceph-osd-dev-vdc-x8tpb                  1/1       Running   15         1h
ceph-osd-dev-vdc-zfrll                  1/1       Running   15         1h
ceph-rbd-provisioner-5544dcbcf5-n846s   1/1       Running   4          18h
ceph-rbd-provisioner-5544dcbcf5-t84bz   1/1       Running   3          18h
ceph-rgw-7f97b5b85d-nc5fq               0/1       Pending   0          18h

The MDS and RGW Pods are in Pending state because no node has been given the corresponding labels yet:

# rgw (RADOS Gateway) is Ceph's object storage gateway, a FastCGI service built on the librados API
# that exposes a REST API for storing and managing object data. Object storage suits files such as images and videos
# rgw is compatible with common object storage APIs, e.g. most of the Amazon S3 API and the OpenStack Swift API
ceph-rgw=enabled
# mds (Metadata Server) is required to support the CephFS filesystem
ceph-mds=enabled
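
A minimal sketch of applying these labels, reusing the example node xenial-100 from the commands above (choose whichever nodes should host RGW and MDS in your cluster):

kubectl label node xenial-100 ceph-rgw=enabled ceph-mds=enabled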

Now check the status of the Ceph cluster from the monitor Pod:

# kubectl -n ceph exec -ti ceph-mon-gvtq6 -c ceph-mon -- ceph -s
  cluster:
    # Cluster ID
    id:     08adecc5-72b1-4c57-b5b7-a543cd8295e7
    health: HEALTH_OK
 
  services:
    # Monitor daemons
    mon: 1 daemons, quorum xenial-100
    # Manager daemons
    mgr: xenial-100(active)
    # OSDs (Ceph data storage daemons)
    osd: 12 osds: 12 up, 12 in
  
  data:
    # Number of pools and PGs
    pools:   0 pools, 0 pgs
    # Number of objects
    objects: 0 objects, 0 bytes
    # Disk usage; for filesystem-based OSDs the operating system's usage is counted as well
    usage:   1292 MB used, 322 GB / 323 GB avail
 
    # No PGs are active yet, so the cluster is not usable
    pgs:     100.000% pgs not active
             # undersized: not enough OSDs (replica count 3, only one OSD at this point); peered: the 128 PGs have been mapped to OSDs
             128 undersized+peered
    # After setting the replica count to 1, the output becomes
    pgs:     100.000% pgs not active
             128 creating+peering
    # A short while later, the output becomes
    pgs: 128 active+clean
    # Only at this point can PVCs be provisioned; otherwise the PVC stays in Provisioning and the provisioner log shows lines like:
    # attempting to acquire leader lease...
    # successfully acquired lease to provision for pvc ceph/ceph-pvc
    # stopped trying to renew lease to provision for pvc ceph/ceph-pvc, timeout reached

If the K8S cluster has no default StorageClass, you can set one:

kubectl patch storageclass ceph-rbd -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'

PVCs that do not explicitly specify a StorageClass will then be provisioned through ceph-rbd automatically, as in the sketch below.
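
A minimal sketch of such a claim (the name default-pvc is an example); note that provisioning only succeeds once the rbd pool from section 1.7 exists and its PGs are active+clean:

kubectl -n ceph create -f - <<EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: default-pvc
spec:
  # No storageClassName: the default class (ceph-rbd) is used
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
EOF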

1.7 Creating a Storage Pool

# Create a replicated pool named rbd with 384 PGs
ceph osd pool create rbd 384 replicated
ceph osd pool set rbd min_size 1
 
# In a development environment, the replica count can be set to 1
ceph osd pool set rbd size 1
# min_size is automatically set to a value smaller than size
# After reducing size, the used column of ceph osd status shrinks immediately
 
# Initialize the pool; preferably after all nodes have joined and the CRUSH map has been tuned
rbd pool init rbd
 
# You can create an additional user, for example the one below, to be used with the value storageclass.user_id=k8s
ceph auth get-or-create-key client.k8s mon 'allow r' osd 'allow rwx pool=rbd' | base64
# If you use the default admin user, this step is not needed; admin's permissions are sufficient
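 
# (sketch) A matching K8S Secret must exist for client.k8s in each namespace that creates PVCs;
# the Secret name ceph-client-k8s-key is an example, point storageclass.user_secret_name at it.
# If unsure of the expected format, mirror the chart-generated pvc-ceph-client-key Secret
kubectl -n ceph create secret generic ceph-client-k8s-key --type=kubernetes.io/rbd \
  --from-literal=key="$(ceph auth get-or-create-key client.k8s mon 'allow r' osd 'allow rwx pool=rbd')"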
 
 
# Other commands
# Check OSD usage (requires the MGR)
ceph osd status
+----+------------+-------+-------+--------+---------+--------+---------+-----------+
| id |    host    |  used | avail | wr ops | wr data | rd ops | rd data |   state   |
+----+------------+-------+-------+--------+---------+--------+---------+-----------+
| 0  | xenial-100 |  231M | 26.7G |    0   |  3276   |    0   |     0   | exists,up |
| 1  | xenial-103 |  216M | 26.7G |    0   |   819   |    0   |     0   | exists,up |
| 2  | xenial-101 |  253M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 3  | xenial-103 |  286M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 4  | xenial-101 |  224M | 26.7G |    0   |  1638   |    0   |     0   | exists,up |
| 5  | xenial-105 |  211M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 6  | xenial-100 |  243M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 7  | xenial-102 |  224M | 26.7G |    0   |  2457   |    0   |     0   | exists,up |
| 8  | xenial-102 |  269M | 26.7G |    0   |  1638   |    0   |     0   | exists,up |
| 9  | xenial-104 |  252M | 26.7G |    0   |  2457   |    0   |     0   | exists,up |
| 10 | xenial-104 |  231M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 11 | xenial-105 |  206M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
+----+------------+-------+-------+--------+---------+--------+---------+-----------+

1.8 Using the Storage Pool

You can first try creating and mapping an RBD image with the ceph/rbd CLI:

# The image format defaults to 2
# format 1 - compatible with all versions of librbd and the kernel module, but lacks newer features such as cloning. This format is deprecated
# format 2 - requires librbd or kernel 3.11+. Adds clone support and is easier to extend in the future
rbd create test --size 1G --image-format 2 --image-feature layering
 
# Map it as a local block device. If this hangs, something is likely wrong; an error is printed after a while
rbd map test
# On CentOS 7 the following problem may appear:
#   rbd: sysfs write failed
#   In some cases useful info is found in syslog - try "dmesg | tail".
#   rbd: map failed: (5) Input/output error
# dmesg | tail
#   [1180891.928386] libceph: mon0 10.5.39.41:6789 feature set mismatch,
#     my 2b84a042a42 < server's 40102b84a042a42, missing 401000000000000
#   [1180891.934804] libceph: mon0 10.5.39.41:6789 socket error on read
# The fix is to change the CRUSH bucket algorithm from straw2 to straw, as sketched below
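# (sketch) If you hit the feature-set mismatch above, one way to switch the bucket algorithm is to
# edit the CRUSH map on a monitor node; review crushmap.src before injecting it back
ceph osd getcrushmap -o crushmap.bin
crushtool -d crushmap.bin -o crushmap.src
sed -i 's/alg straw2/alg straw/g' crushmap.src
crushtool -c crushmap.src -o crushmap-new.bin
ceph osd setcrushmap -i crushmap-new.bin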
 
# Create a filesystem on the device and mount it
fdisk /dev/rbd0
mkfs.ext4 /dev/rbd0
mkdir /test
mount /dev/rbd0 /test
 
# Performance tests
# 1 MB block writes
sync; dd if=/dev/zero of=/test/data bs=1M count=512; sync
# 512+0 records in
# 512+0 records out
# 536870912 bytes (537 MB) copied, 4.44723 s, 121 MB/s
# 16K random writes
fio -filename=/dev/rbd0 -direct=1 -iodepth 1 -thread -rw=randwrite -ioengine=psync -bs=16k -size=512M -numjobs=30 -runtime=60 -name=test
# WRITE: bw=35.7MiB/s (37.5MB/s), 35.7MiB/s-35.7MiB/s (37.5MB/s-37.5MB/s), io=2148MiB (2252MB), run=60111-60111msec 
# 16K random reads
fio -filename=/dev/rbd0 -direct=1 -iodepth 1 -thread -rw=randread -ioengine=psync -bs=16k -size=512M -numjobs=30 -runtime=60 -name=test
# READ: bw=110MiB/s (116MB/s), 110MiB/s-110MiB/s (116MB/s-116MB/s), io=6622MiB (6943MB), run=60037-60037msec
 
# Remove the test image
umount /test
rbd unmap test
rbd remove test

After confirming that the Ceph RBD image can be mounted, read, and written, create a PVC:

kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: ceph-pvc
  namespace: ceph
spec:
  accessModes:
   - ReadWriteOnce
  resources:
    requests:
       storage: 1Gi
  storageClassName: ceph-rbd

Check whether the PVC has been bound to a PV:

kubectl -n ceph create -f ceph-pvc.yaml
 
kubectl -n ceph get pvc
 
# NAME       STATUS    VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS   AGE
# ceph-pvc   Bound     pvc-43caef06-46b4-11e8-bed8-deadbeef00a0   1Gi        RWO            ceph-rbd       3s
 
# On the monitor node, confirm that the RBD image has been created
rbd ls
# kubernetes-dynamic-pvc-fbddb77d-46b5-11e8-9204-8a12961e4b47
rbd info kubernetes-dynamic-pvc-fbddb77d-46b5-11e8-9204-8a12961e4b47
# rbd image 'kubernetes-dynamic-pvc-fbddb77d-46b5-11e8-9204-8a12961e4b47':
#         size 128 MB in 32 objects
#         order 22 (4096 kB objects)
#         block_name_prefix: rbd_data.11412ae8944a
#         format: 2
#         features: layering
#         flags: 
#         create_timestamp: Mon Apr 23 05:20:07 2018

To use this storage pool from another namespace, copy the Secret:

kubectl -n ceph get secrets/pvc-ceph-client-key -o json --export | jq '.metadata.namespace = "default"' | kubectl create -f -
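
Note that --export has been removed from newer kubectl releases; a rough equivalent that strips the namespace-specific metadata with jq alone (adjust the field list as needed):

kubectl -n ceph get secrets/pvc-ceph-client-key -o json | jq 'del(.metadata.uid, .metadata.resourceVersion, .metadata.creationTimestamp) | .metadata.namespace = "default"' | kubectl create -f -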

1.9 Uninstalling

helm delete ceph --purge
kubectl delete namespace ceph

In addition, if you want to reinstall, be sure to remove the following directories on every node:

rm -rf /var/lib/ceph-helm
rm -rf /var/lib/ceph

2. Using an Existing Ceph Cluster

You only need to install the appropriate provisioner and configure a suitable StorageClass. Examples:

  1. Provisioner: https://git.gmem.cc/alex/helm-charts/src/branch/master/ceph-provisioners
  2. Installation script: https://git.gmem.cc/alex/k8s-init/src/branch/master/4.infrastructure/0.ceph-external.sh

2.1 CephFS-Based Volumes

Dynamic provisioning of CephFS-backed Kubernetes volumes currently relies on the external-storage project; K8S does not ship a built-in provisioner for CephFS. That project has quite a few problems, so static provisioning is worth considering in production, as sketched below.
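
A minimal sketch of a statically provisioned CephFS-backed PV using the in-tree cephfs volume plugin (the monitor address, CephFS path, and Secret name are placeholders for your environment; the Secret must hold the Ceph user's key):

kubectl create -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: cephfs-static-pv
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  cephfs:
    monitors:
      - 10.0.0.1:6789
    # A pre-created directory inside the default CephFS
    path: /volumes/kubernetes/static-vol
    user: admin
    secretRef:
      name: ceph-admin-secret
    readOnly: false
EOF

A PVC can then bind to this PV by requesting a matching size and access mode (with storageClassName set to the empty string so the default class does not intercept it).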

The provisioner automatically creates "volumes" in the Ceph cluster's default CephFS; Ceph can implement such a virtual volume, backed by a CephFS directory, on top of libcephfs + librados.

In the default CephFS you can see the volumes/kubernetes directory. The kubernetes directory corresponds to a virtual volume group, and each PV corresponds to one of its subdirectories.
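
To inspect these directories directly, you can mount the CephFS root with the kernel client from any host that can reach the monitors; a sketch (the monitor address and secret file path are placeholders):

# Extract the admin key and mount the default CephFS
ceph auth get-key client.admin > /root/admin.secret
mkdir -p /mnt/cephfs
mount -t ceph 10.0.0.1:6789:/ /mnt/cephfs -o name=admin,secretfile=/root/admin.secret
# Each PV shows up as a subdirectory of the kubernetes volume group
ls /mnt/cephfs/volumes/kubernetes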