Overview
The test environment is CentOS 7 x86_64.
Getting the Source from an RPM
$ cd ~/rpmbuild/
$ yumdownloader --source device-mapper-multipath
$ rpm -ivh device-mapper-multipath-0.4.9-85.el7_2.6.src.rpm
$ rpmbuild -ba SPECS/device-mapper-multipath.spec
$ ll RPMS/x86_64/ | awk '{print $9}'
device-mapper-multipath-0.4.9-85.el7.centos.6.x86_64.rpm
device-mapper-multipath-debuginfo-0.4.9-85.el7.centos.6.x86_64.rpm
device-mapper-multipath-libs-0.4.9-85.el7.centos.6.x86_64.rpm
device-mapper-multipath-sysvinit-0.4.9-85.el7.centos.6.x86_64.rpm
kpartx-0.4.9-85.el7.centos.6.x86_64.rpm
Getting the Source from Upstream or GitHub
$ mkdir -pv /opt/multipath
$ cd /opt/multipath
$ git clone http://git.opensvc.com/multipath-tools/.git src
# or
$ git clone https://github.com/cathay4t/multipath-tools.git src
Building the RPM Source Independently
$ rpmbuild -bp SPECS/device-mapper-multipath.spec
$ mkdir -pv /opt/multipath
$ cp -rv BUILD/multipath-tools-130222/ /opt/multipath/
$ cd /opt/multipath/multipath-tools-130222/
$ make LIB=lib64
$ make install DESTDIR=/opt/multipath/target \
bindir=/usr/sbin \
syslibdir=/usr/lib64 \
libdir=/usr/lib64/multipath \
rcdir=/etc/rc.d/init.d \
unitdir=/usr/lib/systemd/system
Inspecting the Installed Files
$ cd /opt/multipath/target/
$ tree
.
├── etc
│   └── rc.d
│       └── init.d
│           └── multipathd
└── usr
    ├── lib
    │   ├── systemd
    │   │   └── system
    │   │       └── multipathd.service
    │   └── udev
    │       └── rules.d
    │           ├── 11-dm-mpath.rules
    │           └── 62-multipath.rules
    ├── lib64
    │   ├── libmpathpersist.so -> libmpathpersist.so.0
    │   ├── libmpathpersist.so.0
    │   ├── libmultipath.so -> libmultipath.so.0
    │   ├── libmultipath.so.0
    │   └── multipath
    │       ├── libcheckcciss_tur.so
    │       ├── libcheckdirectio.so
    │       ├── libcheckemc_clariion.so
    │       ├── libcheckhp_sw.so
    │       ├── libcheckhp_tur.so
    │       ├── libcheckrdac.so
    │       ├── libcheckreadsector0.so
    │       ├── libchecktur.so
    │       ├── libprioalua.so
    │       ├── libprioconst.so
    │       ├── libpriodatacore.so
    │       ├── libprioemc.so
    │       ├── libpriohds.so
    │       ├── libpriohp_sw.so
    │       ├── libprioiet.so
    │       ├── libprioontap.so
    │       ├── libpriorandom.so
    │       ├── libpriordac.so
    │       └── libprioweightedpath.so
    ├── sbin
    │   ├── kpartx
    │   ├── mpathconf
    │   ├── mpathpersist
    │   ├── multipath
    │   └── multipathd
    └── share
        └── man
            ├── man3
            │   ├── mpath_persistent_reserve_in.3.gz
            │   └── mpath_persistent_reserve_out.3.gz
            ├── man5
            │   └── multipath.conf.5.gz
            └── man8
                ├── kpartx.8.gz
                ├── mpathconf.8.gz
                ├── mpathpersist.8.gz
                ├── multipath.8.gz
                └── multipathd.8.gz
17 directories, 40 files
The main files are:
SysV and systemd startup scripts
/etc/rc.d/init.d/multipathd
/usr/lib/systemd/system/multipathd.service
udev rules
/usr/lib/udev/rules.d/11-dm-mpath.rules
/usr/lib/udev/rules.d/62-multipath.rules
User-space tools
/usr/sbin/multipathd - the daemon; listens for path state changes in the system and reacts accordingly.
/usr/sbin/mpathpersist - SCSI Persistent Reservation (PR) command tool, mainly used for fencing.
/usr/sbin/mpathconf - modifies the multipath configuration.
/usr/sbin/kpartx - tool for creating DeviceMapper virtual devices.
/usr/sbin/multipath - the multipath command-line tool.
User-space libraries
/usr/lib64/libmpathpersist.so.0
/usr/lib64/libmultipath.so
/usr/lib64/libmpathpersist.so
/usr/lib64/libmultipath.so.0
Man pages
/usr/share/man/*
Plugins (path checkers and prioritizers)
/usr/lib64/multipath/*
A Worked Example
Here we have an iSCSI device named iqn.2016-10.org.lr:storage with two paths, both active. Let's run a few simple commands to look at their state and briefly analyze how the pieces relate, to get a first feel for the stack.
Inspecting the multipath configuration
$ cat /etc/multipath.conf
blacklist {
devnode "^(ram|raw|loop|fd|md|dm-|sr|scd|st)[0-9]*"
devnode "^sd[a-b][0-9]*"
}
defaults {
user_friendly_names yes
path_grouping_policy multibus
failback immediate
no_path_retry fail
}
multipaths {
multipath {
wwid 360000000000000000e00000000010001
alias iscsi
}
}
Besides some basic settings, this names the SCSI device with WWID 360000000000000000e00000000010001 as "iscsi".
Inspecting the DeviceMapper mapping table
$ dmsetup table
iscsi: 0 209715200 multipath 0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1
As shown, every sector of the multipath device iscsi is mapped through the multipath target; the parameters passed to the driver are "0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1", and the two underlying path devices have device numbers 8:64 and 8:80.
Inspecting the disk block device paths and their device numbers
$ ll /dev/sd*
brw-rw---- 1 root disk 8, 0 Nov 10 14:24 /dev/sda
brw-rw---- 1 root disk 8, 1 Nov 10 14:24 /dev/sda1
brw-rw---- 1 root disk 8, 2 Nov 10 14:24 /dev/sda2
brw-rw---- 1 root disk 8, 3 Nov 10 14:24 /dev/sda3
brw-rw---- 1 root disk 8, 16 Nov 10 14:24 /dev/sdb
brw-rw---- 1 root disk 8, 17 Nov 10 14:24 /dev/sdb1
brw-rw---- 1 root disk 8, 32 Nov 10 14:24 /dev/sdc
brw-rw---- 1 root disk 8, 33 Nov 10 14:24 /dev/sdc1
brw-rw---- 1 root disk 8, 48 Nov 10 14:25 /dev/sdd
brw-rw---- 1 root disk 8, 49 Nov 30 11:49 /dev/sdd1
brw-rw---- 1 root disk 8, 50 Nov 10 14:25 /dev/sdd2
brw-rw---- 1 root disk 8, 64 Nov 10 14:24 /dev/sde
brw-rw---- 1 root disk 8, 80 Nov 17 15:04 /dev/sdf
The two paths corresponding to the device numbers above are /dev/sde and /dev/sdf.
Inspecting the mapped multipath device
$ ll /dev/mapper/
crw------- 1 root root 10, 236 Nov 8 17:03 control
lrwxrwxrwx 1 root root 7 Nov 30 10:47 iscsi -> ../dm-0
The mapped multipath device /dev/mapper/iscsi is actually a symbolic link to /dev/dm-0 (LVM works the same way).
Inspecting the DeviceMapper device created by multipath
$ ll /dev/dm-*
brw-rw---- 1 root disk 253, 0 Nov 30 10:47 /dev/dm-0
The DeviceMapper devices created by multipath all have major number 253, the dynamic major assigned to device-mapper on this system.
Inspecting all block devices
$ ll /dev/block/
lrwxrwxrwx 1 root root 6 Nov 8 17:03 11:0 -> ../sr0
lrwxrwxrwx 1 root root 7 Nov 30 10:47 253:0 -> ../dm-0
lrwxrwxrwx 1 root root 6 Nov 10 14:24 8:0 -> ../sda
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:1 -> ../sda1
lrwxrwxrwx 1 root root 6 Nov 10 14:24 8:16 -> ../sdb
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:17 -> ../sdb1
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:2 -> ../sda2
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:3 -> ../sda3
lrwxrwxrwx 1 root root 6 Nov 10 14:24 8:32 -> ../sdc
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:33 -> ../sdc1
lrwxrwxrwx 1 root root 6 Nov 10 14:25 8:48 -> ../sdd
lrwxrwxrwx 1 root root 7 Nov 30 11:49 8:49 -> ../sdd1
lrwxrwxrwx 1 root root 7 Nov 10 14:25 8:50 -> ../sdd2
lrwxrwxrwx 1 root root 6 Nov 10 14:24 8:64 -> ../sde
lrwxrwxrwx 1 root root 6 Nov 17 15:04 8:80 -> ../sdf
Inspecting the devices' WWIDs
$ /lib/udev/scsi_id -g -u /dev/sde
360000000000000000e00000000010001
$ /lib/udev/scsi_id -g -u /dev/sdf
360000000000000000e00000000010001
$ /lib/udev/scsi_id -g -u /dev/mapper/iscsi
360000000000000000e00000000010001
Both paths to the same device, as well as the newly mapped device, report the same WWID.
Inspecting the current multipath state
$ multipath -ll
iscsi (360000000000000000e00000000010001) dm-0 IET ,VIRTUAL-DISK
size=100G features='0' hwhandler='0' wp=rw
`-+- policy='service-time 0' prio=1 status=active
|- 3:0:0:1 sde 8:64 active ready running
`- 4:0:0:1 sdf 8:80 active ready running
Summary
- When the initiator connects over two paths to the device named iqn.2016-10.org.lr:storage with WWID 360000000000000000e00000000010001, two block device files are created, /dev/sde and /dev/sdf, with device numbers 8:64 and 8:80;
- Based on the information of the two iSCSI block devices, the multipath software determines that they are paths to the same device and, per the configuration, creates a virtual block device through libdevmapper functions;
- From the parameters supplied by multipathd or multipath, libdevmapper builds the DeviceMapper mapping table, loads the multipath target, creates the virtual disk /dev/dm-0 with device number 253:0, and creates the symbolic link /dev/mapper/iscsi for it.
Driver-Layer Analysis
Getting the Source
The multipath driver source lives under drivers/md in the kernel tree. The basic steps to fetch the kernel source are:
$ yum install rpm-build rpmdevtools
$ rpmdev-setuptree
$ yumdownloader --source kernel
$ rpm -ivh kernel-*.src.rpm
$ rpmbuild -bp ~/rpmbuild/SPECS/kernel.spec
$ ls ~/rpmbuild/BUILD/kernel-*/linux-*/
Analyzing the Build Configuration Options
Looking at Kconfig
$ vi drivers/md/Kconfig
...
config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
help
MD_MULTIPATH provides a simple multi-path personality for use
the MD framework. It is not under active development. New
projects should consider using DM_MULTIPATH which has more
features and more testing.
If unsure, say N.
config DM_MULTIPATH
tristate "Multipath target"
depends on BLK_DEV_DM
# nasty syntax but means make DM_MULTIPATH independent
# of SCSI_DH if the latter isn't defined but if
# it is, DM_MULTIPATH must depend on it. We get a build
# error if SCSI_DH=m and DM_MULTIPATH=y
depends on SCSI_DH || !SCSI_DH
---help---
Allow volume managers to support multipath hardware.
config DM_MULTIPATH_QL
tristate "I/O Path Selector based on the number of in-flight I/Os"
depends on DM_MULTIPATH
---help---
This path selector is a dynamic load balancer which selects
the path with the least number of in-flight I/Os.
If unsure, say N.
config DM_MULTIPATH_ST
tristate "I/O Path Selector based on the service time"
depends on DM_MULTIPATH
---help---
This path selector is a dynamic load balancer which selects
the path expected to complete the incoming I/O in the shortest
time.
If unsure, say N.
...
MD_MULTIPATH is the legacy driver: it is no longer developed and is not recommended. DM_MULTIPATH is the current driver, and DM_MULTIPATH_QL and DM_MULTIPATH_ST are two optional path-selection algorithms.
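To check how these options were set for the running kernel, you can query the distribution's kernel config (a quick sketch, assuming the config file is installed under /boot; on CentOS 7 all four are typically built as modules):
$ grep MULTIPATH /boot/config-$(uname -r)
CONFIG_MD_MULTIPATH=m
CONFIG_DM_MULTIPATH=m
CONFIG_DM_MULTIPATH_QL=m
CONFIG_DM_MULTIPATH_ST=m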
Looking at the Makefile
$ vi drivers/md/Makefile
...
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
dm-multipath-y += dm-path-selector.o dm-mpath.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
drivers/md/dm-mpath.o
drivers/md/dm-path-selector.o
drivers/md/dm-round-robin.o
drivers/md/dm-queue-length.o
drivers/md/dm-service-time.o
...
With the current multipath driver enabled, dm-path-selector.c, dm-mpath.c and dm-round-robin.c are compiled. dm-mpath.c is the core of the multipath driver: it initializes the key data structures and registers the DeviceMapper target type. dm-path-selector.c is the library that manages the path-selection algorithms, and dm-round-robin.c is the always-present selection algorithm: after a configured number of I/Os on one path it switches to the next, cycling forever. In addition there are two optional selectors: dm-service-time.c picks the lighter-loaded path based on each path's throughput and outstanding bytes, while dm-queue-length.c picks the path with the fewest in-flight I/Os.
Analysis of dm-mpath.c
Initialization
static struct target_type multipath_target = {
.name = "multipath",
.version = {1, 9, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
.map_rq = multipath_map,
.clone_and_map_rq = multipath_clone_and_map,
.release_clone_rq = multipath_release_clone,
.rq_end_io = multipath_end_io,
.presuspend = multipath_presuspend,
.postsuspend = multipath_postsuspend,
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,
.ioctl = multipath_ioctl,
.iterate_devices = multipath_iterate_devices,
.busy = multipath_busy,
};
static int __init dm_multipath_init(void)
{
int r;
/* allocate a slab for the dm_ios */
_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
if (!_mpio_cache)
return -ENOMEM;
r = dm_register_target(&multipath_target);
if (r < 0) {
DMERR("register failed %d", r);
r = -EINVAL;
goto bad_register_target;
}
kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
if (!kmultipathd) {
DMERR("failed to create workqueue kmpathd");
r = -ENOMEM;
goto bad_alloc_kmultipathd;
}
/*
* A separate workqueue is used to handle the device handlers
* to avoid overloading existing workqueue. Overloading the
* old workqueue would also create a bottleneck in the
* path of the storage hardware device activation.
*/
kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
WQ_MEM_RECLAIM);
if (!kmpath_handlerd) {
DMERR("failed to create workqueue kmpath_handlerd");
r = -ENOMEM;
goto bad_alloc_kmpath_handlerd;
}
DMINFO("version %u.%u.%u loaded",
multipath_target.version[0], multipath_target.version[1],
multipath_target.version[2]);
return 0;
bad_alloc_kmpath_handlerd:
destroy_workqueue(kmultipathd);
bad_alloc_kmultipathd:
dm_unregister_target(&multipath_target);
bad_register_target:
kmem_cache_destroy(_mpio_cache);
return r;
}
It mainly does the following (a quick verification sketch follows the list):
- creates a slab cache, used wherever multipath later needs fast allocation and freeing of per-I/O memory;
- registers the multipath target type with the DeviceMapper framework;
- allocates two workqueues for executing time-consuming work asynchronously, off the I/O path.
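A quick way to verify that the target type registered by dm_multipath_init() is present is to load the module and list the registered DeviceMapper targets (a sketch; the version string and target list vary by kernel):
$ modprobe dm-multipath
$ dmsetup targets
multipath        v1.9.0
...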
Constructing an Instance
static int multipath_ctr(struct dm_target *ti, unsigned int argc,
char **argv)
{
/* target arguments */
static struct dm_arg _args[] = {
{0, 1024, "invalid number of priority groups"},
{0, 1024, "invalid initial priority group number"},
};
int r;
struct multipath *m;
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
as.argc = argc;
as.argv = argv;
m = alloc_multipath(ti);
if (!m) {
ti->error = "can't allocate multipath";
return -EINVAL;
}
r = parse_features(&as, m);
if (r)
goto bad;
r = parse_hw_handler(&as, m);
if (r)
goto bad;
r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
if (r)
goto bad;
r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
if (r)
goto bad;
if ((!m->nr_priority_groups && next_pg_num) ||
(m->nr_priority_groups && !next_pg_num)) {
ti->error = "invalid initial priority group";
r = -EINVAL;
goto bad;
}
/* parse the priority groups */
while (as.argc) {
struct priority_group *pg;
pg = parse_priority_group(&as, m);
if (IS_ERR(pg)) {
r = PTR_ERR(pg);
goto bad;
}
m->nr_valid_paths += pg->nr_pgpaths;
list_add_tail(&pg->list, &m->priority_groups);
pg_count++;
pg->pg_num = pg_count;
if (!--next_pg_num)
m->next_pg = pg;
}
if (pg_count != m->nr_priority_groups) {
ti->error = "priority group count mismatch";
r = -EINVAL;
goto bad;
}
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
return 0;
bad:
free_multipath(m);
return r;
}
When a user-space program loads a DeviceMapper mapping table whose target type is multipath, this function is called with the parameters from that table to construct the target device. The steps are:
- allocate the instance and perform basic initialization;
- parse the parameters, creating the priority groups and, for each group, its path-selection algorithm;
- set a few basic DeviceMapper target properties.
The paths of one device may be of different kinds and are divided into priority groups; each group must contain one path selector (the path-selection algorithm) and at least one path.
Parameter layout
- feature parameters;
- hardware-handler parameters;
- priority-group and path parameters.
A parameter-parsing example
Taking the mapping table of the iscsi device above as an example:
iscsi: 0 209715200 multipath 0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1
The part passed to the driver is "0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1":
0 : the number of feature parameters is 0; if non-zero, it would be followed by parameters such as pg_init_retries and pg_init_delay_msecs;
0 : the number of hardware-handler parameters is 0; if non-zero, it would be followed by the hardware handler's arguments;
1 : the number of priority groups;
1 : the number of the next priority group, used while parsing the groups;
service-time : this priority group uses the service-time algorithm;
0 : the algorithm takes 0 parameters;
2 : this priority group contains two paths;
2 : each path carries two parameters;
8:64 1 1 : the first path has device number 8:64, switches after every 1 I/O (repeat_count), and has a relative throughput weight of 1;
8:80 1 1 : the second path has device number 8:80, switches after every 1 I/O, and has a relative throughput weight of 1.
What user space ultimately sees is this (a dmsetup sketch follows the output):
iscsi (360000000000000000e00000000010001) dm-0 IET ,VIRTUAL-DISK
size=100G features='0' hwhandler='0' wp=rw
`-+- policy='service-time 0' prio=1 status=active
|- 3:0:0:1 sde 8:64 active ready running
`- 4:0:0:1 sdf 8:80 active ready running
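The same table format can be loaded by hand with dmsetup, which is a convenient way to experiment with these parameters (a sketch: the map name mpath_test is made up, and the device numbers must refer to real, equally sized paths on your system):
$ dmsetup create mpath_test --table \
    "0 209715200 multipath 0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1"
$ dmsetup table mpath_test
$ dmsetup remove mpath_test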
I/O Mapping
Basic mapping steps
static int multipath_map(struct dm_target *ti, struct request *clone,
union map_info *map_context)
{
return __multipath_map(ti, clone, map_context, NULL, NULL);
}
static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
union map_info *map_context,
struct request **clone)
{
return __multipath_map(ti, NULL, map_context, rq, clone);
}
static int __multipath_map(struct dm_target *ti, struct request *clone,
union map_info *map_context,
struct request *rq, struct request **__clone)
{
struct multipath *m = (struct multipath *) ti->private;
int r = DM_MAPIO_REQUEUE;
size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
struct pgpath *pgpath;
struct block_device *bdev;
struct dm_mpath_io *mpio;
spin_lock_irq(&m->lock);
/* Do we need to select a new pgpath? */
if (!m->current_pgpath ||
(!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
__choose_pgpath(m, nr_bytes);
pgpath = m->current_pgpath;
if (!pgpath) {
if (!__must_push_back(m))
r = -EIO; /* Failed */
goto out_unlock;
} else if (m->queue_io || m->pg_init_required) {
__pg_init_all_paths(m);
goto out_unlock;
}
if (set_mapinfo(m, map_context) < 0)
/* ENOMEM, requeue */
goto out_unlock;
mpio = map_context->ptr;
mpio->pgpath = pgpath;
mpio->nr_bytes = nr_bytes;
bdev = pgpath->path.dev->bdev;
spin_unlock_irq(&m->lock);
if (clone) {
/* Old request-based interface: allocated clone is passed in */
clone->q = bdev_get_queue(bdev);
clone->rq_disk = bdev->bd_disk;
clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
} else {
/* blk-mq request-based interface */
*__clone = blk_get_request(bdev_get_queue(bdev),
rq_data_dir(rq), GFP_ATOMIC);
if (IS_ERR(*__clone)) {
/* ENOMEM, requeue */
clear_mapinfo(m, map_context);
return r;
}
(*__clone)->bio = (*__clone)->biotail = NULL;
(*__clone)->rq_disk = bdev->bd_disk;
(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
}
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
&pgpath->path,
nr_bytes);
return DM_MAPIO_REMAPPED;
out_unlock:
spin_unlock_irq(&m->lock);
return r;
}
- Decide whether a new path must be selected, obtain the currently usable path, and save the path information in the request's private data;
- redirect the request according to that path information, setting its destination queue;
- call the path-selection algorithm's start_io hook;
- on success return DM_MAPIO_REMAPPED, telling the DM framework that the mapping succeeded and the request should be re-dispatched.
I/O Completion
After DeviceMapper finishes a mapped request, multipath_end_io is called at I/O completion to reclaim resources: it first calls do_end_io to complete the I/O, then clears the context created at map time and calls the path selector's end_io hook.
static int multipath_end_io(struct dm_target *ti, struct request *clone,
int error, union map_info *map_context)
{
struct multipath *m = ti->private;
struct dm_mpath_io *mpio = map_context->ptr;
struct pgpath *pgpath;
struct path_selector *ps;
int r;
BUG_ON(!mpio);
r = do_end_io(m, clone, error, mpio);
pgpath = mpio->pgpath;
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
}
clear_mapinfo(m, map_context);
return r;
}
In do_end_io, if the I/O failed and pgpath is not NULL, fail_path() marks the path failed and notifies user space via a uevent. Once no valid paths remain: if queue_if_no_path is set, DM_ENDIO_REQUEUE is returned and DeviceMapper requeues the request; otherwise -EIO is returned.
static int do_end_io(struct multipath *m, struct request *clone,
int error, struct dm_mpath_io *mpio)
{
/*
* We don't queue any clone request inside the multipath target
* during end I/O handling, since those clone requests don't have
* bio clones. If we queue them inside the multipath target,
* we need to make bio clones, that requires memory allocation.
* (See drivers/md/dm.c:end_clone_bio() about why the clone requests
* don't have bio clones.)
* Instead of queueing the clone request here, we queue the original
* request into dm core, which will remake a clone request and
* clone bios for it and resubmit it later.
*/
int r = DM_ENDIO_REQUEUE;
unsigned long flags;
if (!error && !clone->errors)
return 0; /* I/O complete */
if (noretry_error(error))
return error;
if (mpio->pgpath)
fail_path(mpio->pgpath);
spin_lock_irqsave(&m->lock, flags);
if (!m->nr_valid_paths) {
if (!m->queue_if_no_path) {
if (!__must_push_back(m))
r = -EIO;
} else {
if (error == -EBADE)
r = error;
}
}
spin_unlock_irqrestore(&m->lock, flags);
return r;
}
static int fail_path(struct pgpath *pgpath)
{
unsigned long flags;
struct multipath *m = pgpath->pg->m;
spin_lock_irqsave(&m->lock, flags);
if (!pgpath->is_active)
goto out;
DMWARN("Failing path %s.", pgpath->path.dev->name);
pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
pgpath->is_active = 0;
pgpath->fail_count++;
m->nr_valid_paths--;
if (pgpath == m->current_pgpath)
m->current_pgpath = NULL;
dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
pgpath->path.dev->name, m->nr_valid_paths);
schedule_work(&m->trigger_event);
out:
spin_unlock_irqrestore(&m->lock, flags);
return 0;
}
Message Handling
static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
{
int r = -EINVAL;
struct dm_dev *dev;
struct multipath *m = (struct multipath *) ti->private;
action_fn action;
mutex_lock(&m->work_mutex);
if (dm_suspended(ti)) {
r = -EBUSY;
goto out;
}
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, 1, 0);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, 0, 0);
goto out;
}
}
if (argc != 2) {
DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
goto out;
}
if (!strcasecmp(argv[0], "disable_group")) {
r = bypass_pg_num(m, argv[1], 1);
goto out;
} else if (!strcasecmp(argv[0], "enable_group")) {
r = bypass_pg_num(m, argv[1], 0);
goto out;
} else if (!strcasecmp(argv[0], "switch_group")) {
r = switch_pg_num(m, argv[1]);
goto out;
} else if (!strcasecmp(argv[0], "reinstate_path"))
action = reinstate_path;
else if (!strcasecmp(argv[0], "fail_path"))
action = fail_path;
else {
DMWARN("Unrecognised multipath message received: %s", argv[0]);
goto out;
}
r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
if (r) {
DMWARN("message: error getting device %s",
argv[1]);
goto out;
}
r = action_dev(m, dev, action);
dm_put_device(ti, dev);
out:
mutex_unlock(&m->work_mutex);
return r;
}
Apart from reacting to I/O errors under the configured algorithm and settings, the multipath driver does not actively probe path state. Adding and removing paths and changing their state are driven by the user-space multipathd and multipath tools, which call the target type's message function through the interface the DeviceMapper framework provides. The function above implements switching priority groups, failing and reinstating paths, and related controls.
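These messages can also be sent by hand with dmsetup (syntax: dmsetup message <map> <sector> <message>; sector 0 addresses the whole map, and the path argument is resolved by dm_get_device, so a major:minor pair works). For the example map above:
$ dmsetup message iscsi 0 "fail_path 8:64"
$ dmsetup message iscsi 0 "reinstate_path 8:64"
$ dmsetup message iscsi 0 "queue_if_no_path"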
Path-Selection Algorithm Analysis
The path-selector management library
Implemented in dm-path-selector.h and dm-path-selector.c, it manages the path-selection algorithms on a doubly linked list and mainly provides registration, unregistration, lookup and release:
/* Register a path selector */
int dm_register_path_selector(struct path_selector_type *type);
/* Unregister a path selector */
int dm_unregister_path_selector(struct path_selector_type *type);
/* Returns a registered path selector type */
struct path_selector_type *dm_get_path_selector(const char *name);
/* Releases a path selector */
void dm_put_path_selector(struct path_selector_type *pst);
A path-selection algorithm fills in and registers the following structure (a minimal registration sketch follows the field descriptions below):
struct path_selector_type {
char *name;
struct module *module;
unsigned int table_args;
unsigned int info_args;
/*
* Constructs a path selector object, takes custom arguments
*/
int (*create) (struct path_selector *ps, unsigned argc, char **argv);
void (*destroy) (struct path_selector *ps);
/*
* Add an opaque path object, along with some selector specific
* path args (eg, path priority).
*/
int (*add_path) (struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error);
/*
* Chooses a path for this io, if no paths are available then
* NULL will be returned.
*
* repeat_count is the number of times to use the path before
* calling the function again. 0 means don't call it again unless
* the path fails.
*/
struct dm_path *(*select_path) (struct path_selector *ps,
unsigned *repeat_count,
size_t nr_bytes);
/*
* Notify the selector that a path has failed.
*/
void (*fail_path) (struct path_selector *ps, struct dm_path *p);
/*
* Ask selector to reinstate a path.
*/
int (*reinstate_path) (struct path_selector *ps, struct dm_path *p);
/*
* Table content based on parameters added in ps_add_path_fn
* or path selector status
*/
int (*status) (struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen);
int (*start_io) (struct path_selector *ps, struct dm_path *path,
size_t nr_bytes);
int (*end_io) (struct path_selector *ps, struct dm_path *path,
size_t nr_bytes);
};
- create - instantiate a selector;
- destroy - destroy a selector;
- add_path - add a path to the selector;
- select_path - choose the path for an I/O;
- fail_path - tell the selector that a path has failed;
- reinstate_path - tell the selector that a path is usable again;
- status - report the selector's status;
- start_io - callers must invoke this before issuing an I/O;
- end_io - callers must invoke this after an I/O completes.
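Putting the pieces together, a minimal custom selector could look like the sketch below: it simply always returns the first valid path. This is illustrative only and modeled on dm-round-robin.c; the name "first-path" and all fp_* identifiers are invented here, and the optional status/start_io/end_io hooks are omitted (dm-mpath checks for them before calling).
#include <linux/module.h>
#include <linux/slab.h>
#include "dm-path-selector.h"
/* Per-path bookkeeping for this selector. */
struct fp_path_info {
	struct list_head list;
	struct dm_path *path;
};
/* Selector context: usable paths up front, failed ones parked aside. */
struct fp_selector {
	struct list_head valid_paths;
	struct list_head failed_paths;
};
static int fp_create(struct path_selector *ps, unsigned argc, char **argv)
{
	struct fp_selector *s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	INIT_LIST_HEAD(&s->valid_paths);
	INIT_LIST_HEAD(&s->failed_paths);
	ps->context = s;
	return 0;
}
static void fp_destroy(struct path_selector *ps)
{
	struct fp_selector *s = ps->context;
	struct fp_path_info *pi, *next;
	/* Collect both lists and free every path-info entry. */
	list_splice_init(&s->failed_paths, &s->valid_paths);
	list_for_each_entry_safe(pi, next, &s->valid_paths, list)
		kfree(pi);
	kfree(s);
	ps->context = NULL;
}
static int fp_add_path(struct path_selector *ps, struct dm_path *path,
		       int argc, char **argv, char **error)
{
	struct fp_selector *s = ps->context;
	struct fp_path_info *pi = kmalloc(sizeof(*pi), GFP_KERNEL);
	if (!pi) {
		*error = "first-path ps: Error allocating path context";
		return -ENOMEM;
	}
	pi->path = path;
	path->pscontext = pi;
	list_add_tail(&pi->list, &s->valid_paths);
	return 0;
}
static void fp_fail_path(struct path_selector *ps, struct dm_path *p)
{
	struct fp_path_info *pi = p->pscontext;
	struct fp_selector *s = ps->context;
	list_move(&pi->list, &s->failed_paths);
}
static int fp_reinstate_path(struct path_selector *ps, struct dm_path *p)
{
	struct fp_path_info *pi = p->pscontext;
	struct fp_selector *s = ps->context;
	list_move_tail(&pi->list, &s->valid_paths);
	return 0;
}
/* Always pick the head of the valid-path list. */
static struct dm_path *fp_select_path(struct path_selector *ps,
				      unsigned *repeat_count, size_t nr_bytes)
{
	struct fp_selector *s = ps->context;
	struct fp_path_info *pi;
	if (list_empty(&s->valid_paths))
		return NULL;
	pi = list_first_entry(&s->valid_paths, struct fp_path_info, list);
	*repeat_count = 1;
	return pi->path;
}
static struct path_selector_type fp_ps = {
	.name = "first-path",
	.module = THIS_MODULE,
	.create = fp_create,
	.destroy = fp_destroy,
	.add_path = fp_add_path,
	.fail_path = fp_fail_path,
	.reinstate_path = fp_reinstate_path,
	.select_path = fp_select_path,
};
static int __init dm_fp_init(void)
{
	return dm_register_path_selector(&fp_ps);
}
static void __exit dm_fp_exit(void)
{
	dm_unregister_path_selector(&fp_ps);
}
module_init(dm_fp_init);
module_exit(dm_fp_exit);
MODULE_LICENSE("GPL");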
The round-robin path selector
Adding a path
Each path takes a single argument: the number of I/Os to perform on the path before switching to the next.
static int rr_add_path(struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error)
{
struct selector *s = (struct selector *) ps->context;
struct path_info *pi;
unsigned repeat_count = RR_MIN_IO;
char dummy;
if (argc > 1) {
*error = "round-robin ps: incorrect number of arguments";
return -EINVAL;
}
/* First path argument is number of I/Os before switching path */
if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
*error = "round-robin ps: invalid repeat count";
return -EINVAL;
}
/* allocate the path */
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
if (!pi) {
*error = "round-robin ps: Error allocating path context";
return -ENOMEM;
}
pi->path = path;
pi->repeat_count = repeat_count;
path->pscontext = pi;
list_add_tail(&pi->list, &s->valid_paths);
return 0;
}
Path-selection policy
After "repeat_count" I/Os on the current path it switches to the next one, cycling forever.
static struct dm_path *rr_select_path(struct path_selector *ps,
unsigned *repeat_count, size_t nr_bytes)
{
struct selector *s = (struct selector *) ps->context;
struct path_info *pi = NULL;
if (!list_empty(&s->valid_paths)) {
pi = list_entry(s->valid_paths.next, struct path_info, list);
list_move_tail(&pi->list, &s->valid_paths);
*repeat_count = pi->repeat_count;
}
return pi ? pi->path : NULL;
}
The service-time path selector
Adding a path
Each path may specify its repeat count repeat_count and its relative throughput weight relative_throughput.
static int st_add_path(struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error)
{
struct selector *s = ps->context;
struct path_info *pi;
unsigned repeat_count = ST_MIN_IO;
unsigned relative_throughput = 1;
char dummy;
/*
* Arguments: [<repeat_count> [<relative_throughput>]]
* <repeat_count>: The number of I/Os before switching path.
* If not given, default (ST_MIN_IO) is used.
* <relative_throughput>: The relative throughput value of
* the path among all paths in the path-group.
* The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
* If not given, minimum value '1' is used.
* If '0' is given, the path isn't selected while
* other paths having a positive value are
* available.
*/
if (argc > 2) {
*error = "service-time ps: incorrect number of arguments";
return -EINVAL;
}
if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
*error = "service-time ps: invalid repeat count";
return -EINVAL;
}
if ((argc == 2) &&
(sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
*error = "service-time ps: invalid relative_throughput value";
return -EINVAL;
}
/* allocate the path */
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
if (!pi) {
*error = "service-time ps: Error allocating path context";
return -ENOMEM;
}
pi->path = path;
pi->repeat_count = repeat_count;
pi->relative_throughput = relative_throughput;
atomic_set(&pi->in_flight_size, 0);
path->pscontext = pi;
list_add_tail(&pi->list, &s->valid_paths);
return 0;
}
Path-selection policy
When an I/O starts and ends, the path's in-flight byte count is increased and decreased respectively.
static int st_start_io(struct path_selector *ps, struct dm_path *path,
size_t nr_bytes)
{
struct path_info *pi = path->pscontext;
atomic_add(nr_bytes, &pi->in_flight_size);
return 0;
}
static int st_end_io(struct path_selector *ps, struct dm_path *path,
size_t nr_bytes)
{
struct path_info *pi = path->pscontext;
atomic_sub(nr_bytes, &pi->in_flight_size);
return 0;
}
Among all paths, it selects the one with the smallest estimated service time, i.e. (in-flight bytes + incoming bytes) / relative_throughput. For example, a path with 64 KiB in flight and weight 2 beats a path with 48 KiB in flight and weight 1, since 64/2 = 32 < 48.
/*
* Compare the estimated service time of 2 paths, pi1 and pi2,
* for the incoming I/O.
*
* Returns:
* < 0 : pi1 is better
* 0 : no difference between pi1 and pi2
* > 0 : pi2 is better
*
* Description:
* Basically, the service time is estimated by:
* ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
* To reduce the calculation, some optimizations are made.
* (See comments inline)
*/
static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
size_t incoming)
{
size_t sz1, sz2, st1, st2;
sz1 = atomic_read(&pi1->in_flight_size);
sz2 = atomic_read(&pi2->in_flight_size);
/*
* Case 1: Both have same throughput value. Choose less loaded path.
*/
if (pi1->relative_throughput == pi2->relative_throughput)
return sz1 - sz2;
/*
* Case 2a: Both have same load. Choose higher throughput path.
* Case 2b: One path has no throughput value. Choose the other one.
*/
if (sz1 == sz2 ||
!pi1->relative_throughput || !pi2->relative_throughput)
return pi2->relative_throughput - pi1->relative_throughput;
/*
* Case 3: Calculate service time. Choose faster path.
* Service time using pi1:
* st1 = (sz1 + incoming) / pi1->relative_throughput
* Service time using pi2:
* st2 = (sz2 + incoming) / pi2->relative_throughput
*
* To avoid the division, transform the expression to use
* multiplication.
* Because ->relative_throughput > 0 here, if st1 < st2,
* the expressions below are the same meaning:
* (sz1 + incoming) / pi1->relative_throughput <
* (sz2 + incoming) / pi2->relative_throughput
* (sz1 + incoming) * pi2->relative_throughput <
* (sz2 + incoming) * pi1->relative_throughput
* So use the later one.
*/
sz1 += incoming;
sz2 += incoming;
if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
sz2 >= ST_MAX_INFLIGHT_SIZE)) {
/*
* Size may be too big for multiplying pi->relative_throughput
* and overflow.
* To avoid the overflow and mis-selection, shift down both.
*/
sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
}
st1 = sz1 * pi2->relative_throughput;
st2 = sz2 * pi1->relative_throughput;
if (st1 != st2)
return st1 - st2;
/*
* Case 4: Service time is equal. Choose higher throughput path.
*/
return pi2->relative_throughput - pi1->relative_throughput;
}
static struct dm_path *st_select_path(struct path_selector *ps,
unsigned *repeat_count, size_t nr_bytes)
{
struct selector *s = ps->context;
struct path_info *pi = NULL, *best = NULL;
if (list_empty(&s->valid_paths))
return NULL;
/* Change preferred (first in list) path to evenly balance. */
list_move_tail(s->valid_paths.next, &s->valid_paths);
list_for_each_entry(pi, &s->valid_paths, list)
if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
best = pi;
if (!best)
return NULL;
*repeat_count = best->repeat_count;
return best->path;
}
The queue-length path selector
Adding a path
Only repeat_count needs to be given.
static int ql_add_path(struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error)
{
struct selector *s = ps->context;
struct path_info *pi;
unsigned repeat_count = QL_MIN_IO;
char dummy;
/*
* Arguments: [<repeat_count>]
* <repeat_count>: The number of I/Os before switching path.
* If not given, default (QL_MIN_IO) is used.
*/
if (argc > 1) {
*error = "queue-length ps: incorrect number of arguments";
return -EINVAL;
}
if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
*error = "queue-length ps: invalid repeat count";
return -EINVAL;
}
/* Allocate the path information structure */
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
if (!pi) {
*error = "queue-length ps: Error allocating path information";
return -ENOMEM;
}
pi->path = path;
pi->repeat_count = repeat_count;
atomic_set(&pi->qlen, 0);
path->pscontext = pi;
list_add_tail(&pi->list, &s->valid_paths);
return 0;
}
Path-selection policy
When an I/O starts and ends, the path's in-flight I/O count is incremented and decremented respectively.
static int ql_start_io(struct path_selector *ps, struct dm_path *path,
size_t nr_bytes)
{
struct path_info *pi = path->pscontext;
atomic_inc(&pi->qlen);
return 0;
}
static int ql_end_io(struct path_selector *ps, struct dm_path *path,
size_t nr_bytes)
{
struct path_info *pi = path->pscontext;
atomic_dec(&pi->qlen);
return 0;
}
It selects the path with the fewest in-flight I/Os.
static struct dm_path *ql_select_path(struct path_selector *ps,
unsigned *repeat_count, size_t nr_bytes)
{
struct selector *s = ps->context;
struct path_info *pi = NULL, *best = NULL;
if (list_empty(&s->valid_paths))
return NULL;
/* Change preferred (first in list) path to evenly balance. */
list_move_tail(s->valid_paths.next, &s->valid_paths);
list_for_each_entry(pi, &s->valid_paths, list) {
if (!best ||
(atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
best = pi;
if (!atomic_read(&best->qlen))
break;
}
if (!best)
return NULL;
*repeat_count = best->repeat_count;
return best->path;
}
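Which of these selectors a map uses is chosen from user space through the path_selector option in multipath.conf, where the trailing number is the count of selector arguments, e.g.:
defaults {
    path_selector "queue-length 0"
}
"round-robin 0" and "service-time 0" select the other two algorithms described above.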
User-Layer Analysis
In user space, multipath consists mainly of the shared libraries libmultipath and libmpathpersist, the checker plugins, the multipathd daemon, and the multipath tool. libmpathpersist and mpathpersist are the library and tool that handle SCSI PR (Persistent Reservation) commands, and kpartx creates DeviceMapper virtual devices. Here we focus on multipathd and multipath, bringing in libmultipath where needed.
- Path discovery - two mechanisms: at startup, block devices are found by scanning the sys filesystem and their properties are fetched via ioctl; at runtime, device information comes from the uevent messages the kernel emits when devices are added or removed. Only paths that survive blacklist filtering are added to the path list.
- Path checking - mainly periodic: at the configured interval, sector 0 is read to decide whether a path is healthy; in addition, the driver notifies user space on I/O errors.
- Path switching - load-balancing switches are made by the driver according to its algorithm, while failover and failback are detected in user space, which then instructs the driver. The switching behaviour is also configurable (it differs between storage arrays); a configuration sketch follows this list.
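The check interval, checker type and failback behaviour are all driven by multipath.conf; a minimal sketch using standard options:
defaults {
    # seconds between two path checks
    polling_interval 5
    # checker that reads sector 0 directly
    path_checker directio
    # fail back as soon as a higher-priority group recovers
    failback immediate
}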
multipathd
After parsing the command-line arguments, main() calls child() (multipathd/main.c), which creates four threads, each with its own task:
static int child (void * param)
{
pthread_t check_thr, uevent_thr, uxlsnr_thr, uevq_thr;
pthread_attr_t log_attr, misc_attr, uevent_attr;
struct vectors * vecs;
struct multipath * mpp;
int i;
int rc, pid_rc;
mlockall(MCL_CURRENT | MCL_FUTURE);
sem_init(&exit_sem, 0, 0);
signal_init();
udev = udev_new();
setup_thread_attr(&misc_attr, 64 * 1024, 1);
setup_thread_attr(&uevent_attr, 128 * 1024, 1);
setup_thread_attr(&waiter_attr, 32 * 1024, 1);
if (logsink) {
setup_thread_attr(&log_attr, 64 * 1024, 0);
log_thread_start(&log_attr);
pthread_attr_destroy(&log_attr);
}
running_state = DAEMON_START;
condlog(2, "--------start up--------");
condlog(2, "read " DEFAULT_CONFIGFILE);
if (load_config(DEFAULT_CONFIGFILE, udev))
exit(1);
if (init_checkers()) {
condlog(0, "failed to initialize checkers");
exit(1);
}
if (init_prio()) {
condlog(0, "failed to initialize prioritizers");
exit(1);
}
setlogmask(LOG_UPTO(conf->verbosity + 3));
if (conf->max_fds) {
struct rlimit fd_limit;
if (getrlimit(RLIMIT_NOFILE, &fd_limit) < 0) {
condlog(0, "can't get open fds limit: %s",
strerror(errno));
fd_limit.rlim_cur = 0;
fd_limit.rlim_max = 0;
}
if (fd_limit.rlim_cur < conf->max_fds) {
fd_limit.rlim_cur = conf->max_fds;
if (fd_limit.rlim_max < conf->max_fds)
fd_limit.rlim_max = conf->max_fds;
if (setrlimit(RLIMIT_NOFILE, &fd_limit) < 0) {
condlog(0, "can't set open fds limit to "
"%lu/%lu : %s",
fd_limit.rlim_cur, fd_limit.rlim_max,
strerror(errno));
} else {
condlog(3, "set open fds limit to %lu/%lu",
fd_limit.rlim_cur, fd_limit.rlim_max);
}
}
}
vecs = gvecs = init_vecs();
if (!vecs)
exit(1);
setscheduler();
set_oom_adj();
conf->daemon = 1;
udev_set_sync_support(0);
/*
* Start uevent listener early to catch events
*/
if ((rc = pthread_create(&uevent_thr, &uevent_attr, ueventloop, udev))) {
condlog(0, "failed to create uevent thread: %d", rc);
exit(1);
}
pthread_attr_destroy(&uevent_attr);
if ((rc = pthread_create(&uxlsnr_thr, &misc_attr, uxlsnrloop, vecs))) {
condlog(0, "failed to create cli listener: %d", rc);
exit(1);
}
/*
* fetch and configure both paths and multipaths
*/
running_state = DAEMON_CONFIGURE;
lock(vecs->lock);
if (configure(vecs, 1)) {
unlock(vecs->lock);
condlog(0, "failure during configuration");
exit(1);
}
unlock(vecs->lock);
/*
* start threads
*/
if ((rc = pthread_create(&check_thr, &misc_attr, checkerloop, vecs))) {
condlog(0,"failed to create checker loop thread: %d", rc);
exit(1);
}
if ((rc = pthread_create(&uevq_thr, &misc_attr, uevqloop, vecs))) {
condlog(0, "failed to create uevent dispatcher: %d", rc);
exit(1);
}
pthread_attr_destroy(&misc_attr);
/* Startup complete, create logfile */
pid_rc = pidfile_create(DEFAULT_PIDFILE, daemon_pid);
update_timestamp(1);
/* Ignore errors, we can live without */
running_state = DAEMON_RUNNING;
/*
* exit path
*/
while(sem_wait(&exit_sem) != 0); /* Do nothing */
running_state = DAEMON_SHUTDOWN;
lock(vecs->lock);
if (conf->queue_without_daemon == QUE_NO_DAEMON_OFF)
vector_foreach_slot(vecs->mpvec, mpp, i)
dm_queue_if_no_path(mpp->alias, 0);
remove_maps_and_stop_waiters(vecs);
unlock(vecs->lock);
pthread_cancel(check_thr);
pthread_cancel(uevent_thr);
pthread_cancel(uxlsnr_thr);
pthread_cancel(uevq_thr);
lock(vecs->lock);
free_pathvec(vecs->pathvec, FREE_PATHS);
vecs->pathvec = NULL;
unlock(vecs->lock);
/* Now all the waitevent threads will start rushing in. */
/* freeing vecs isn't worth the races
while (vecs->lock.depth > 0) {
sleep (1);
condlog(3, "Have %d wait event checkers threads to de-alloc,"
" waiting...", vecs->lock.depth);
}
pthread_mutex_destroy(vecs->lock.mutex);
FREE(vecs->lock.mutex);
vecs->lock.depth = 0;
vecs->lock.mutex = NULL;
FREE(vecs);
vecs = NULL;
*/
cleanup_checkers();
cleanup_prio();
dm_lib_release();
dm_lib_exit();
/* We're done here */
if (!pid_rc) {
condlog(3, "unlink pidfile");
unlink(DEFAULT_PIDFILE);
unlink(DEFAULT_TIMESTAMP_FILE);
}
condlog(2, "--------shut down-------");
if (logsink)
log_thread_stop();
/*
* Freeing config must be done after condlog() and dm_lib_exit(),
* because logging functions like dlog() and dm_write_log()
* reference the config.
*/
free_config(conf);
conf = NULL;
udev_unref(udev);
udev = NULL;
#ifdef _DEBUG_
dbg_free_final(NULL);
#endif
exit(0);
}
The uevent listener thread
static void *
ueventloop (void * ap)
{
if (uevent_listen(udev))
condlog(0, "error starting uevent listener");
return NULL;
}
int uevent_listen(struct udev *udev)
{
int err = 2;
struct udev_monitor *monitor = NULL;
int fd, socket_flags;
int need_failback = 1;
/*
* Queue uevents for service by dedicated thread so that the uevent
* listening thread does not block on multipathd locks (vecs->lock)
* thereby not getting to empty the socket's receive buffer queue
* often enough.
*/
if (!udev) {
condlog(1, "no udev context");
return 1;
}
udev_ref(udev);
pthread_cleanup_push(uevq_stop, udev);
monitor = udev_monitor_new_from_netlink(udev, "udev");
if (!monitor) {
condlog(2, "failed to create udev monitor");
goto out;
}
#ifdef LIBUDEV_API_RECVBUF
if (udev_monitor_set_receive_buffer_size(monitor, 128 * 1024 * 1024))
condlog(2, "failed to increase buffer size");
#endif
fd = udev_monitor_get_fd(monitor);
if (fd < 0) {
condlog(2, "failed to get monitor fd");
goto out;
}
socket_flags = fcntl(fd, F_GETFL);
if (socket_flags < 0) {
condlog(2, "failed to get monitor socket flags : %s",
strerror(errno));
goto out;
}
if (fcntl(fd, F_SETFL, socket_flags & ~O_NONBLOCK) < 0) {
condlog(2, "failed to set monitor socket flags : %s",
strerror(errno));
goto out;
}
err = udev_monitor_filter_add_match_subsystem_devtype(monitor, "block",
NULL);
if (err)
condlog(2, "failed to create filter : %s", strerror(-err));
err = udev_monitor_enable_receiving(monitor);
if (err) {
condlog(2, "failed to enable receiving : %s", strerror(-err));
goto out;
}
while (1) {
int i = 0;
char *pos, *end;
struct uevent *uev;
struct udev_device *dev;
struct udev_list_entry *list_entry;
dev = udev_monitor_receive_device(monitor);
if (!dev) {
condlog(0, "failed getting udev device");
continue;
}
uev = alloc_uevent();
if (!uev) {
udev_device_unref(dev);
condlog(1, "lost uevent, oom");
continue;
}
pos = uev->buffer;
end = pos + HOTPLUG_BUFFER_SIZE + OBJECT_SIZE - 1;
udev_list_entry_foreach(list_entry, udev_device_get_properties_list_entry(dev)) {
const char *name, *value;
int bytes;
name = udev_list_entry_get_name(list_entry);
if (!name)
name = "(null)";
value = udev_list_entry_get_value(list_entry);
if (!value)
value = "(null)";
bytes = snprintf(pos, end - pos, "%s=%s", name,
value);
if (pos + bytes >= end) {
condlog(2, "buffer overflow for uevent");
break;
}
uev->envp[i] = pos;
pos += bytes;
*pos = '\0';
pos++;
if (strcmp(name, "DEVPATH") == 0)
uev->devpath = uev->envp[i] + 8;
if (strcmp(name, "ACTION") == 0)
uev->action = uev->envp[i] + 7;
i++;
if (i == HOTPLUG_NUM_ENVP - 1)
break;
}
uev->udev = dev;
uev->envp[i] = NULL;
condlog(3, "uevent '%s' from '%s'", uev->action, uev->devpath);
uev->kernel = strrchr(uev->devpath, '/');
if (uev->kernel)
uev->kernel++;
/* print payload environment */
for (i = 0; uev->envp[i] != NULL; i++)
condlog(5, "%s", uev->envp[i]);
/*
* Queue uevent and poke service pthread.
*/
pthread_mutex_lock(uevq_lockp);
list_add_tail(&uev->node, &uevq);
pthread_cond_signal(uev_condp);
pthread_mutex_unlock(uevq_lockp);
}
need_failback = 0;
out:
if (monitor)
udev_monitor_unref(monitor);
if (need_failback)
err = failback_listen();
pthread_cleanup_pop(1);
return err;
}
It listens for udev messages, parses device add/remove events, and finally appends them to the service thread's queue.
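The same netlink traffic that multipathd consumes here can be watched from a shell, which helps when debugging path discovery (udevadm ships with udev/systemd):
$ udevadm monitor --udev --property --subsystem-match=block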
The user-interaction thread
static void *uxlsnrloop (void * ap)
{
if (cli_init())
return NULL;
set_handler_callback(LIST+PATHS, cli_list_paths);
set_handler_callback(LIST+PATHS+FMT, cli_list_paths_fmt);
set_handler_callback(LIST+MAPS, cli_list_maps);
set_handler_callback(LIST+STATUS, cli_list_status);
set_handler_callback(LIST+DAEMON, cli_list_daemon);
set_handler_callback(LIST+MAPS+STATUS, cli_list_maps_status);
set_handler_callback(LIST+MAPS+STATS, cli_list_maps_stats);
set_handler_callback(LIST+MAPS+FMT, cli_list_maps_fmt);
set_handler_callback(LIST+MAPS+TOPOLOGY, cli_list_maps_topology);
set_handler_callback(LIST+TOPOLOGY, cli_list_maps_topology);
set_handler_callback(LIST+MAP+TOPOLOGY, cli_list_map_topology);
set_handler_callback(LIST+CONFIG, cli_list_config);
set_handler_callback(LIST+BLACKLIST, cli_list_blacklist);
set_handler_callback(LIST+DEVICES, cli_list_devices);
set_handler_callback(LIST+WILDCARDS, cli_list_wildcards);
set_handler_callback(ADD+PATH, cli_add_path);
set_handler_callback(DEL+PATH, cli_del_path);
set_handler_callback(ADD+MAP, cli_add_map);
set_handler_callback(DEL+MAP, cli_del_map);
set_handler_callback(SWITCH+MAP+GROUP, cli_switch_group);
set_handler_callback(RECONFIGURE, cli_reconfigure);
set_handler_callback(SUSPEND+MAP, cli_suspend);
set_handler_callback(RESUME+MAP, cli_resume);
set_handler_callback(RESIZE+MAP, cli_resize);
set_handler_callback(RELOAD+MAP, cli_reload);
set_handler_callback(RESET+MAP, cli_reassign);
set_handler_callback(REINSTATE+PATH, cli_reinstate);
set_handler_callback(FAIL+PATH, cli_fail);
set_handler_callback(DISABLEQ+MAP, cli_disable_queueing);
set_handler_callback(RESTOREQ+MAP, cli_restore_queueing);
set_handler_callback(DISABLEQ+MAPS, cli_disable_all_queueing);
set_handler_callback(RESTOREQ+MAPS, cli_restore_all_queueing);
set_handler_callback(QUIT, cli_quit);
set_handler_callback(SHUTDOWN, cli_shutdown);
set_handler_callback(GETPRSTATUS+MAP, cli_getprstatus);
set_handler_callback(SETPRSTATUS+MAP, cli_setprstatus);
set_handler_callback(UNSETPRSTATUS+MAP, cli_unsetprstatus);
set_handler_callback(FORCEQ+DAEMON, cli_force_no_daemon_q);
set_handler_callback(RESTOREQ+DAEMON, cli_restore_no_daemon_q);
umask(077);
uxsock_listen(&uxsock_trigger, ap);
return NULL;
}
When multipathd is started with the -k option, you can interact with it directly from a console.
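A short interactive session might look like this (a sketch; sde is the example path from earlier, output trimmed):
$ multipathd -k
multipathd> show paths
...
multipathd> fail path sde
ok
multipathd> reinstate path sde
ok
multipathd> quit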
The path-checker thread
static void *checkerloop (void *ap)
{
struct vectors *vecs;
struct path *pp;
int count = 0;
unsigned int i;
mlockall(MCL_CURRENT | MCL_FUTURE);
vecs = (struct vectors *)ap;
condlog(2, "path checkers start up");
/*
* init the path check interval
*/
vector_foreach_slot (vecs->pathvec, pp, i) {
pp->checkint = conf->checkint;
}
while (1) {
pthread_cleanup_push(cleanup_lock, &vecs->lock);
lock(vecs->lock);
pthread_testcancel();
condlog(4, "tick");
if (vecs->pathvec) {
vector_foreach_slot (vecs->pathvec, pp, i) {
check_path(vecs, pp);
}
}
if (vecs->mpvec) {
defered_failback_tick(vecs->mpvec);
retry_count_tick(vecs->mpvec);
missing_uev_message_tick(vecs->mpvec);
}
if (count)
count--;
else {
condlog(4, "map garbage collection");
mpvec_garbage_collector(vecs);
count = MAPGCINT;
}
lock_cleanup_pop(vecs->lock);
sleep(1);
}
return NULL;
}
It iterates over the path list and calls check_path() for each path: the configured checker method tests the path, the path's state is updated according to the result, and the kernel is notified.
void check_path (struct vectors * vecs, struct path * pp)
{
int newstate;
int new_path_up = 0;
int chkr_new_path_up = 0;
int oldchkrstate = pp->chkrstate;
if (!pp->mpp && (pp->missing_udev_info != INFO_MISSING ||
pp->retriggers >= conf->retrigger_tries))
return;
if (pp->tick && --pp->tick)
return; /* don't check this path yet */
if (!pp->mpp) {
pp->missing_udev_info = INFO_REQUESTED;
pp->retriggers++;
sysfs_attr_set_value(pp->udev, "uevent", "change",
strlen("change"));
return;
}
/*
* provision a next check soonest,
* in case we exit abnormaly from here
*/
pp->tick = conf->checkint;
newstate = path_offline(pp);
if (newstate == PATH_UP)
newstate = get_state(pp, 1);
else
checker_clear_message(&pp->checker);
if (newstate == PATH_WILD || newstate == PATH_UNCHECKED) {
condlog(2, "%s: unusable path", pp->dev);
pathinfo(pp, conf->hwtable, 0);
return;
}
/*
* Async IO in flight. Keep the previous path state
* and reschedule as soon as possible
*/
if (newstate == PATH_PENDING) {
pp->tick = 1;
return;
}
/*
* Synchronize with kernel state
*/
if (update_multipath_strings(pp->mpp, vecs->pathvec)) {
condlog(1, "%s: Could not synchronize with kernel state",
pp->dev);
pp->dmstate = PSTATE_UNDEF;
}
/* if update_multipath_strings orphaned the path, quit early */
if (!pp->mpp)
return;
if ((newstate == PATH_UP || newstate == PATH_GHOST) &&
pp->wait_checks > 0) {
if (pp->mpp && pp->mpp->nr_active > 0) {
pp->state = PATH_DELAYED;
pp->wait_checks--;
return;
} else
pp->wait_checks = 0;
}
pp->chkrstate = newstate;
if (newstate != pp->state) {
int oldstate = pp->state;
pp->state = newstate;
LOG_MSG(1, checker_message(&pp->checker));
/*
* upon state change, reset the checkint
* to the shortest delay
*/
pp->checkint = conf->checkint;
if (newstate == PATH_DOWN || newstate == PATH_SHAKY) {
/*
* proactively fail path in the DM
*/
if (oldstate == PATH_UP ||
oldstate == PATH_GHOST) {
fail_path(pp, 1);
if (pp->mpp->delay_wait_checks > 0 &&
pp->watch_checks > 0) {
pp->wait_checks = pp->mpp->delay_wait_checks;
pp->watch_checks = 0;
}
}else
fail_path(pp, 0);
/*
* cancel scheduled failback
*/
pp->mpp->failback_tick = 0;
pp->mpp->stat_path_failures++;
return;
}
if(newstate == PATH_UP || newstate == PATH_GHOST){
/*
* Reinitialize the prioritizer, in case something
* changed.
*/
prio_init(&pp->prio);
if ( pp->mpp && pp->mpp->prflag ){
/*
* Check Persistent Reservation.
*/
condlog(2, "%s: checking persistent reservation "
"registration", pp->dev);
mpath_pr_event_handle(pp);
}
}
/*
* reinstate this path
*/
if (oldstate != PATH_UP &&
oldstate != PATH_GHOST) {
if (pp->mpp->delay_watch_checks > 0)
pp->watch_checks = pp->mpp->delay_watch_checks;
reinstate_path(pp, 1);
} else {
if (pp->watch_checks > 0)
pp->watch_checks--;
reinstate_path(pp, 0);
}
new_path_up = 1;
if (oldchkrstate != PATH_UP && oldchkrstate != PATH_GHOST)
chkr_new_path_up = 1;
/*
* if at least one path is up in a group, and
* the group is disabled, re-enable it
*/
if (newstate == PATH_UP)
enable_group(pp);
}
else if (newstate == PATH_UP || newstate == PATH_GHOST) {
if (pp->dmstate == PSTATE_FAILED ||
pp->dmstate == PSTATE_UNDEF) {
/* Clear IO errors */
reinstate_path(pp, 0);
} else {
LOG_MSG(4, checker_message(&pp->checker));
if (pp->checkint != conf->max_checkint) {
/*
* double the next check delay.
* max at conf->max_checkint
*/
if (pp->checkint < (conf->max_checkint / 2))
pp->checkint = 2 * pp->checkint;
else
pp->checkint = conf->max_checkint;
}
if (pp->watch_checks > 0)
pp->watch_checks--;
pp->tick = pp->checkint;
condlog(4, "%s: delay next check %is",
pp->dev_t, pp->tick);
}
}
else if (newstate == PATH_DOWN) {
if (conf->log_checker_err == LOG_CHKR_ERR_ONCE)
LOG_MSG(3, checker_message(&pp->checker));
else
LOG_MSG(2, checker_message(&pp->checker));
}
pp->state = newstate;
if (pp->mpp->wait_for_udev)
return;
/*
* path prio refreshing
*/
condlog(4, "path prio refresh");
if (update_prio(pp, new_path_up) &&
(pp->mpp->pgpolicyfn == (pgpolicyfn *)group_by_prio) &&
pp->mpp->pgfailback == -FAILBACK_IMMEDIATE)
update_path_groups(pp->mpp, vecs, !new_path_up);
else if (need_switch_pathgroup(pp->mpp, 0)) {
if (pp->mpp->pgfailback > 0 &&
(new_path_up || pp->mpp->failback_tick <= 0))
pp->mpp->failback_tick =
pp->mpp->pgfailback + 1;
else if (pp->mpp->pgfailback == -FAILBACK_IMMEDIATE ||
(chkr_new_path_up && followover_should_failback(pp)))
switch_pathgroup(pp->mpp);
}
}
The service thread
static void *uevqloop (void * ap)
{
if (uevent_dispatch(&uev_trigger, ap))
condlog(0, "error starting uevent dispatcher");
return NULL;
}
/*
* Service the uevent queue.
*/
int uevent_dispatch(int (*uev_trigger)(struct uevent *, void * trigger_data),
void * trigger_data)
{
my_uev_trigger = uev_trigger;
my_trigger_data = trigger_data;
mlockall(MCL_CURRENT | MCL_FUTURE);
while (1) {
LIST_HEAD(uevq_tmp);
pthread_mutex_lock(uevq_lockp);
servicing_uev = 0;
/*
* Condition signals are unreliable,
* so make sure we only wait if we have to.
*/
if (list_empty(&uevq)) {
pthread_cond_wait(uev_condp, uevq_lockp);
}
servicing_uev = 1;
list_splice_init(&uevq, &uevq_tmp);
pthread_mutex_unlock(uevq_lockp);
if (!my_uev_trigger)
break;
service_uevq(&uevq_tmp);
}
condlog(3, "Terminating uev service queue");
uevq_cleanup(&uevq);
return 0;
}
/*
* Called with uevq_lockp held
*/
void
service_uevq(struct list_head *tmpq)
{
struct uevent *uev, *tmp;
list_for_each_entry_safe(uev, tmp, tmpq, node) {
list_del_init(&uev->node);
if (my_uev_trigger && my_uev_trigger(uev, my_trigger_data))
condlog(0, "uevent trigger error");
if (uev->udev)
udev_device_unref(uev->udev);
FREE(uev);
}
}
multipath
Listing paths (multipath -l/-ll)
static int get_dm_mpvec (vector curmp, vector pathvec, char * refwwid)
{
int i;
struct multipath * mpp;
char params[PARAMS_SIZE], status[PARAMS_SIZE];
if (dm_get_maps(curmp))
return 1;
vector_foreach_slot (curmp, mpp, i) {
/*
* discard out of scope maps
*/
if (mpp->wwid && refwwid &&
strncmp(mpp->wwid, refwwid, WWID_SIZE)) {
condlog(3, "skip map %s: out of scope", mpp->alias);
free_multipath(mpp, KEEP_PATHS);
vector_del_slot(curmp, i);
i--;
continue;
}
if (conf->cmd == CMD_VALID_PATH)
continue;
dm_get_map(mpp->alias, &mpp->size, params);
condlog(3, "params = %s", params);
dm_get_status(mpp->alias, status);
condlog(3, "status = %s", status);
disassemble_map(pathvec, params, mpp);
/*
* disassemble_map() can add new paths to pathvec.
* If not in "fast list mode", we need to fetch information
* about them
*/
if (conf->cmd != CMD_LIST_SHORT)
update_paths(mpp);
if (conf->cmd == CMD_LIST_LONG)
mpp->bestpg = select_path_group(mpp);
disassemble_status(status, mpp);
if (conf->cmd == CMD_LIST_SHORT ||
conf->cmd == CMD_LIST_LONG)
print_multipath_topology(mpp, conf->verbosity);
if (conf->cmd == CMD_CREATE)
reinstate_paths(mpp);
}
return 0;
}
int dm_get_maps (vector mp)
{
struct multipath * mpp;
int r = 1;
int info;
struct dm_task *dmt;
struct dm_names *names;
unsigned next = 0;
if (!mp)
return 1;
if (!(dmt = dm_task_create(DM_DEVICE_LIST)))
return 1;
dm_task_no_open_count(dmt);
if (!dm_task_run(dmt))
goto out;
if (!(names = dm_task_get_names(dmt)))
goto out;
if (!names->dev) {
r = 0; /* this is perfectly valid */
goto out;
}
do {
info = dm_type(names->name, TGT_MPATH);
if (info <= 0)
goto next;
mpp = alloc_multipath();
if (!mpp)
goto out;
mpp->alias = STRDUP(names->name);
if (!mpp->alias)
goto out1;
if (info > 0) {
if (dm_get_map(names->name, &mpp->size, NULL))
goto out1;
dm_get_uuid(names->name, mpp->wwid);
dm_get_info(names->name, &mpp->dmi);
}
if (!vector_alloc_slot(mp))
goto out1;
vector_set_slot(mp, mpp);
mpp = NULL;
next:
next = names->next;
names = (void *) names + next;
} while (next);
r = 0;
goto out;
out1:
free_multipath(mpp, KEEP_PATHS);
out:
dm_task_destroy (dmt);
return r;
}
extern void path_group_prio_update (struct pathgroup * pgp)
{
int i;
int priority = 0;
struct path * pp;
pgp->enabled_paths = 0;
if (!pgp->paths) {
pgp->priority = 0;
return;
}
vector_foreach_slot (pgp->paths, pp, i) {
if (pp->state == PATH_UP ||
pp->state == PATH_GHOST) {
priority += pp->priority;
pgp->enabled_paths++;
}
}
if (pgp->enabled_paths)
pgp->priority = priority / pgp->enabled_paths;
else
pgp->priority = 0;
}
extern int select_path_group (struct multipath * mpp)
{
int i;
int max_priority = 0;
int bestpg = 1;
int max_enabled_paths = 1;
struct pathgroup * pgp;
if (!mpp->pg)
return 1;
vector_foreach_slot (mpp->pg, pgp, i) {
if (!pgp->paths)
continue;
path_group_prio_update(pgp);
if (pgp->enabled_paths) {
if (pgp->priority > max_priority) {
max_priority = pgp->priority;
max_enabled_paths = pgp->enabled_paths;
bestpg = i + 1;
} else if (pgp->priority == max_priority) {
if (pgp->enabled_paths > max_enabled_paths) {
max_enabled_paths = pgp->enabled_paths;
bestpg = i + 1;
}
}
}
}
return bestpg;
}
Flushing all maps (multipath -F)
extern int dm_flush_maps (void)
{
int r = 0;
struct dm_task *dmt;
struct dm_names *names;
unsigned next = 0;
if (!(dmt = dm_task_create (DM_DEVICE_LIST)))
return 0;
dm_task_no_open_count(dmt);
if (!dm_task_run (dmt))
goto out;
if (!(names = dm_task_get_names (dmt)))
goto out;
if (!names->dev)
goto out;
do {
r |= dm_suspend_and_flush_map(names->name);
next = names->next;
names = (void *) names + next;
} while (next);
out:
dm_task_destroy (dmt);
return r;
}
extern int dm_suspend_and_flush_map (const char * mapname)
{
int s = 0, queue_if_no_path = 0;
unsigned long long mapsize;
char params[PARAMS_SIZE] = {0};
if (!dm_map_present(mapname))
return 0;
if (dm_type(mapname, TGT_MPATH) <= 0)
return 0; /* nothing to do */
if (!dm_get_map(mapname, &mapsize, params)) {
if (strstr(params, "queue_if_no_path"))
queue_if_no_path = 1;
}
if (queue_if_no_path)
s = dm_queue_if_no_path((char *)mapname, 0);
/* Leave queue_if_no_path alone if unset failed */
if (s)
queue_if_no_path = 0;
else
s = dm_simplecmd_flush(DM_DEVICE_SUSPEND, mapname, 0, 0);
if (!dm_flush_map(mapname)) {
condlog(4, "multipath map %s removed", mapname);
return 0;
}
condlog(2, "failed to remove multipath map %s", mapname);
dm_simplecmd_noflush(DM_DEVICE_RESUME, mapname, 0);
if (queue_if_no_path)
s = dm_queue_if_no_path((char *)mapname, 1);
return 1;
}
It enumerates the names of all maps from the DeviceMapper device list and removes the virtual multipath devices using libdevmapper functions.
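On the command line this is exposed through the standard multipath flags:
$ multipath -f iscsi
$ multipath -F
where -f flushes a single map by alias and -F flushes all multipath maps.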