Overview

The test environment is CentOS 7 x86_64.

Obtaining the source from the RPM

$ cd ~/rpmbuild/
$ yumdownloader --source device-mapper-multipath
$ rpm -ivh device-mapper-multipath-0.4.9-85.el7_2.6.src.rpm
$ rpmbuild -ba SPECS/device-mapper-multipath.spec

$ ll RPMS/x86_64/ | awk '{print $9}'
device-mapper-multipath-0.4.9-85.el7.centos.6.x86_64.rpm
device-mapper-multipath-debuginfo-0.4.9-85.el7.centos.6.x86_64.rpm
device-mapper-multipath-libs-0.4.9-85.el7.centos.6.x86_64.rpm
device-mapper-multipath-sysvinit-0.4.9-85.el7.centos.6.x86_64.rpm
kpartx-0.4.9-85.el7.centos.6.x86_64.rpm

Obtaining the source from upstream or GitHub

$ mkdir -pv /opt/multipath
$ cd /opt/multipath
$ git clone http://git.opensvc.com/multipath-tools/.git src
# or
$ git clone https://github.com/cathay4t/multipath-tools.git src

Building the RPM source independently

$ rpmbuild -bp SPECS/device-mapper-multipath.spec
$ mkdir -pv /opt/multipath
$ cp -rv BUILD/multipath-tools-130222/ /opt/multipath/

$ cd /opt/multipath/multipath-tools-130222/
$ make LIB=lib64
$ make install DESTDIR=/opt/multipath/target \
bindir=/usr/sbin \
syslibdir=/usr/lib64 \
libdir=/usr/lib64/multipath \
rcdir=/etc/rc.d/init.d \
unitdir=/usr/lib/systemd/system

Inspecting the installed files

$ cd /opt/multipath/target/
$ tree
.
├── etc
│   └── rc.d
│       └── init.d
│           └── multipathd
└── usr
    ├── lib
    │   ├── systemd
    │   │   └── system
    │   │       └── multipathd.service
    │   └── udev
    │       └── rules.d
    │           ├── 11-dm-mpath.rules
    │           └── 62-multipath.rules
    ├── lib64
    │   ├── libmpathpersist.so -> libmpathpersist.so.0
    │   ├── libmpathpersist.so.0
    │   ├── libmultipath.so -> libmultipath.so.0
    │   ├── libmultipath.so.0
    │   └── multipath
    │       ├── libcheckcciss_tur.so
    │       ├── libcheckdirectio.so
    │       ├── libcheckemc_clariion.so
    │       ├── libcheckhp_sw.so
    │       ├── libcheckhp_tur.so
    │       ├── libcheckrdac.so
    │       ├── libcheckreadsector0.so
    │       ├── libchecktur.so
    │       ├── libprioalua.so
    │       ├── libprioconst.so
    │       ├── libpriodatacore.so
    │       ├── libprioemc.so
    │       ├── libpriohds.so
    │       ├── libpriohp_sw.so
    │       ├── libprioiet.so
    │       ├── libprioontap.so
    │       ├── libpriorandom.so
    │       ├── libpriordac.so
    │       └── libprioweightedpath.so
    ├── sbin
    │   ├── kpartx
    │   ├── mpathconf
    │   ├── mpathpersist
    │   ├── multipath
    │   └── multipathd
    └── share
        └── man
            ├── man3
            │   ├── mpath_persistent_reserve_in.3.gz
            │   └── mpath_persistent_reserve_out.3.gz
            ├── man5
            │   └── multipath.conf.5.gz
            └── man8
                ├── kpartx.8.gz
                ├── mpathconf.8.gz
                ├── mpathpersist.8.gz
                ├── multipath.8.gz
                └── multipathd.8.gz

17 directories, 40 files

The main files are:

SysV and systemd startup scripts

/etc/rc.d/init.d/multipathd
/usr/lib/systemd/system/multipathd.service

udev naming rules

/usr/lib/udev/rules.d/11-dm-mpath.rules
/usr/lib/udev/rules.d/62-multipath.rules

User-space tools

/usr/sbin/multipathd		the daemon; monitors path state changes in the system and reacts accordingly.
/usr/sbin/mpathpersist		SCSI PR (persistent reservation) tool, mainly used for fencing.
/usr/sbin/mpathconf			modifies the multipath configuration.
/usr/sbin/kpartx			creates DeviceMapper virtual devices.
/usr/sbin/multipath			the multipath command-line tool.

User-space libraries

/usr/lib64/libmpathpersist.so.0
/usr/lib64/libmultipath.so
/usr/lib64/libmpathpersist.so
/usr/lib64/libmultipath.so.0

Man pages

/usr/share/man/*

Plugins

/usr/lib64/multipath/*

A worked example

Here we have an iSCSI device named iqn.2016-10.org.lr:storage with two paths, both active. Below, a few simple commands are used to inspect their state and briefly analyze how the pieces relate, to build a first impression.

Check the multipath configuration

$ cat /etc/multipath.conf
blacklist {
  devnode "^(ram|raw|loop|fd|md|dm-|sr|scd|st)[0-9]*"
  devnode "^sd[a-b][0-9]*"
}

defaults {
  user_friendly_names yes
  path_grouping_policy multibus
  failback immediate
  no_path_retry fail
}

multipaths {
  multipath {
    wwid 360000000000000000e00000000010001
    alias iscsi
  }
}

Besides some basic settings, this configuration names the SCSI device whose WWID is 360000000000000000e00000000010001 as iscsi.

Check the DeviceMapper mapping table

$ dmsetup table
iscsi: 0 209715200 multipath 0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1

As shown, every sector of the multipath device iscsi is mapped through the multipath target. The parameters passed to the driver are “0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1”, and the two underlying path devices have device numbers 8:64 and 8:80.
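
The same table can also be loaded by hand with dmsetup — a sketch, assuming the two path devices already exist and multipathd is stopped so it does not interfere:

$ dmsetup create iscsi --table "0 209715200 multipath 0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1"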

Check the block device paths and their device numbers

$ ll /dev/sd*
brw-rw---- 1 root disk 8,  0 Nov 10 14:24 /dev/sda
brw-rw---- 1 root disk 8,  1 Nov 10 14:24 /dev/sda1
brw-rw---- 1 root disk 8,  2 Nov 10 14:24 /dev/sda2
brw-rw---- 1 root disk 8,  3 Nov 10 14:24 /dev/sda3
brw-rw---- 1 root disk 8, 16 Nov 10 14:24 /dev/sdb
brw-rw---- 1 root disk 8, 17 Nov 10 14:24 /dev/sdb1
brw-rw---- 1 root disk 8, 32 Nov 10 14:24 /dev/sdc
brw-rw---- 1 root disk 8, 33 Nov 10 14:24 /dev/sdc1
brw-rw---- 1 root disk 8, 48 Nov 10 14:25 /dev/sdd
brw-rw---- 1 root disk 8, 49 Nov 30 11:49 /dev/sdd1
brw-rw---- 1 root disk 8, 50 Nov 10 14:25 /dev/sdd2
brw-rw---- 1 root disk 8, 64 Nov 10 14:24 /dev/sde
brw-rw---- 1 root disk 8, 80 Nov 17 15:04 /dev/sdf

The two device numbers above correspond to the paths /dev/sde and /dev/sdf.

Check the mapped multipath device

$ ll /dev/mapper/
crw------- 1 root root 10, 236 Nov  8 17:03 control
lrwxrwxrwx 1 root root       7 Nov 30 10:47 iscsi -> ../dm-0

The mapped multipath device /dev/mapper/iscsi is actually a symlink to /dev/dm-0 (LVM devices work the same way).

Check the DeviceMapper device created by multipath

$ ll /dev/dm-*
brw-rw---- 1 root disk 253, 0 Nov 30 10:47 /dev/dm-0

The DeviceMapper devices created by multipath share the device-mapper major number, which is allocated dynamically; on this system it is 253.
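
The dynamically assigned major can be confirmed from /proc/devices:

$ grep device-mapper /proc/devices
253 device-mapper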

Check all block devices

$ ll /dev/block/
lrwxrwxrwx 1 root root 6 Nov  8 17:03 11:0 -> ../sr0
lrwxrwxrwx 1 root root 7 Nov 30 10:47 253:0 -> ../dm-0
lrwxrwxrwx 1 root root 6 Nov 10 14:24 8:0 -> ../sda
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:1 -> ../sda1
lrwxrwxrwx 1 root root 6 Nov 10 14:24 8:16 -> ../sdb
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:17 -> ../sdb1
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:2 -> ../sda2
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:3 -> ../sda3
lrwxrwxrwx 1 root root 6 Nov 10 14:24 8:32 -> ../sdc
lrwxrwxrwx 1 root root 7 Nov 10 14:24 8:33 -> ../sdc1
lrwxrwxrwx 1 root root 6 Nov 10 14:25 8:48 -> ../sdd
lrwxrwxrwx 1 root root 7 Nov 30 11:49 8:49 -> ../sdd1
lrwxrwxrwx 1 root root 7 Nov 10 14:25 8:50 -> ../sdd2
lrwxrwxrwx 1 root root 6 Nov 10 14:24 8:64 -> ../sde
lrwxrwxrwx 1 root root 6 Nov 17 15:04 8:80 -> ../sdf

Check the devices' WWIDs

$ /lib/udev/scsi_id -g -u /dev/sde
360000000000000000e00000000010001
$ /lib/udev/scsi_id -g -u /dev/sdf
360000000000000000e00000000010001
$ /lib/udev/scsi_id -g -u /dev/mapper/iscsi 
360000000000000000e00000000010001

The two paths to the same device, and the newly mapped device, all report the same WWID.

Check the current multipath information

$ multipath -ll
iscsi (360000000000000000e00000000010001) dm-0 IET     ,VIRTUAL-DISK    
size=100G features='0' hwhandler='0' wp=rw
`-+- policy='service-time 0' prio=1 status=active
  |- 3:0:0:1 sde 8:64 active ready running
  `- 4:0:0:1 sdf 8:80 active ready running

Summary

  1. When the initiator connects to the device named iqn.2016-10.org.lr:storage (WWID 360000000000000000e00000000010001) over two paths, two block device files are created, /dev/sde and /dev/sdf, with device numbers 8:64 and 8:80;
  2. From the information of these two iSCSI block devices, the multipath software determines that they are multiple paths to one and the same device and, per the configuration, creates a virtual block device through libdevmapper functions;
  3. With the parameters supplied by multipathd or multipath, libdevmapper creates the DeviceMapper mapping table; the multipath target driver is loaded, the virtual disk /dev/dm-0 with device number 253:0 is created, and the symlink /dev/mapper/iscsi pointing to it is set up (see the lsblk view below).
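
The resulting relationship is also visible at a glance with lsblk — a sketch, output approximate:

$ lsblk /dev/sde /dev/sdf
NAME    MAJ:MIN RM  SIZE RO TYPE  MOUNTPOINT
sde       8:64   0  100G  0 disk
└─iscsi 253:0    0  100G  0 mpath
sdf       8:80   0  100G  0 disk
└─iscsi 253:0    0  100G  0 mpath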

Driver-Layer Analysis

Obtaining the source

The multipath driver source lives under drivers/md in the kernel tree. The basic steps to fetch the kernel source are:

$ yum install rpm-build rpmdevtools
$ rpmdev-setuptree
$ yumdownloader --source kernel
$ rpm -ivh kernel-*.src.rpm
$ rpmbuild -bp ~/rpmbuild/SPECS/kernel.spec
$ ls ~/rpmbuild/BUILD/kernel-*/linux-*/

Build configuration options

Check the Kconfig:
$ vi drivers/md/Kconfig
...
config MD_MULTIPATH
    tristate "Multipath I/O support"
    depends on BLK_DEV_MD
    help
      MD_MULTIPATH provides a simple multi-path personality for use
      the MD framework.  It is not under active development.  New
      projects should consider using DM_MULTIPATH which has more
      features and more testing.

      If unsure, say N.

config DM_MULTIPATH
	tristate "Multipath target"
	depends on BLK_DEV_DM
	# nasty syntax but means make DM_MULTIPATH independent
	# of SCSI_DH if the latter isn't defined but if
	# it is, DM_MULTIPATH must depend on it.  We get a build
	# error if SCSI_DH=m and DM_MULTIPATH=y
	depends on SCSI_DH || !SCSI_DH
	---help---
	  Allow volume managers to support multipath hardware.

config DM_MULTIPATH_QL
	tristate "I/O Path Selector based on the number of in-flight I/Os"
	depends on DM_MULTIPATH
	---help---
	  This path selector is a dynamic load balancer which selects
	  the path with the least number of in-flight I/Os.

	  If unsure, say N.

config DM_MULTIPATH_ST
	tristate "I/O Path Selector based on the service time"
	depends on DM_MULTIPATH
	---help---
	  This path selector is a dynamic load balancer which selects
	  the path expected to complete the incoming I/O in the shortest
	  time.

	  If unsure, say N.
...

MD_MULTIPATH is the old driver; it is deprecated and not recommended. DM_MULTIPATH is the current driver, and DM_MULTIPATH_QL and DM_MULTIPATH_ST are two optional path-selection algorithms (round-robin is always built along with DM_MULTIPATH).
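
On a stock CentOS 7 kernel these options can be checked without the source tree:

$ grep DM_MULTIPATH /boot/config-$(uname -r)
CONFIG_DM_MULTIPATH=m
CONFIG_DM_MULTIPATH_QL=m
CONFIG_DM_MULTIPATH_ST=m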

Check the Makefile:
$ vi drivers/md/Makefile
...
obj-$(CONFIG_MD_MULTIPATH)  += multipath.o

dm-multipath-y  += dm-path-selector.o dm-mpath.o

obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o

obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
...

The object files involved are:

drivers/md/dm-mpath.o
drivers/md/dm-path-selector.o
drivers/md/dm-round-robin.o
drivers/md/dm-queue-length.o
drivers/md/dm-service-time.o

With the new multipath driver enabled, dm-path-selector.c, dm-mpath.c and dm-round-robin.c are compiled. dm-mpath.c is the core of the multipath driver: it initializes the data structures and registers the multipath Target Type with DeviceMapper. dm-path-selector.c is the library that manages the path-selection algorithms, and dm-round-robin.c is the mandatory selector: after completing the configured number of IOs on one path it switches to the next, cycling endlessly. The two optional selectors are dm-service-time.c, which picks the less loaded path based on relative throughput and outstanding bytes, and dm-queue-length.c, which picks the path with the fewest in-flight IOs.

Analysis of dm-mpath.c

Initialization

static struct target_type multipath_target = {
	.name = "multipath",
	.version = {1, 9, 0},
	.module = THIS_MODULE,
	.ctr = multipath_ctr,
	.dtr = multipath_dtr,
	.map_rq = multipath_map,
	.clone_and_map_rq = multipath_clone_and_map,
	.release_clone_rq = multipath_release_clone,
	.rq_end_io = multipath_end_io,
	.presuspend = multipath_presuspend,
	.postsuspend = multipath_postsuspend,
	.resume = multipath_resume,
	.status = multipath_status,
	.message = multipath_message,
	.ioctl  = multipath_ioctl,
	.iterate_devices = multipath_iterate_devices,
	.busy = multipath_busy,
};

static int __init dm_multipath_init(void)
{
	int r;

	/* allocate a slab for the dm_ios */
	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
	if (!_mpio_cache)
		return -ENOMEM;

	r = dm_register_target(&multipath_target);
	if (r < 0) {
		DMERR("register failed %d", r);
		r = -EINVAL;
		goto bad_register_target;
	}

	kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
	if (!kmultipathd) {
		DMERR("failed to create workqueue kmpathd");
		r = -ENOMEM;
		goto bad_alloc_kmultipathd;
	}

	/*
	 * A separate workqueue is used to handle the device handlers
	 * to avoid overloading existing workqueue. Overloading the
	 * old workqueue would also create a bottleneck in the
	 * path of the storage hardware device activation.
	 */
	kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
						  WQ_MEM_RECLAIM);
	if (!kmpath_handlerd) {
		DMERR("failed to create workqueue kmpath_handlerd");
		r = -ENOMEM;
		goto bad_alloc_kmpath_handlerd;
	}

	DMINFO("version %u.%u.%u loaded",
	       multipath_target.version[0], multipath_target.version[1],
	       multipath_target.version[2]);

	return 0;

bad_alloc_kmpath_handlerd:
	destroy_workqueue(kmultipathd);
bad_alloc_kmultipathd:
	dm_unregister_target(&multipath_target);
bad_register_target:
	kmem_cache_destroy(_mpio_cache);

	return r;
}

It does the following:

  1. Creates a slab cache for the places where multipath later needs to allocate and free per-IO memory quickly;
  2. Registers the multipath Target Type with the DeviceMapper framework (this can be verified from user space, as shown below);
  3. Allocates two workqueues for asynchronously executing time-consuming operations.
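
Once the module is loaded, the registered target and the version from the structure above should be visible via dmsetup (output abridged):

$ dmsetup targets
multipath        v1.9.0
...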

Constructing an instance

static int multipath_ctr(struct dm_target *ti, unsigned int argc,
			 char **argv)
{
	/* target arguments */
	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of priority groups"},
		{0, 1024, "invalid initial priority group number"},
	};

	int r;
	struct multipath *m;
	struct dm_arg_set as;
	unsigned pg_count = 0;
	unsigned next_pg_num;

	as.argc = argc;
	as.argv = argv;

	m = alloc_multipath(ti);
	if (!m) {
		ti->error = "can't allocate multipath";
		return -EINVAL;
	}

	r = parse_features(&as, m);
	if (r)
		goto bad;

	r = parse_hw_handler(&as, m);
	if (r)
		goto bad;

	r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
	if (r)
		goto bad;

	r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
	if (r)
		goto bad;

	if ((!m->nr_priority_groups && next_pg_num) ||
	    (m->nr_priority_groups && !next_pg_num)) {
		ti->error = "invalid initial priority group";
		r = -EINVAL;
		goto bad;
	}

	/* parse the priority groups */
	while (as.argc) {
		struct priority_group *pg;

		pg = parse_priority_group(&as, m);
		if (IS_ERR(pg)) {
			r = PTR_ERR(pg);
			goto bad;
		}

		m->nr_valid_paths += pg->nr_pgpaths;
		list_add_tail(&pg->list, &m->priority_groups);
		pg_count++;
		pg->pg_num = pg_count;
		if (!--next_pg_num)
			m->next_pg = pg;
	}

	if (pg_count != m->nr_priority_groups) {
		ti->error = "priority group count mismatch";
		r = -EINVAL;
		goto bad;
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;

	return 0;

 bad:
	free_multipath(m);
	return r;
}

When a user-space program loads a DeviceMapper mapping table whose target type is multipath, this function is called with the table parameters to construct a Target Device. The steps are:

  1. Allocate memory for the instance and perform basic initialization;
  2. Parse the parameters, creating the priority groups and parsing each group's path-selection algorithm;
  3. Set a few basic DeviceMapper target properties.

The paths of one device can be divided into several priority groups (for example by path type); each group contains one path selector (path-selection algorithm) and at least one path. A two-group example is sketched after the listing below.

The parameters consist of:

  1. Feature parameters;
  2. Hardware-handler parameters;
  3. Priority-group and path parameters.

Parameter parsing example

Take the mapping table of the iscsi device above as an example:

iscsi: 0 209715200 multipath 0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1

Here “0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1” is what gets passed to the driver.

0 : the number of feature parameters is 0; if non-zero, it is followed by feature arguments such as pg_init_retries and pg_init_delay_msecs;

0 : the number of hardware-handler parameters is 0; if non-zero, it is followed by the hardware handler's arguments;

1 : the number of priority groups;

1 : the number of the next (initial) priority group, used while parsing the groups;

service-time : this priority group uses the service-time algorithm;

0 : the algorithm takes 0 arguments;

2 : this priority group contains two paths;

2 : each path has two arguments;

8:64 1 1 : the first path is device 8:64, switching after every 1 IO, with a relative throughput weight of 1;

8:80 1 1 : the second path is device 8:80, switching after every 1 IO, with a relative throughput weight of 1.

What the user-space tools finally show is:

iscsi (360000000000000000e00000000010001) dm-0 IET     ,VIRTUAL-DISK    
size=100G features='0' hwhandler='0' wp=rw
`-+- policy='service-time 0' prio=1 status=active
  |- 3:0:0:1 sde 8:64 active ready running
  `- 4:0:0:1 sdf 8:80 active ready running
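
For contrast, a hypothetical failover-style table with two priority groups of one path each (a sketch, not taken from the test system) would look like:

iscsi: 0 209715200 multipath 0 0 2 1 service-time 0 1 2 8:64 1 1 service-time 0 1 2 8:80 1 1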

IO Mapping

Basic mapping steps

static int multipath_map(struct dm_target *ti, struct request *clone,
			 union map_info *map_context)
{
	return __multipath_map(ti, clone, map_context, NULL, NULL);
}

static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
				   union map_info *map_context,
				   struct request **clone)
{
	return __multipath_map(ti, NULL, map_context, rq, clone);
}

static int __multipath_map(struct dm_target *ti, struct request *clone,
			   union map_info *map_context,
			   struct request *rq, struct request **__clone)
{
	struct multipath *m = (struct multipath *) ti->private;
	int r = DM_MAPIO_REQUEUE;
	size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
	struct pgpath *pgpath;
	struct block_device *bdev;
	struct dm_mpath_io *mpio;

	spin_lock_irq(&m->lock);

	/* Do we need to select a new pgpath? */
	if (!m->current_pgpath ||
	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
		__choose_pgpath(m, nr_bytes);

	pgpath = m->current_pgpath;

	if (!pgpath) {
		if (!__must_push_back(m))
			r = -EIO;	/* Failed */
		goto out_unlock;
	} else if (m->queue_io || m->pg_init_required) {
		__pg_init_all_paths(m);
		goto out_unlock;
	}

	if (set_mapinfo(m, map_context) < 0)
		/* ENOMEM, requeue */
		goto out_unlock;

	mpio = map_context->ptr;
	mpio->pgpath = pgpath;
	mpio->nr_bytes = nr_bytes;

	bdev = pgpath->path.dev->bdev;

	spin_unlock_irq(&m->lock);

	if (clone) {
		/* Old request-based interface: allocated clone is passed in */
		clone->q = bdev_get_queue(bdev);
		clone->rq_disk = bdev->bd_disk;
		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
	} else {
		/* blk-mq request-based interface */
		*__clone = blk_get_request(bdev_get_queue(bdev),
					   rq_data_dir(rq), GFP_ATOMIC);
		if (IS_ERR(*__clone)) {
			/* ENOMEM, requeue */
			clear_mapinfo(m, map_context);
			return r;
		}
		(*__clone)->bio = (*__clone)->biotail = NULL;
		(*__clone)->rq_disk = bdev->bd_disk;
		(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
	}

	if (pgpath->pg->ps.type->start_io)
		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
					      &pgpath->path,
					      nr_bytes);
	return DM_MAPIO_REMAPPED;

out_unlock:
	spin_unlock_irq(&m->lock);

	return r;
}
  1. Decide whether a path switch is needed, obtain the currently usable path, and store the path information in the request's private data;
  2. Redirect the request according to that path, setting its destination queue;
  3. Call the path-selection algorithm's start_io function;
  4. On success return DM_MAPIO_REMAPPED, telling the DeviceMapper core the mapping succeeded and the request should be re-dispatched.

IO Completion

After the mapping is done and an IO finishes, multipath_end_io is called to reclaim resources: it first calls do_end_io to complete the IO, then clears the context created at map time and calls the path selector's end_io function.

static int multipath_end_io(struct dm_target *ti, struct request *clone,
			    int error, union map_info *map_context)
{
	struct multipath *m = ti->private;
	struct dm_mpath_io *mpio = map_context->ptr;
	struct pgpath *pgpath;
	struct path_selector *ps;
	int r;

	BUG_ON(!mpio);

	r  = do_end_io(m, clone, error, mpio);
	pgpath = mpio->pgpath;
	if (pgpath) {
		ps = &pgpath->pg->ps;
		if (ps->type->end_io)
			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
	}
	clear_mapinfo(m, map_context);

	return r;
}

In do_end_io, if the IO failed and pgpath is set, fail_path marks the path failed and notifies user space through a uevent. If no valid paths remain and queue_if_no_path is disabled, -EIO is returned; otherwise DM_ENDIO_REQUEUE is returned and DeviceMapper requeues the request.

static int do_end_io(struct multipath *m, struct request *clone,
		     int error, struct dm_mpath_io *mpio)
{
	/*
	 * We don't queue any clone request inside the multipath target
	 * during end I/O handling, since those clone requests don't have
	 * bio clones.  If we queue them inside the multipath target,
	 * we need to make bio clones, that requires memory allocation.
	 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
	 *  don't have bio clones.)
	 * Instead of queueing the clone request here, we queue the original
	 * request into dm core, which will remake a clone request and
	 * clone bios for it and resubmit it later.
	 */
	int r = DM_ENDIO_REQUEUE;
	unsigned long flags;

	if (!error && !clone->errors)
		return 0;	/* I/O complete */

	if (noretry_error(error))
		return error;

	if (mpio->pgpath)
		fail_path(mpio->pgpath);

	spin_lock_irqsave(&m->lock, flags);
	if (!m->nr_valid_paths) {
		if (!m->queue_if_no_path) {
			if (!__must_push_back(m))
				r = -EIO;
		} else {
			if (error == -EBADE)
				r = error;
		}
	}
	spin_unlock_irqrestore(&m->lock, flags);

	return r;
}

static int fail_path(struct pgpath *pgpath)
{
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;

	spin_lock_irqsave(&m->lock, flags);

	if (!pgpath->is_active)
		goto out;

	DMWARN("Failing path %s.", pgpath->path.dev->name);

	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
	pgpath->is_active = 0;
	pgpath->fail_count++;

	m->nr_valid_paths--;

	if (pgpath == m->current_pgpath)
		m->current_pgpath = NULL;

	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
		      pgpath->path.dev->name, m->nr_valid_paths);

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);

	return 0;
}

Message Handling

static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct dm_dev *dev;
	struct multipath *m = (struct multipath *) ti->private;
	action_fn action;

	mutex_lock(&m->work_mutex);

	if (dm_suspended(ti)) {
		r = -EBUSY;
		goto out;
	}

	if (argc == 1) {
		if (!strcasecmp(argv[0], "queue_if_no_path")) {
			r = queue_if_no_path(m, 1, 0);
			goto out;
		} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
			r = queue_if_no_path(m, 0, 0);
			goto out;
		}
	}

	if (argc != 2) {
		DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
		goto out;
	}

	if (!strcasecmp(argv[0], "disable_group")) {
		r = bypass_pg_num(m, argv[1], 1);
		goto out;
	} else if (!strcasecmp(argv[0], "enable_group")) {
		r = bypass_pg_num(m, argv[1], 0);
		goto out;
	} else if (!strcasecmp(argv[0], "switch_group")) {
		r = switch_pg_num(m, argv[1]);
		goto out;
	} else if (!strcasecmp(argv[0], "reinstate_path"))
		action = reinstate_path;
	else if (!strcasecmp(argv[0], "fail_path"))
		action = fail_path;
	else {
		DMWARN("Unrecognised multipath message received: %s", argv[0]);
		goto out;
	}

	r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
	if (r) {
		DMWARN("message: error getting device %s",
		       argv[1]);
		goto out;
	}

	r = action_dev(m, dev, action);

	dm_put_device(ti, dev);

out:
	mutex_unlock(&m->work_mutex);
	return r;
}

Apart from reacting to IO errors under the configured algorithm, the multipath driver does not actively probe path state. Adding and removing paths, as well as state changes, are driven by the user-space multipathd and multipath tools, which go through the interface provided by the DeviceMapper framework to call the Target Type driver's message function. The handlers here implement switching priority groups, failing paths, reinstating paths, and so on.
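
These messages can also be sent by hand with dmsetup (the mandatory sector argument is 0); for example, against the map from the earlier analysis:

$ dmsetup message iscsi 0 fail_path 8:64
$ dmsetup message iscsi 0 reinstate_path 8:64
$ dmsetup message iscsi 0 switch_group 1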

Path-Selection Algorithm Analysis

The path-selector management library

This library is implemented in dm-path-selector.h and dm-path-selector.c. It manages the path-selection algorithms on a doubly linked list, providing registration, unregistration, lookup and release.

/* Register a path selector */
int dm_register_path_selector(struct path_selector_type *type);

/* Unregister a path selector */
int dm_unregister_path_selector(struct path_selector_type *type);

/* Returns a registered path selector type */
struct path_selector_type *dm_get_path_selector(const char *name);

/* Releases a path selector  */
void dm_put_path_selector(struct path_selector_type *pst);

A path-selection algorithm fills in and registers the following structure:

struct path_selector_type {
	char *name;
	struct module *module;

	unsigned int table_args;
	unsigned int info_args;

	/*
	 * Constructs a path selector object, takes custom arguments
	 */
	int (*create) (struct path_selector *ps, unsigned argc, char **argv);
	void (*destroy) (struct path_selector *ps);

	/*
	 * Add an opaque path object, along with some selector specific
	 * path args (eg, path priority).
	 */
	int (*add_path) (struct path_selector *ps, struct dm_path *path,
			 int argc, char **argv, char **error);

	/*
	 * Chooses a path for this io, if no paths are available then
	 * NULL will be returned.
	 *
	 * repeat_count is the number of times to use the path before
	 * calling the function again.  0 means don't call it again unless
	 * the path fails.
	 */
	struct dm_path *(*select_path) (struct path_selector *ps,
					unsigned *repeat_count,
					size_t nr_bytes);

	/*
	 * Notify the selector that a path has failed.
	 */
	void (*fail_path) (struct path_selector *ps, struct dm_path *p);

	/*
	 * Ask selector to reinstate a path.
	 */
	int (*reinstate_path) (struct path_selector *ps, struct dm_path *p);

	/*
	 * Table content based on parameters added in ps_add_path_fn
	 * or path selector status
	 */
	int (*status) (struct path_selector *ps, struct dm_path *path,
		       status_type_t type, char *result, unsigned int maxlen);

	int (*start_io) (struct path_selector *ps, struct dm_path *path,
			 size_t nr_bytes);
	int (*end_io) (struct path_selector *ps, struct dm_path *path,
		       size_t nr_bytes);
};
  • create - instantiate a selector;
  • destroy - destroy a selector;
  • add_path - add a path to this selector;
  • select_path - choose the path for an IO;
  • fail_path - tell the selector that a path has failed;
  • reinstate_path - tell the selector that a path is usable again;
  • status - return the selector's status;
  • start_io - callers must invoke this before issuing IO;
  • end_io - callers must invoke this after the IO completes.
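
Each selector is built as its own kernel module; on CentOS 7 their presence can be confirmed with modinfo (a sketch — description strings approximate):

$ modinfo -d dm_round_robin dm_service_time dm_queue_length
device-mapper round-robin multipath path selector
device-mapper service-time balancing path selector
device-mapper queue-length balancing path selector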

The round-robin path selector

Adding a path

Each path takes a single argument: the number of IOs to issue on the path before switching to the next.

static int rr_add_path(struct path_selector *ps, struct dm_path *path,
		       int argc, char **argv, char **error)
{
	struct selector *s = (struct selector *) ps->context;
	struct path_info *pi;
	unsigned repeat_count = RR_MIN_IO;
	char dummy;

	if (argc > 1) {
		*error = "round-robin ps: incorrect number of arguments";
		return -EINVAL;
	}

	/* First path argument is number of I/Os before switching path */
	if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
		*error = "round-robin ps: invalid repeat count";
		return -EINVAL;
	}

	/* allocate the path */
	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
	if (!pi) {
		*error = "round-robin ps: Error allocating path context";
		return -ENOMEM;
	}

	pi->path = path;
	pi->repeat_count = repeat_count;

	path->pscontext = pi;

	list_add_tail(&pi->list, &s->valid_paths);

	return 0;
}

Path selection policy

After performing “repeat_count” IOs on a path, it switches to the next one, cycling continuously.

static struct dm_path *rr_select_path(struct path_selector *ps, 
		unsigned *repeat_count, size_t nr_bytes)
{
	struct selector *s = (struct selector *) ps->context;
	struct path_info *pi = NULL;

	if (!list_empty(&s->valid_paths)) {
		pi = list_entry(s->valid_paths.next, struct path_info, list);
		list_move_tail(&pi->list, &s->valid_paths);
		*repeat_count = pi->repeat_count;
	}

	return pi ? pi->path : NULL;
}

The service-time path selector

Adding a path

Each path takes a repeat count (repeat_count) and a relative throughput weight (relative_throughput).

static int st_add_path(struct path_selector *ps, struct dm_path *path,
		       int argc, char **argv, char **error)
{
	struct selector *s = ps->context;
	struct path_info *pi;
	unsigned repeat_count = ST_MIN_IO;
	unsigned relative_throughput = 1;
	char dummy;

	/*
	 * Arguments: [<repeat_count> [<relative_throughput>]]
	 * 	<repeat_count>: The number of I/Os before switching path.
	 * 			If not given, default (ST_MIN_IO) is used.
	 * 	<relative_throughput>: The relative throughput value of
	 *			the path among all paths in the path-group.
	 * 			The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
	 *			If not given, minimum value '1' is used.
	 *			If '0' is given, the path isn't selected while
	 * 			other paths having a positive value are
	 * 			available.
	 */
	if (argc > 2) {
		*error = "service-time ps: incorrect number of arguments";
		return -EINVAL;
	}

	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
		*error = "service-time ps: invalid repeat count";
		return -EINVAL;
	}

	if ((argc == 2) &&
	    (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
		*error = "service-time ps: invalid relative_throughput value";
		return -EINVAL;
	}

	/* allocate the path */
	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
	if (!pi) {
		*error = "service-time ps: Error allocating path context";
		return -ENOMEM;
	}

	pi->path = path;
	pi->repeat_count = repeat_count;
	pi->relative_throughput = relative_throughput;
	atomic_set(&pi->in_flight_size, 0);

	path->pscontext = pi;

	list_add_tail(&pi->list, &s->valid_paths);

	return 0;
}

Path selection policy

When an IO starts and ends, the number of in-flight bytes on that path is increased and decreased respectively.

static int st_start_io(struct path_selector *ps, struct dm_path *path,
		       size_t nr_bytes)
{
	struct path_info *pi = path->pscontext;

	atomic_add(nr_bytes, &pi->in_flight_size);

	return 0;
}

static int st_end_io(struct path_selector *ps, struct dm_path *path,
		     size_t nr_bytes)
{
	struct path_info *pi = path->pscontext;

	atomic_sub(nr_bytes, &pi->in_flight_size);

	return 0;
}

Among all paths, it selects the one whose ratio of outstanding bytes to throughput weight is smallest. For example, with 64 KiB in flight on a weight-2 path and 48 KiB on a weight-1 path, the weight-2 path wins, because (64 + incoming) × 1 < (48 + incoming) × 2.

/*
 * Compare the estimated service time of 2 paths, pi1 and pi2,
 * for the incoming I/O.
 *
 * Returns:
 * < 0 : pi1 is better
 * 0   : no difference between pi1 and pi2
 * > 0 : pi2 is better
 *
 * Description:
 * Basically, the service time is estimated by:
 *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
 * To reduce the calculation, some optimizations are made.
 * (See comments inline)
 */
static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
			   size_t incoming)
{
	size_t sz1, sz2, st1, st2;

	sz1 = atomic_read(&pi1->in_flight_size);
	sz2 = atomic_read(&pi2->in_flight_size);

	/*
	 * Case 1: Both have same throughput value. Choose less loaded path.
	 */
	if (pi1->relative_throughput == pi2->relative_throughput)
		return sz1 - sz2;

	/*
	 * Case 2a: Both have same load. Choose higher throughput path.
	 * Case 2b: One path has no throughput value. Choose the other one.
	 */
	if (sz1 == sz2 ||
	    !pi1->relative_throughput || !pi2->relative_throughput)
		return pi2->relative_throughput - pi1->relative_throughput;

	/*
	 * Case 3: Calculate service time. Choose faster path.
	 *         Service time using pi1:
	 *             st1 = (sz1 + incoming) / pi1->relative_throughput
	 *         Service time using pi2:
	 *             st2 = (sz2 + incoming) / pi2->relative_throughput
	 *
	 *         To avoid the division, transform the expression to use
	 *         multiplication.
	 *         Because ->relative_throughput > 0 here, if st1 < st2,
	 *         the expressions below are the same meaning:
	 *             (sz1 + incoming) / pi1->relative_throughput <
	 *                 (sz2 + incoming) / pi2->relative_throughput
	 *             (sz1 + incoming) * pi2->relative_throughput <
	 *                 (sz2 + incoming) * pi1->relative_throughput
	 *         So use the later one.
	 */
	sz1 += incoming;
	sz2 += incoming;
	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
		/*
		 * Size may be too big for multiplying pi->relative_throughput
		 * and overflow.
		 * To avoid the overflow and mis-selection, shift down both.
		 */
		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
	}
	st1 = sz1 * pi2->relative_throughput;
	st2 = sz2 * pi1->relative_throughput;
	if (st1 != st2)
		return st1 - st2;

	/*
	 * Case 4: Service time is equal. Choose higher throughput path.
	 */
	return pi2->relative_throughput - pi1->relative_throughput;
}

static struct dm_path *st_select_path(struct path_selector *ps,
				      unsigned *repeat_count, size_t nr_bytes)
{
	struct selector *s = ps->context;
	struct path_info *pi = NULL, *best = NULL;

	if (list_empty(&s->valid_paths))
		return NULL;

	/* Change preferred (first in list) path to evenly balance. */
	list_move_tail(s->valid_paths.next, &s->valid_paths);

	list_for_each_entry(pi, &s->valid_paths, list)
		if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
			best = pi;

	if (!best)
		return NULL;

	*repeat_count = best->repeat_count;

	return best->path;
}

The queue-length path selector

Adding a path

Only repeat_count needs to be specified.

static int ql_add_path(struct path_selector *ps, struct dm_path *path,
		       int argc, char **argv, char **error)
{
	struct selector *s = ps->context;
	struct path_info *pi;
	unsigned repeat_count = QL_MIN_IO;
	char dummy;

	/*
	 * Arguments: [<repeat_count>]
	 * 	<repeat_count>: The number of I/Os before switching path.
	 * 			If not given, default (QL_MIN_IO) is used.
	 */
	if (argc > 1) {
		*error = "queue-length ps: incorrect number of arguments";
		return -EINVAL;
	}

	if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
		*error = "queue-length ps: invalid repeat count";
		return -EINVAL;
	}

	/* Allocate the path information structure */
	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
	if (!pi) {
		*error = "queue-length ps: Error allocating path information";
		return -ENOMEM;
	}

	pi->path = path;
	pi->repeat_count = repeat_count;
	atomic_set(&pi->qlen, 0);

	path->pscontext = pi;

	list_add_tail(&pi->list, &s->valid_paths);

	return 0;
}

Path selection policy

When an IO starts and ends, the in-flight IO count on that path is incremented and decremented respectively.

static int ql_start_io(struct path_selector *ps, struct dm_path *path,
		       size_t nr_bytes)
{
	struct path_info *pi = path->pscontext;

	atomic_inc(&pi->qlen);

	return 0;
}

static int ql_end_io(struct path_selector *ps, struct dm_path *path,
		     size_t nr_bytes)
{
	struct path_info *pi = path->pscontext;

	atomic_dec(&pi->qlen);

	return 0;
}

It selects the path with the fewest in-flight IOs.

static struct dm_path *ql_select_path(struct path_selector *ps,
				      unsigned *repeat_count, size_t nr_bytes)
{
	struct selector *s = ps->context;
	struct path_info *pi = NULL, *best = NULL;

	if (list_empty(&s->valid_paths))
		return NULL;

	/* Change preferred (first in list) path to evenly balance. */
	list_move_tail(s->valid_paths.next, &s->valid_paths);

	list_for_each_entry(pi, &s->valid_paths, list) {
		if (!best ||
		    (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
			best = pi;

		if (!atomic_read(&best->qlen))
			break;
	}

	if (!best)
		return NULL;

	*repeat_count = best->repeat_count;

	return best->path;
}

User-Space Analysis

In user space, multipath consists mainly of the shared libraries libmultipath and libmpathpersist, the checker plugins, the multipathd daemon, and the multipath tool. libmpathpersist and mpathpersist are the library and tool handling SCSI PR (persistent reservation) commands, and kpartx creates DeviceMapper virtual devices. Here we focus on multipathd and multipath, bringing in libmultipath where needed.

  • Path discovery - two mechanisms: at startup, block devices are found by scanning the sys filesystem and their attributes fetched via ioctl; at runtime, the uevent messages the kernel emits on device addition and removal are monitored. Only after blacklist filtering is a device added to the path list.
  • Path checking - a path is judged healthy mainly by reading sector 0 at the configured interval; in addition, the driver notifies user space on IO errors. The interval and checker are configurable, as the snippet below shows.
  • Path switching - load-balancing switching is handled by the driver according to the selector algorithm, while failover and failback are detected in user space, which then instructs the driver. The switching behavior is also configurable (it differs between storage arrays).
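
A minimal multipath.conf sketch of the checker-related knobs (standard options; values illustrative):

defaults {
  polling_interval 5
  max_polling_interval 20
  path_checker directio
}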

multipathd

After parsing its arguments, main calls the child function (multipathd/main.c), which creates four threads, each with its own task:

static int child (void * param)
{
	pthread_t check_thr, uevent_thr, uxlsnr_thr, uevq_thr;
	pthread_attr_t log_attr, misc_attr, uevent_attr;
	struct vectors * vecs;
	struct multipath * mpp;
	int i;
	int rc, pid_rc;

	mlockall(MCL_CURRENT | MCL_FUTURE);
	sem_init(&exit_sem, 0, 0);
	signal_init();

	udev = udev_new();

	setup_thread_attr(&misc_attr, 64 * 1024, 1);
	setup_thread_attr(&uevent_attr, 128 * 1024, 1);
	setup_thread_attr(&waiter_attr, 32 * 1024, 1);

	if (logsink) {
		setup_thread_attr(&log_attr, 64 * 1024, 0);
		log_thread_start(&log_attr);
		pthread_attr_destroy(&log_attr);
	}

	running_state = DAEMON_START;

	condlog(2, "--------start up--------");
	condlog(2, "read " DEFAULT_CONFIGFILE);

	if (load_config(DEFAULT_CONFIGFILE, udev))
		exit(1);

	if (init_checkers()) {
		condlog(0, "failed to initialize checkers");
		exit(1);
	}
	if (init_prio()) {
		condlog(0, "failed to initialize prioritizers");
		exit(1);
	}

	setlogmask(LOG_UPTO(conf->verbosity + 3));

	if (conf->max_fds) {
		struct rlimit fd_limit;

		if (getrlimit(RLIMIT_NOFILE, &fd_limit) < 0) {
			condlog(0, "can't get open fds limit: %s",
				strerror(errno));
			fd_limit.rlim_cur = 0;
			fd_limit.rlim_max = 0;
		}
		if (fd_limit.rlim_cur < conf->max_fds) {
			fd_limit.rlim_cur = conf->max_fds;
			if (fd_limit.rlim_max < conf->max_fds)
				fd_limit.rlim_max = conf->max_fds;
			if (setrlimit(RLIMIT_NOFILE, &fd_limit) < 0) {
				condlog(0, "can't set open fds limit to "
					"%lu/%lu : %s",
					fd_limit.rlim_cur, fd_limit.rlim_max,
					strerror(errno));
			} else {
				condlog(3, "set open fds limit to %lu/%lu",
					fd_limit.rlim_cur, fd_limit.rlim_max);
			}
		}

	}

	vecs = gvecs = init_vecs();
	if (!vecs)
		exit(1);

	setscheduler();
	set_oom_adj();

	conf->daemon = 1;
	udev_set_sync_support(0);
	/*
	 * Start uevent listener early to catch events
	 */
	if ((rc = pthread_create(&uevent_thr, &uevent_attr, ueventloop, udev))) {
		condlog(0, "failed to create uevent thread: %d", rc);
		exit(1);
	}
	pthread_attr_destroy(&uevent_attr);
	if ((rc = pthread_create(&uxlsnr_thr, &misc_attr, uxlsnrloop, vecs))) {
		condlog(0, "failed to create cli listener: %d", rc);
		exit(1);
	}
	/*
	 * fetch and configure both paths and multipaths
	 */
	running_state = DAEMON_CONFIGURE;

	lock(vecs->lock);
	if (configure(vecs, 1)) {
		unlock(vecs->lock);
		condlog(0, "failure during configuration");
		exit(1);
	}
	unlock(vecs->lock);

	/*
	 * start threads
	 */
	if ((rc = pthread_create(&check_thr, &misc_attr, checkerloop, vecs))) {
		condlog(0,"failed to create checker loop thread: %d", rc);
		exit(1);
	}
	if ((rc = pthread_create(&uevq_thr, &misc_attr, uevqloop, vecs))) {
		condlog(0, "failed to create uevent dispatcher: %d", rc);
		exit(1);
	}
	pthread_attr_destroy(&misc_attr);

	/* Startup complete, create logfile */
	pid_rc = pidfile_create(DEFAULT_PIDFILE, daemon_pid);
	update_timestamp(1);
	/* Ignore errors, we can live without */

	running_state = DAEMON_RUNNING;

	/*
	 * exit path
	 */
	while(sem_wait(&exit_sem) != 0); /* Do nothing */
	running_state = DAEMON_SHUTDOWN;
	lock(vecs->lock);
	if (conf->queue_without_daemon == QUE_NO_DAEMON_OFF)
		vector_foreach_slot(vecs->mpvec, mpp, i)
			dm_queue_if_no_path(mpp->alias, 0);
	remove_maps_and_stop_waiters(vecs);
	unlock(vecs->lock);

	pthread_cancel(check_thr);
	pthread_cancel(uevent_thr);
	pthread_cancel(uxlsnr_thr);
	pthread_cancel(uevq_thr);

	lock(vecs->lock);
	free_pathvec(vecs->pathvec, FREE_PATHS);
	vecs->pathvec = NULL;
	unlock(vecs->lock);
	/* Now all the waitevent threads will start rushing in. */
	/* freeing vecs isn't worth the races
	while (vecs->lock.depth > 0) {
		sleep (1);
		condlog(3, "Have %d wait event checkers threads to de-alloc,"
			" waiting...", vecs->lock.depth);
	}
	pthread_mutex_destroy(vecs->lock.mutex);
	FREE(vecs->lock.mutex);
	vecs->lock.depth = 0;
	vecs->lock.mutex = NULL;
	FREE(vecs);
	vecs = NULL;
	*/
	cleanup_checkers();
	cleanup_prio();

	dm_lib_release();
	dm_lib_exit();

	/* We're done here */
	if (!pid_rc) {
		condlog(3, "unlink pidfile");
		unlink(DEFAULT_PIDFILE);
		unlink(DEFAULT_TIMESTAMP_FILE);
	}

	condlog(2, "--------shut down-------");

	if (logsink)
		log_thread_stop();

	/*
	 * Freeing config must be done after condlog() and dm_lib_exit(),
	 * because logging functions like dlog() and dm_write_log()
	 * reference the config.
	 */
	free_config(conf);
	conf = NULL;
	udev_unref(udev);
	udev = NULL;
#ifdef _DEBUG_
	dbg_free_final(NULL);
#endif

	exit(0);
}

The uevent listener thread

static void *
ueventloop (void * ap)
{
	if (uevent_listen(udev))
		condlog(0, "error starting uevent listener");

	return NULL;
}

int uevent_listen(struct udev *udev)
{
	int err = 2;
	struct udev_monitor *monitor = NULL;
	int fd, socket_flags;
	int need_failback = 1;
	/*
	 * Queue uevents for service by dedicated thread so that the uevent
	 * listening thread does not block on multipathd locks (vecs->lock)
	 * thereby not getting to empty the socket's receive buffer queue
	 * often enough.
	 */
	if (!udev) {
		condlog(1, "no udev context");
		return 1;
	}
	udev_ref(udev);
	pthread_cleanup_push(uevq_stop, udev);

	monitor = udev_monitor_new_from_netlink(udev, "udev");
	if (!monitor) {
		condlog(2, "failed to create udev monitor");
		goto out;
	}
#ifdef LIBUDEV_API_RECVBUF
	if (udev_monitor_set_receive_buffer_size(monitor, 128 * 1024 * 1024))
		condlog(2, "failed to increase buffer size");
#endif
	fd = udev_monitor_get_fd(monitor);
	if (fd < 0) {
		condlog(2, "failed to get monitor fd");
		goto out;
	}
	socket_flags = fcntl(fd, F_GETFL);
	if (socket_flags < 0) {
		condlog(2, "failed to get monitor socket flags : %s",
			strerror(errno));
		goto out;
	}
	if (fcntl(fd, F_SETFL, socket_flags & ~O_NONBLOCK) < 0) {
		condlog(2, "failed to set monitor socket flags : %s",
			strerror(errno));
		goto out;
	}
	err = udev_monitor_filter_add_match_subsystem_devtype(monitor, "block",
							      NULL);
	if (err)
		condlog(2, "failed to create filter : %s", strerror(-err));
	err = udev_monitor_enable_receiving(monitor);
	if (err) {
		condlog(2, "failed to enable receiving : %s", strerror(-err));
		goto out;
	}
	while (1) {
		int i = 0;
		char *pos, *end;
		struct uevent *uev;
		struct udev_device *dev;
                struct udev_list_entry *list_entry;

		dev = udev_monitor_receive_device(monitor);
		if (!dev) {
			condlog(0, "failed getting udev device");
			continue;
		}

		uev = alloc_uevent();
		if (!uev) {
			udev_device_unref(dev);
			condlog(1, "lost uevent, oom");
			continue;
		}
		pos = uev->buffer;
		end = pos + HOTPLUG_BUFFER_SIZE + OBJECT_SIZE - 1;
		udev_list_entry_foreach(list_entry, udev_device_get_properties_list_entry(dev)) {
			const char *name, *value;
			int bytes;

			name = udev_list_entry_get_name(list_entry);
			if (!name)
				name = "(null)";
			value = udev_list_entry_get_value(list_entry);
			if (!value)
				value = "(null)";
			bytes = snprintf(pos, end - pos, "%s=%s", name,
					value);
			if (pos + bytes >= end) {
				condlog(2, "buffer overflow for uevent");
				break;
			}
			uev->envp[i] = pos;
			pos += bytes;
			*pos = '\0';
			pos++;
			if (strcmp(name, "DEVPATH") == 0)
				uev->devpath = uev->envp[i] + 8;
			if (strcmp(name, "ACTION") == 0)
				uev->action = uev->envp[i] + 7;
			i++;
			if (i == HOTPLUG_NUM_ENVP - 1)
				break;
		}
		uev->udev = dev;
		uev->envp[i] = NULL;

		condlog(3, "uevent '%s' from '%s'", uev->action, uev->devpath);
		uev->kernel = strrchr(uev->devpath, '/');
		if (uev->kernel)
			uev->kernel++;

		/* print payload environment */
		for (i = 0; uev->envp[i] != NULL; i++)
			condlog(5, "%s", uev->envp[i]);

		/*
 		 * Queue uevent and poke service pthread.
 		 */
		pthread_mutex_lock(uevq_lockp);
		list_add_tail(&uev->node, &uevq);
		pthread_cond_signal(uev_condp);
		pthread_mutex_unlock(uevq_lockp);
	}
	need_failback = 0;
out:
	if (monitor)
		udev_monitor_unref(monitor);
	if (need_failback)
		err = failback_listen();
	pthread_cleanup_pop(1);
	return err;
}

It listens for udev messages, extracts the device add/remove information, and finally queues the event for the service thread.

The user interaction thread

static void *uxlsnrloop (void * ap)
{
	if (cli_init())
		return NULL;

	set_handler_callback(LIST+PATHS, cli_list_paths);
	set_handler_callback(LIST+PATHS+FMT, cli_list_paths_fmt);
	set_handler_callback(LIST+MAPS, cli_list_maps);
	set_handler_callback(LIST+STATUS, cli_list_status);
	set_handler_callback(LIST+DAEMON, cli_list_daemon);
	set_handler_callback(LIST+MAPS+STATUS, cli_list_maps_status);
	set_handler_callback(LIST+MAPS+STATS, cli_list_maps_stats);
	set_handler_callback(LIST+MAPS+FMT, cli_list_maps_fmt);
	set_handler_callback(LIST+MAPS+TOPOLOGY, cli_list_maps_topology);
	set_handler_callback(LIST+TOPOLOGY, cli_list_maps_topology);
	set_handler_callback(LIST+MAP+TOPOLOGY, cli_list_map_topology);
	set_handler_callback(LIST+CONFIG, cli_list_config);
	set_handler_callback(LIST+BLACKLIST, cli_list_blacklist);
	set_handler_callback(LIST+DEVICES, cli_list_devices);
	set_handler_callback(LIST+WILDCARDS, cli_list_wildcards);
	set_handler_callback(ADD+PATH, cli_add_path);
	set_handler_callback(DEL+PATH, cli_del_path);
	set_handler_callback(ADD+MAP, cli_add_map);
	set_handler_callback(DEL+MAP, cli_del_map);
	set_handler_callback(SWITCH+MAP+GROUP, cli_switch_group);
	set_handler_callback(RECONFIGURE, cli_reconfigure);
	set_handler_callback(SUSPEND+MAP, cli_suspend);
	set_handler_callback(RESUME+MAP, cli_resume);
	set_handler_callback(RESIZE+MAP, cli_resize);
	set_handler_callback(RELOAD+MAP, cli_reload);
	set_handler_callback(RESET+MAP, cli_reassign);
	set_handler_callback(REINSTATE+PATH, cli_reinstate);
	set_handler_callback(FAIL+PATH, cli_fail);
	set_handler_callback(DISABLEQ+MAP, cli_disable_queueing);
	set_handler_callback(RESTOREQ+MAP, cli_restore_queueing);
	set_handler_callback(DISABLEQ+MAPS, cli_disable_all_queueing);
	set_handler_callback(RESTOREQ+MAPS, cli_restore_all_queueing);
	set_handler_callback(QUIT, cli_quit);
	set_handler_callback(SHUTDOWN, cli_shutdown);
	set_handler_callback(GETPRSTATUS+MAP, cli_getprstatus);
	set_handler_callback(SETPRSTATUS+MAP, cli_setprstatus);
	set_handler_callback(UNSETPRSTATUS+MAP, cli_unsetprstatus);
	set_handler_callback(FORCEQ+DAEMON, cli_force_no_daemon_q);
	set_handler_callback(RESTOREQ+DAEMON, cli_restore_no_daemon_q);

	umask(077);
	uxsock_listen(&uxsock_trigger, ap);

	return NULL;
}

When multipathd is started with the -k option, you get an interactive console for talking to the daemon directly.
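
For example (commands as registered by the handlers above; output abridged and approximate):

$ multipathd -k
multipathd> show maps
name  sysfs uuid
iscsi dm-0  360000000000000000e00000000010001
multipathd> quit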

The path checker thread

static void *checkerloop (void *ap)
{
	struct vectors *vecs;
	struct path *pp;
	int count = 0;
	unsigned int i;

	mlockall(MCL_CURRENT | MCL_FUTURE);
	vecs = (struct vectors *)ap;
	condlog(2, "path checkers start up");

	/*
	 * init the path check interval
	 */
	vector_foreach_slot (vecs->pathvec, pp, i) {
		pp->checkint = conf->checkint;
	}

	while (1) {
		pthread_cleanup_push(cleanup_lock, &vecs->lock);
		lock(vecs->lock);
		pthread_testcancel();
		condlog(4, "tick");

		if (vecs->pathvec) {
			vector_foreach_slot (vecs->pathvec, pp, i) {
				check_path(vecs, pp);
			}
		}
		if (vecs->mpvec) {
			defered_failback_tick(vecs->mpvec);
			retry_count_tick(vecs->mpvec);
			missing_uev_message_tick(vecs->mpvec);
		}
		if (count)
			count--;
		else {
			condlog(4, "map garbage collection");
			mpvec_garbage_collector(vecs);
			count = MAPGCINT;
		}

		lock_cleanup_pop(vecs->lock);
		sleep(1);
	}
	return NULL;
}

It walks the path list calling check_path on each path, which checks it with the configured checker, updates the path state according to the result, and notifies the kernel.

void check_path (struct vectors * vecs, struct path * pp)
{
	int newstate;
	int new_path_up = 0;
	int chkr_new_path_up = 0;
	int oldchkrstate = pp->chkrstate;

	if (!pp->mpp && (pp->missing_udev_info != INFO_MISSING ||
			 pp->retriggers >= conf->retrigger_tries))
		return;

	if (pp->tick && --pp->tick)
		return; /* don't check this path yet */

	if (!pp->mpp) {
		pp->missing_udev_info = INFO_REQUESTED;
		pp->retriggers++;
		sysfs_attr_set_value(pp->udev, "uevent", "change",
				     strlen("change"));
		return;
	}

	/*
	 * provision a next check soonest,
	 * in case we exit abnormaly from here
	 */
	pp->tick = conf->checkint;

	newstate = path_offline(pp);
	if (newstate == PATH_UP)
		newstate = get_state(pp, 1);
	else
		checker_clear_message(&pp->checker);

	if (newstate == PATH_WILD || newstate == PATH_UNCHECKED) {
		condlog(2, "%s: unusable path", pp->dev);
		pathinfo(pp, conf->hwtable, 0);
		return;
	}
	/*
	 * Async IO in flight. Keep the previous path state
	 * and reschedule as soon as possible
	 */
	if (newstate == PATH_PENDING) {
		pp->tick = 1;
		return;
	}
	/*
	 * Synchronize with kernel state
	 */
	if (update_multipath_strings(pp->mpp, vecs->pathvec)) {
		condlog(1, "%s: Could not synchronize with kernel state",
			pp->dev);
		pp->dmstate = PSTATE_UNDEF;
	}
	/* if update_multipath_strings orphaned the path, quit early */
	if (!pp->mpp)
		return;

	if ((newstate == PATH_UP || newstate == PATH_GHOST) &&
	     pp->wait_checks > 0) {
		if (pp->mpp && pp->mpp->nr_active > 0) {
			pp->state = PATH_DELAYED;
			pp->wait_checks--;
			return;
		} else
			pp->wait_checks = 0;
	}

	pp->chkrstate = newstate;
	if (newstate != pp->state) {
		int oldstate = pp->state;
		pp->state = newstate;
		LOG_MSG(1, checker_message(&pp->checker));

		/*
		 * upon state change, reset the checkint
		 * to the shortest delay
		 */
		pp->checkint = conf->checkint;

		if (newstate == PATH_DOWN || newstate == PATH_SHAKY) {
			/*
			 * proactively fail path in the DM
			 */
			if (oldstate == PATH_UP ||
			    oldstate == PATH_GHOST) {
				fail_path(pp, 1);
				if (pp->mpp->delay_wait_checks > 0 &&
				    pp->watch_checks > 0) {
					pp->wait_checks = pp->mpp->delay_wait_checks;
					pp->watch_checks = 0;
				}
			}else
				fail_path(pp, 0);

			/*
			 * cancel scheduled failback
			 */
			pp->mpp->failback_tick = 0;

			pp->mpp->stat_path_failures++;
			return;
		}

		if(newstate == PATH_UP || newstate == PATH_GHOST){
		 	/*
			 * Reinitialize the prioritizer, in case something
		 	 * changed.
		 	 */
			prio_init(&pp->prio);
			if ( pp->mpp && pp->mpp->prflag ){
				/*
				 * Check Persistent Reservation.
				 */
			condlog(2, "%s: checking persistent reservation "
				"registration", pp->dev);
			mpath_pr_event_handle(pp);
			}
		}

		/*
		 * reinstate this path
		 */
		if (oldstate != PATH_UP &&
		    oldstate != PATH_GHOST) {
			if (pp->mpp->delay_watch_checks > 0)
				pp->watch_checks = pp->mpp->delay_watch_checks;
			reinstate_path(pp, 1);
		} else {
			if (pp->watch_checks > 0)
				pp->watch_checks--;
			reinstate_path(pp, 0);
		}
		new_path_up = 1;

		if (oldchkrstate != PATH_UP && oldchkrstate != PATH_GHOST)
			chkr_new_path_up = 1;

		/*
		 * if at least one path is up in a group, and
		 * the group is disabled, re-enable it
		 */
		if (newstate == PATH_UP)
			enable_group(pp);
	}
	else if (newstate == PATH_UP || newstate == PATH_GHOST) {
		if (pp->dmstate == PSTATE_FAILED ||
		    pp->dmstate == PSTATE_UNDEF) {
			/* Clear IO errors */
			reinstate_path(pp, 0);
		} else {
			LOG_MSG(4, checker_message(&pp->checker));
			if (pp->checkint != conf->max_checkint) {
				/*
				 * double the next check delay.
				 * max at conf->max_checkint
				 */
				if (pp->checkint < (conf->max_checkint / 2))
					pp->checkint = 2 * pp->checkint;
				else
					pp->checkint = conf->max_checkint;
			}
			if (pp->watch_checks > 0)
				pp->watch_checks--;
			pp->tick = pp->checkint;
			condlog(4, "%s: delay next check %is",
				pp->dev_t, pp->tick);
		}
	}
	else if (newstate == PATH_DOWN) {
		if (conf->log_checker_err == LOG_CHKR_ERR_ONCE)
			LOG_MSG(3, checker_message(&pp->checker));
		else
			LOG_MSG(2, checker_message(&pp->checker));
	}

	pp->state = newstate;


	if (pp->mpp->wait_for_udev)
		return;
	/*
	 * path prio refreshing
	 */
	condlog(4, "path prio refresh");

	if (update_prio(pp, new_path_up) &&
	    (pp->mpp->pgpolicyfn == (pgpolicyfn *)group_by_prio) &&
	     pp->mpp->pgfailback == -FAILBACK_IMMEDIATE)
		update_path_groups(pp->mpp, vecs, !new_path_up);
	else if (need_switch_pathgroup(pp->mpp, 0)) {
		if (pp->mpp->pgfailback > 0 &&
		    (new_path_up || pp->mpp->failback_tick <= 0))
			pp->mpp->failback_tick =
				pp->mpp->pgfailback + 1;
		else if (pp->mpp->pgfailback == -FAILBACK_IMMEDIATE ||
			 (chkr_new_path_up && followover_should_failback(pp)))
			switch_pathgroup(pp->mpp);
	}
}

The service thread

static void *uevqloop (void * ap)
{
	if (uevent_dispatch(&uev_trigger, ap))
		condlog(0, "error starting uevent dispatcher");

	return NULL;
}

/*
 * Service the uevent queue.
 */
int uevent_dispatch(int (*uev_trigger)(struct uevent *, void * trigger_data),
		    void * trigger_data)
{
	my_uev_trigger = uev_trigger;
	my_trigger_data = trigger_data;

	mlockall(MCL_CURRENT | MCL_FUTURE);

	while (1) {
		LIST_HEAD(uevq_tmp);

		pthread_mutex_lock(uevq_lockp);
		servicing_uev = 0;
		/*
		 * Condition signals are unreliable,
		 * so make sure we only wait if we have to.
		 */
		if (list_empty(&uevq)) {
			pthread_cond_wait(uev_condp, uevq_lockp);
		}
		servicing_uev = 1;
		list_splice_init(&uevq, &uevq_tmp);
		pthread_mutex_unlock(uevq_lockp);
		if (!my_uev_trigger)
			break;
		service_uevq(&uevq_tmp);
	}
	condlog(3, "Terminating uev service queue");
	uevq_cleanup(&uevq);
	return 0;
}

/*
 * Called with uevq_lockp held
 */
void
service_uevq(struct list_head *tmpq)
{
	struct uevent *uev, *tmp;

	list_for_each_entry_safe(uev, tmp, tmpq, node) {
		list_del_init(&uev->node);

		if (my_uev_trigger && my_uev_trigger(uev, my_trigger_data))
			condlog(0, "uevent trigger error");

		if (uev->udev)
			udev_device_unref(uev->udev);
		FREE(uev);
	}
}

multipath

Listing paths (multipath -l/-ll)

static int get_dm_mpvec (vector curmp, vector pathvec, char * refwwid)
{
	int i;
	struct multipath * mpp;
	char params[PARAMS_SIZE], status[PARAMS_SIZE];

	if (dm_get_maps(curmp))
		return 1;

	vector_foreach_slot (curmp, mpp, i) {
		/*
		 * discard out of scope maps
		 */
		if (mpp->wwid && refwwid &&
		    strncmp(mpp->wwid, refwwid, WWID_SIZE)) {
			condlog(3, "skip map %s: out of scope", mpp->alias);
			free_multipath(mpp, KEEP_PATHS);
			vector_del_slot(curmp, i);
			i--;
			continue;
		}

		if (conf->cmd == CMD_VALID_PATH)
			continue;

		dm_get_map(mpp->alias, &mpp->size, params);
		condlog(3, "params = %s", params);
		dm_get_status(mpp->alias, status);
		condlog(3, "status = %s", status);

		disassemble_map(pathvec, params, mpp);

		/*
		 * disassemble_map() can add new paths to pathvec.
		 * If not in "fast list mode", we need to fetch information
		 * about them
		 */
		if (conf->cmd != CMD_LIST_SHORT)
			update_paths(mpp);

		if (conf->cmd == CMD_LIST_LONG)
			mpp->bestpg = select_path_group(mpp);

		disassemble_status(status, mpp);

		if (conf->cmd == CMD_LIST_SHORT ||
		    conf->cmd == CMD_LIST_LONG)
			print_multipath_topology(mpp, conf->verbosity);

		if (conf->cmd == CMD_CREATE)
			reinstate_paths(mpp);
	}
	return 0;
}

int dm_get_maps (vector mp)
{
	struct multipath * mpp;
	int r = 1;
	int info;
	struct dm_task *dmt;
	struct dm_names *names;
	unsigned next = 0;

	if (!mp)
		return 1;

	if (!(dmt = dm_task_create(DM_DEVICE_LIST)))
		return 1;

	dm_task_no_open_count(dmt);

	if (!dm_task_run(dmt))
		goto out;

	if (!(names = dm_task_get_names(dmt)))
		goto out;

	if (!names->dev) {
		r = 0; /* this is perfectly valid */
		goto out;
	}

	do {
		info = dm_type(names->name, TGT_MPATH);

		if (info <= 0)
			goto next;

		mpp = alloc_multipath();

		if (!mpp)
			goto out;

		mpp->alias = STRDUP(names->name);

		if (!mpp->alias)
			goto out1;

		if (info > 0) {
			if (dm_get_map(names->name, &mpp->size, NULL))
				goto out1;

			dm_get_uuid(names->name, mpp->wwid);
			dm_get_info(names->name, &mpp->dmi);
		}

		if (!vector_alloc_slot(mp))
			goto out1;

		vector_set_slot(mp, mpp);
		mpp = NULL;
next:
		next = names->next;
		names = (void *) names + next;
	} while (next);

	r = 0;
	goto out;
out1:
	free_multipath(mpp, KEEP_PATHS);
out:
	dm_task_destroy (dmt);
	return r;
}

extern void path_group_prio_update (struct pathgroup * pgp)
{
	int i;
	int priority = 0;
	struct path * pp;

	pgp->enabled_paths = 0;
	if (!pgp->paths) {
		pgp->priority = 0;
		return;
	}
	vector_foreach_slot (pgp->paths, pp, i) {
		if (pp->state == PATH_UP ||
		    pp->state == PATH_GHOST) {
			priority += pp->priority;
			pgp->enabled_paths++;
		}
	}
	if (pgp->enabled_paths)
		pgp->priority = priority / pgp->enabled_paths;
	else
		pgp->priority = 0;
}

extern int select_path_group (struct multipath * mpp)
{
	int i;
	int max_priority = 0;
	int bestpg = 1;
	int max_enabled_paths = 1;
	struct pathgroup * pgp;

	if (!mpp->pg)
		return 1;

	vector_foreach_slot (mpp->pg, pgp, i) {
		if (!pgp->paths)
			continue;

		path_group_prio_update(pgp);
		if (pgp->enabled_paths) {
			if (pgp->priority > max_priority) {
				max_priority = pgp->priority;
				max_enabled_paths = pgp->enabled_paths;
				bestpg = i + 1;
			} else if (pgp->priority == max_priority) {
				if (pgp->enabled_paths > max_enabled_paths) {
					max_enabled_paths = pgp->enabled_paths;
					bestpg = i + 1;
				}
			}
		}
	}
	return bestpg;
}

Flushing all maps (multipath -F)

extern int dm_flush_maps (void)
{
	int r = 0;
	struct dm_task *dmt;
	struct dm_names *names;
	unsigned next = 0;

	if (!(dmt = dm_task_create (DM_DEVICE_LIST)))
		return 0;

	dm_task_no_open_count(dmt);

	if (!dm_task_run (dmt))
		goto out;

	if (!(names = dm_task_get_names (dmt)))
		goto out;

	if (!names->dev)
		goto out;

	do {
		r |= dm_suspend_and_flush_map(names->name);
		next = names->next;
		names = (void *) names + next;
	} while (next);

	out:
	dm_task_destroy (dmt);
	return r;
}

extern int dm_suspend_and_flush_map (const char * mapname)
{
	int s = 0, queue_if_no_path = 0;
	unsigned long long mapsize;
	char params[PARAMS_SIZE] = {0};

	if (!dm_map_present(mapname))
		return 0;

	if (dm_type(mapname, TGT_MPATH) <= 0)
		return 0; /* nothing to do */

	if (!dm_get_map(mapname, &mapsize, params)) {
		if (strstr(params, "queue_if_no_path"))
			queue_if_no_path = 1;
	}

	if (queue_if_no_path)
		s = dm_queue_if_no_path((char *)mapname, 0);
	/* Leave queue_if_no_path alone if unset failed */
	if (s)
		queue_if_no_path = 0;
	else
		s = dm_simplecmd_flush(DM_DEVICE_SUSPEND, mapname, 0, 0);

	if (!dm_flush_map(mapname)) {
		condlog(4, "multipath map %s removed", mapname);
		return 0;
	}
	condlog(2, "failed to remove multipath map %s", mapname);
	dm_simplecmd_noflush(DM_DEVICE_RESUME, mapname, 0);
	if (queue_if_no_path)
		s = dm_queue_if_no_path((char *)mapname, 1);
	return 1;
}

It reads all the map names from the device list and uses libdevmapper functions to suspend, flush and remove the virtual multipath devices.
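
On the test system, where iscsi is the only mapped device, a flush run looks like this (a sketch):

$ multipath -F
$ dmsetup table
No devices found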