linux kobject kobject_type kset

原创

JDSH0224 2022-09-19 13:45:55 博主文章分类：linux driver ©著作权

文章标签 内核对象 linux 链表 文章分类 运维

©著作权归作者所有：来自51CTO博客作者JDSH0224的原创作品，请联系作者获取转载授权，否则将追究法律责任

前言：

linux设备模型包括kobject、kobject_type、kset。

为了更好地理解，举个栗子：kset就像一张蜘蛛网，网上有许多连线的交点，这些交点就是内核对象kobject，而kset的作用就是把这些内核对象kobject归类、组织在一起！

1. kobject内核对象

功能：

路径：linux-3.10.x\include\linux\kobject.h

/*
 * struct kobject - basic object of the device model, as declared in
 * linux-3.10.x include/linux/kobject.h. The per-field notes are translated
 * from the article's annotations.
 */
struct kobject {
  const char    *name; // name of the kobject; shows up as a directory in sysfs, e.g. /sys/devices/
  struct list_head  entry; // list node embedded in the kobject; chained onto the `list` member of the owning kset
  struct kobject    *parent; // parent kobject; per the article it defaults to the owning kset's embedded kobj when left unset; it is the directory one level up in /sys
  struct kset    *kset; // the set this kobject belongs to; may be NULL
  struct kobj_type  *ktype; // type of this kobject; supplies the interface for operating on its attributes
  struct sysfs_dirent  *sd; // sysfs directory entry for this kobject (NOTE(review): the original remark referred to kernfs_node, which is not the type declared here)
  struct kref    kref; // reference count; when it drops to 0 the registered release method is called to free the object
  unsigned int state_initialized:1; // set when the object has been initialized
  unsigned int state_in_sysfs:1; // 1 once the object's directory has been created in sysfs, 0 otherwise
  unsigned int state_add_uevent_sent:1; // records that the "add" uevent was sent to user space
  unsigned int state_remove_uevent_sent:1;// records that the "remove" uevent was sent to user space
  unsigned int uevent_suppress:1; // when set, kobject_uevent_env() drops the event instead of sending it
};

2. kobject_type内核对象类型

功能：

路径：linux-3.10.x\include\linux\kobject.h

/*
 * struct kobj_type - per-type operations shared by kobjects of one kind, as
 * declared in linux-3.10.x include/linux/kobject.h.
 */
struct kobj_type {
  void (*release)(struct kobject *kobj); // frees the kobject; comparable to a destructor in object-oriented code
  const struct sysfs_ops *sysfs_ops; // set of methods used to operate on the kobject through sysfs
  struct attribute **default_attrs; // default attributes (files) of the kobject
  const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); // not covered by the article
  const void *(*namespace)(struct kobject *kobj); // not covered by the article
};

3. kset内核对象集合

功能：就是内核对象kobject的集合，如下图devices内核集合，在devices目录下是一些对象，包括platform平台对象

路径：linux-3.10.x\include\linux\kobject.h

/*
 * struct kset - a collection of kobjects, as declared in linux-3.10.x
 * include/linux/kobject.h.
 */
struct kset {
  struct list_head list; // head of the doubly linked circular list of member kobjects; each member is chained in through its own `entry` node (kset -> first kobject -> second kobject -> ... -> back to the kset)
  spinlock_t list_lock; // initialized by kset_init(); presumably guards `list` - confirm against the kobject core
  struct kobject kobj; // embedded kobject: the common parent of every kobject in the set, which is what gives the device hierarchy its shape; the kset's reference count is the reference count of this kobject
  const struct kset_uevent_ops *uevent_ops; // callbacks used when notifying user space of events
};

4. 如何创建内核集合kset，内核对象kobject与内核集合kset的关系

功能：内核在启动时初始化设备集合

路径：linux-3.10.x\drivers\base\core.c

/*
 * devices_init - create the sysfs skeleton of the driver core at boot.
 *
 * Creates the "devices" kset (/sys/devices) and the "dev" kobject (/sys/dev)
 * together with its "block" and "char" children. Returns 0 when everything
 * was created; otherwise releases whatever already exists, in reverse order
 * of creation, and returns -ENOMEM.
 */
int __init devices_init(void)
{
  /* /sys/devices */
  devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
  if (devices_kset == NULL)
    return -ENOMEM;

  /* /sys/dev */
  dev_kobj = kobject_create_and_add("dev", NULL);
  if (dev_kobj != NULL) {
    /* /sys/dev/block, child of dev_kobj */
    sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
    if (sysfs_dev_block_kobj != NULL) {
      /* /sys/dev/char, child of dev_kobj */
      sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
      if (sysfs_dev_char_kobj != NULL)
        return 0;

      kobject_put(sysfs_dev_block_kobj);
    }
    kobject_put(dev_kobj);
  }

  kset_unregister(devices_kset);
  return -ENOMEM;
}

上面devices_kset对象集合与所有的驱动都有关联，我们搜索device_initialize(...)函数，发现所有的驱动设备device的成员kset集合对象都指向了devices_kset内核集合，即父对象，该函数源码如下：

/*
 * device_initialize - prepare a struct device for use.
 *
 * Points the embedded kobject at devices_kset, initializes it with
 * device_ktype, and sets up the device's lists, locks, power-management
 * state and NUMA node.
 */
void device_initialize(struct device *dev)
{
  // make the device a member of the devices_kset kset, i.e. place it under /sys/devices/
  dev->kobj.kset = devices_kset; 
  kobject_init(&dev->kobj, &device_ktype);
  INIT_LIST_HEAD(&dev->dma_pools);
  mutex_init(&dev->mutex);
  lockdep_set_novalidate_class(&dev->mutex);
  spin_lock_init(&dev->devres_lock);
  INIT_LIST_HEAD(&dev->devres_head);
  device_pm_init(dev);
  set_dev_node(dev, -1);
}

接下来创建了dev_kobj内核对象，sysfs_dev_block_kobj、sysfs_dev_char_kobj内核对象的父对象都为dev_kobj，同样我们可以从内核中看到字符设备和块设备驱动在注册的时候分别使用sysfs_dev_char_kobj、sysfs_dev_block_kobj内核对象。

4.1 内核集合kset创建，函数kset_create_and_add(...)

功能：创建kset内核对象集合devices ，即为/sys/devices

devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);

/*
 * kset_create_and_add - allocate a kset and register it in one step.
 * @name: name of the kset
 * @uevent_ops: uevent callbacks for the kset
 * @parent_kobj: parent kobject, may be NULL
 *
 * Returns the registered kset, or NULL when either the allocation
 * (kset_create, section 4.1.1 of the article) or the registration
 * (kset_register, section 4.1.2) fails.
 */
struct kset *kset_create_and_add(const char *name,
         const struct kset_uevent_ops *uevent_ops,
         struct kobject *parent_kobj)
{
  struct kset *new_set = kset_create(name, uevent_ops, parent_kobj);

  if (new_set == NULL)
    return NULL;

  if (kset_register(new_set) != 0) {
    /* registration failed: drop the freshly allocated kset */
    kfree(new_set);
    return NULL;
  }

  return new_set;
}

4.1.1 内核集合创建

/*
 * kset_create - allocate a kset and fill in its embedded kobject.
 * @name: name of the kset
 * @uevent_ops: uevent callbacks used when notifying user space
 * @parent_kobj: parent kobject, may be NULL
 *
 * Returns the new, not yet registered kset, or NULL when the allocation or
 * setting the name fails.
 */
static struct kset *kset_create(const char *name,
        const struct kset_uevent_ops *uevent_ops,
        struct kobject *parent_kobj)
{
  struct kset *kset;
  int retval;

  kset = kzalloc(sizeof(*kset), GFP_KERNEL);
  if (!kset)
    return NULL;
  /*
   * kobject_set_name() takes a printf-style format. Pass the name as an
   * argument so that a '%' inside it is never interpreted as a conversion.
   */
  retval = kobject_set_name(&kset->kobj, "%s", name);
  if (retval) {
    kfree(kset);
    return NULL;
  }
  kset->uevent_ops = uevent_ops; /* callbacks for user-space notification */
  kset->kobj.parent = parent_kobj; /* parent of the embedded kobject, may be NULL */

  /*
   * The kobject of this kset will have a type of kset_ktype and belong to
   * no kset itself.  That way we can properly free it when it is
   * finished being used.
   */
  kset->kobj.ktype = &kset_ktype;
  kset->kobj.kset = NULL;

  return kset;
}

/*
 * dev_uevent_filter - decide whether a uevent for @kobj may be sent.
 *
 * Returns 1 only for kobjects of type device_ktype whose device has a bus
 * or a class; returns 0 for everything else.
 */
static int dev_uevent_filter(struct kset *kset, struct kobject *kobj)
{
  struct device *dev;

  if (get_ktype(kobj) != &device_ktype)
    return 0;

  dev = kobj_to_dev(kobj);
  return (dev->bus || dev->class) ? 1 : 0;
}

/*
 * dev_uevent_name - subsystem name reported in a device uevent.
 *
 * Returns the name of the device's bus when it has one, otherwise the name
 * of its class, otherwise NULL.
 */
static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj)
{
  struct device *dev = kobj_to_dev(kobj);
  const char *subsys = NULL;

  if (dev->bus)
    subsys = dev->bus->name;
  else if (dev->class)
    subsys = dev->class->name;

  return subsys;
}

/*
 * dev_uevent - add the device-specific keys to a uevent environment.
 *
 * Adds MAJOR/MINOR and, when a device node name is available, DEVNAME,
 * DEVMODE, DEVUID and DEVGID; then DEVTYPE and DRIVER when present; then
 * lets the device tree helper, the bus, the class and the device type add
 * their own keys. Returns the result of the last bus/class/type callback
 * that ran, or 0 when none ran.
 *
 * (The article marks this function as one that needs further analysis.)
 */
static int dev_uevent(struct kset *kset, struct kobject *kobj,
          struct kobj_uevent_env *env)
{
  struct device *dev = kobj_to_dev(kobj);
  int retval = 0;


  /* add device node properties if present */
  if (MAJOR(dev->devt)) {
    const char *tmp;
    const char *name;
    umode_t mode = 0;
    kuid_t uid = GLOBAL_ROOT_UID;
    kgid_t gid = GLOBAL_ROOT_GID;


    add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt)); // major device number
    add_uevent_var(env, "MINOR=%u", MINOR(dev->devt)); // minor device number
    name = device_get_devnode(dev, &mode, &uid, &gid, &tmp);
    if (name) {
      add_uevent_var(env, "DEVNAME=%s", name);
      /* mode and owner are only reported when they differ from the defaults set above */
      if (mode)
        add_uevent_var(env, "DEVMODE=%#o", mode & 0777);
      if (!uid_eq(uid, GLOBAL_ROOT_UID))
        add_uevent_var(env, "DEVUID=%u", from_kuid(&init_user_ns, uid));
      if (!gid_eq(gid, GLOBAL_ROOT_GID))
        add_uevent_var(env, "DEVGID=%u", from_kgid(&init_user_ns, gid));
      kfree(tmp);
    }
  }


  if (dev->type && dev->type->name)
    add_uevent_var(env, "DEVTYPE=%s", dev->type->name);


  if (dev->driver)
    add_uevent_var(env, "DRIVER=%s", dev->driver->name);


  /* Add common DT information about the device */
  of_device_uevent(dev, env);


  /* have the bus specific function add its stuff */
  if (dev->bus && dev->bus->uevent) {
    retval = dev->bus->uevent(dev, env);
    if (retval)
      pr_debug("device: '%s': %s: bus uevent() returned %d\n",
         dev_name(dev), __func__, retval);
  }


  /* have the class specific function add its stuff */
  if (dev->class && dev->class->dev_uevent) {
    retval = dev->class->dev_uevent(dev, env);
    if (retval)
      pr_debug("device: '%s': %s: class uevent() "
         "returned %d\n", dev_name(dev),
         __func__, retval);
  }


  /* have the device type specific function add its stuff */
  if (dev->type && dev->type->uevent) {
    retval = dev->type->uevent(dev, env);
    if (retval)
      pr_debug("device: '%s': %s: dev_type uevent() "
         "returned %d\n", dev_name(dev),
         __func__, retval);
  }


  return retval;
}

/*
 * uevent callbacks of the "devices" kset (passed to kset_create_and_add()
 * in devices_init()): event filter, subsystem name lookup, and the hook
 * that adds device-specific environment keys.
 */
static const struct kset_uevent_ops device_uevent_ops = {
  .filter =  dev_uevent_filter,
  .name =    dev_uevent_name,
  .uevent =  dev_uevent,
};
/* kobj_type assigned by kset_create() to the kobject embedded in every kset. */
static struct kobj_type kset_ktype = {
  .sysfs_ops  = &kobj_sysfs_ops,
  .release = kset_release,
};

4.1.2 内核集合注册

int kset_register(struct kset *k)
{
  int err;

  if (!k)
    return -EINVAL;

  kset_init(k); //内核集合kset初始化
  err = kobject_add_internal(&k->kobj);
  if (err)
    return err;
  kobject_uevent(&k->kobj, KOBJ_ADD); //内核事件，主要的目的是通过kmod\netlink方式的方式由内核通知用户层底层驱动有变动，相当于热插拔功能
  return 0;
}

/*
 * kset_init - initialize the embedded kobject, the member list and the
 * list lock of a kset.
 */
void kset_init(struct kset *k)
{
  kobject_init_internal(&k->kobj);
  INIT_LIST_HEAD(&k->list); // note this list: it is the list of kobjects that belong to the kset
  spin_lock_init(&k->list_lock);
}

/*
 * kobject_init_internal - put a kobject into its initial state.
 *
 * Initializes the reference count and the list node, clears the sysfs and
 * uevent state bits and marks the object as initialized. A NULL @kobj is
 * ignored.
 */
static void kobject_init_internal(struct kobject *kobj)
{
  if (kobj) {
    kref_init(&kobj->kref);
    INIT_LIST_HEAD(&kobj->entry);        /* node used to chain into a kset's list */
    kobj->state_in_sysfs = 0;            /* not yet present in sysfs */
    kobj->state_add_uevent_sent = 0;     /* no "add" uevent sent yet */
    kobj->state_remove_uevent_sent = 0;  /* no "remove" uevent sent yet */
    kobj->state_initialized = 1;         /* initialization done */
  }
}

4.1.2 内核对象事件--环境变量添加

功能：内核集合kset或内核对象kobject添加完成时需要通知用户层

kobject_uevent(&k->kobj, KOBJ_ADD); //内核事件，主要的目的是通知（kmod\netlink）用户层底层驱动有变动，相当于热插拔功能

/*
 * kobject_uevent - send a uevent for @kobj without extra environment keys.
 *
 * Thin wrapper: forwards to kobject_uevent_env() with a NULL envp_ext and
 * returns its result.
 */
int kobject_uevent(struct kobject *kobj, enum kobject_action action)
{
  return kobject_uevent_env(kobj, action, NULL);
}

Uevent的机制是比较简单的，设备模型中任何设备有事件需要上报时，会触发Uevent提供的接口。Uevent模块准备好上报事件的格式后，可以通过两个途径把事件上报到用户空间：一种是通过kmod模块，直接调用用户空间的可执行文件；另一种是通过netlink通信机制，将事件从内核空间传递给用户空间。

/*
 * kobject_uevent_env - build a uevent for @kobj and deliver it to user space.
 * @kobj: object the event is about
 * @action: what happened (e.g. KOBJ_ADD, KOBJ_REMOVE)
 * @envp_ext: optional NULL-terminated array of extra "KEY=value" strings
 *
 * Finds the kset the object belongs to (walking up the parents), lets the
 * kset's uevent_ops filter the event and name the subsystem, fills an
 * environment buffer (ACTION, DEVPATH, SUBSYSTEM, caller keys, kset keys,
 * SEQNUM), broadcasts it over netlink when CONFIG_NET is set, and finally
 * runs the uevent_helper program when one is configured.
 *
 * Returns 0 when the event was sent or deliberately dropped, -EINVAL when no
 * kset could be found, -ENOMEM / -ENOENT on allocation or path failures, or
 * the error of the step that failed.
 */
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
           char *envp_ext[])
{
  struct kobj_uevent_env *env;
  const char *action_string = kobject_actions[action];
  const char *devpath = NULL;
  const char *subsystem;
  struct kobject *top_kobj;
  struct kset *kset;
  const struct kset_uevent_ops *uevent_ops;
  int i = 0;
  int retval = 0;
#ifdef CONFIG_NET
  struct uevent_sock *ue_sk;
#endif

  pr_debug("kobject: '%s' (%p): %s\n",
     kobject_name(kobj), kobj, __func__);

  /* search the kset we belong to */
  top_kobj = kobj;
  while (!top_kobj->kset && top_kobj->parent)
    top_kobj = top_kobj->parent;

  if (!top_kobj->kset) {
    pr_debug("kobject: '%s' (%p): %s: attempted to send uevent "
       "without kset!\n", kobject_name(kobj), kobj,
       __func__);
    return -EINVAL;
  }

  kset = top_kobj->kset;
  uevent_ops = kset->uevent_ops;

  /* skip the event, if uevent_suppress is set*/
  if (kobj->uevent_suppress) {
    pr_debug("kobject: '%s' (%p): %s: uevent_suppress "
         "caused the event to drop!\n",
         kobject_name(kobj), kobj, __func__);
    return 0;
  }
  /* skip the event, if the filter returns zero. */
  if (uevent_ops && uevent_ops->filter)
    if (!uevent_ops->filter(kset, kobj)) {
      pr_debug("kobject: '%s' (%p): %s: filter function "
         "caused the event to drop!\n",
         kobject_name(kobj), kobj, __func__);
      return 0;
    }

  /* originating subsystem */
  if (uevent_ops && uevent_ops->name)
    subsystem = uevent_ops->name(kset, kobj);
  else
    subsystem = kobject_name(&kset->kobj);
  if (!subsystem) {
    pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the "
       "event to drop!\n", kobject_name(kobj), kobj,
       __func__);
    return 0;
  }

  /* environment buffer */
  env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
  if (!env)
    return -ENOMEM;

  /* complete object path */
  devpath = kobject_get_path(kobj, GFP_KERNEL); // path of the object, e.g. /dev/char; the "/sys" prefix is prepended later in user space (see mdev_main)
  if (!devpath) {
    retval = -ENOENT;
    goto exit;
  }

  /* default keys */
  retval = add_uevent_var(env, "ACTION=%s", action_string); // add the action to env
  if (retval)
    goto exit;
  retval = add_uevent_var(env, "DEVPATH=%s", devpath); // add the object path to env
  if (retval)
    goto exit;
  retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem);
  if (retval)
    goto exit;

  /* keys passed in from the caller */
  if (envp_ext) {
    for (i = 0; envp_ext[i]; i++) {
      retval = add_uevent_var(env, "%s", envp_ext[i]);
      if (retval)
        goto exit;
    }
  }

  /* let the kset specific function add its stuff */
  if (uevent_ops && uevent_ops->uevent) {
    retval = uevent_ops->uevent(kset, kobj, env); // for the "devices" kset this is dev_uevent(...), which adds the device-specific keys
    if (retval) {
      pr_debug("kobject: '%s' (%p): %s: uevent() returned "
         "%d\n", kobject_name(kobj), kobj,
         __func__, retval);
      goto exit;
    }
  }

  /*
   * Mark "add" and "remove" events in the object to ensure proper
   * events to userspace during automatic cleanup. If the object did
   * send an "add" event, "remove" will automatically generated by
   * the core, if not already done by the caller.
   */
  if (action == KOBJ_ADD)
    kobj->state_add_uevent_sent = 1;
  else if (action == KOBJ_REMOVE)
    kobj->state_remove_uevent_sent = 1;

  mutex_lock(&uevent_sock_mutex);
  /* we will send an event, so request a new sequence number */
  retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum); // the counter is incremented while uevent_sock_mutex is held
  if (retval) {
    mutex_unlock(&uevent_sock_mutex);
    goto exit;
  }

#if defined(CONFIG_NET) // notify user space through netlink
  /* send netlink message */
  list_for_each_entry(ue_sk, &uevent_sock_list, list) {
    struct sock *uevent_sock = ue_sk->sk;
    struct sk_buff *skb;
    size_t len;

    if (!netlink_has_listeners(uevent_sock, 1))
      continue;

    /* allocate message with the maximum possible size */
    len = strlen(action_string) + strlen(devpath) + 2;
    skb = alloc_skb(len + env->buflen, GFP_KERNEL);
    if (skb) {
      char *scratch;

      /* add header */
      scratch = skb_put(skb, len);
      sprintf(scratch, "%s@%s", action_string, devpath);

      /* copy keys to our continuous event payload buffer */
      for (i = 0; i < env->envp_idx; i++) {
        len = strlen(env->envp[i]) + 1;
        scratch = skb_put(skb, len);
        strcpy(scratch, env->envp[i]);
      }

      NETLINK_CB(skb).dst_group = 1;
      retval = netlink_broadcast_filtered(uevent_sock, skb,
                  0, 1, GFP_KERNEL,
                  kobj_bcast_filter,
                  kobj);
      /* ENOBUFS should be handled in userspace */
      if (retval == -ENOBUFS || retval == -ESRCH)
        retval = 0;
    } else
      retval = -ENOMEM;
  }
#endif
  mutex_unlock(&uevent_sock_mutex);

  /* call uevent_helper, usually only enabled during early boot */
  if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {
    char *argv [3];

    argv [0] = uevent_helper;
    argv [1] = (char *)subsystem;
    argv [2] = NULL;
    retval = add_uevent_var(env, "HOME=/");
    if (retval)
      goto exit;
    retval = add_uevent_var(env,
          "PATH=/sbin:/bin:/usr/sbin:/usr/bin");
    if (retval)
      goto exit;

    retval = call_usermodehelper(argv[0], argv,
               env->envp, UMH_WAIT_EXEC); // see section 4.1.3 of the article
  }

exit:
  /* single cleanup point: both pointers are freed on every path that reaches here */
  kfree(devpath);
  kfree(env);
  return retval;
}

从kobject_uevent_env(...)函数分析，为了在内核空间里调用用户空间里的程序"/sbin/hotplug"，需要为该应用程序增加如下的环境变量：

add_uevent_var(env, "ACTION=%s", action_string); //增加操作到env中, 如：ACTION="add"
add_uevent_var(env, "DEVPATH=%s", devpath); //增加路径到env中， 如：DEVPATH="/dev/char"（不含/sys前缀，/sys前缀由用户层的mdev补上，见4.5节）
add_uevent_var(env, "SUBSYSTEM=%s", subsystem); //增加子系统名称，对设备而言即dev_uevent_name(...)返回的总线名(bus->name)或类名(class->name)
add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt));
add_uevent_var(env, "MINOR=%u", MINOR(dev->devt));
add_uevent_var(env, "DEVNAME=%s", name);
add_uevent_var(env, "DEVMODE=%#o", mode & 0777);
add_uevent_var(env, "DEVUID=%u", from_kuid(&init_user_ns, uid));
add_uevent_var(env, "DEVGID=%u", from_kgid(&init_user_ns, gid));
add_uevent_var(env, "DEVTYPE=%s", dev->type->name);
add_uevent_var(env, "DRIVER=%s", dev->driver->name);
add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
add_uevent_var(env, "HOME=/");
add_uevent_var(env, "PATH=/sbin:/bin:/usr/sbin:/usr/bin");

4.1.3 内核对象事件--设置、调用用户层用程序(kmod/netlink)

/*
 * Excerpt of kobject_uevent_env(): only the part that launches the
 * user-space helper is shown; the elided parts are marked with "......".
 */
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
           char *envp_ext[])
{
  //...... (omitted)
  
  /* call uevent_helper, usually only enabled during early boot */
  if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {
    char *argv [3];

    argv [0] = uevent_helper; // per the article, uevent_helper is "/sbin/hotplug"
    argv [1] = (char *)subsystem;
    argv [2] = NULL;
    retval = add_uevent_var(env, "HOME=/");
    if (retval)
      goto exit;
    retval = add_uevent_var(env,
          "PATH=/sbin:/bin:/usr/sbin:/usr/bin");
    if (retval)
      goto exit;

    retval = call_usermodehelper(argv[0], argv, // the key call; analysed in the text that follows
               env->envp, UMH_WAIT_EXEC);
  }
  
  //...... (omitted)
}

这里call_usermodehelper(...)函数类似于execv函数族，通过下面的函数说明，很容易理解各传入参数的意思：

/**
 * call_usermodehelper() - prepare and start a usermode application
 * @path: path to usermode executable
 * @argv: arg vector for process
 * @envp: environment for process
 * @wait: wait for the application to finish and return status.
 *        when UMH_NO_WAIT don't wait at all, but you get no useful error back
 *        when the program couldn't be exec'ed. This makes it safe to call
 *        from interrupt context.
 *
 * Convenience wrapper around call_usermodehelper_setup() followed by
 * call_usermodehelper_exec(). Returns -ENOMEM when the setup step cannot
 * allocate, otherwise the result of call_usermodehelper_exec().
 */
int call_usermodehelper(char *path, char **argv, char **envp, int wait)
{
  struct subprocess_info *info;
  gfp_t gfp_mask;

  /* UMH_NO_WAIT callers get an atomic allocation, everyone else GFP_KERNEL */
  if (wait == UMH_NO_WAIT)
    gfp_mask = GFP_ATOMIC;
  else
    gfp_mask = GFP_KERNEL;

  /* copy the arguments into a subprocess_info and set up its work item (section 4.1.3.1 of the article) */
  info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
           NULL, NULL, NULL);
  if (!info)
    return -ENOMEM;

  /* hand the work item to the khelper workqueue */
  return call_usermodehelper_exec(info, wait);
}

4.1.3.1 内核对象事件--设置用户层应用程序（/sbin/hotplug）配置

关于call_usermodehelper_xxx函数的使用，可以看下这位大神的文章点击打开链接。

设置用户层程序的参数配置，包括环境变量、工作队列初始化及工作队列的回调函数，call_usermodehelper_setup(...)源码如下：

/*
 * call_usermodehelper_setup - allocate and fill a subprocess_info.
 * @path, @argv, @envp: program, argument vector and environment to run
 * @gfp_mask: allocation flags for the subprocess_info
 * @init, @cleanup, @data: optional callbacks and their private data
 *
 * Binds the work item to __call_usermodehelper() and stores the arguments
 * as given (pointers are stored, nothing is copied). Returns the new
 * structure, or NULL when the allocation fails.
 */
struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
    char **envp, gfp_t gfp_mask,
    int (*init)(struct subprocess_info *info, struct cred *new),
    void (*cleanup)(struct subprocess_info *info),
    void *data)
{
  struct subprocess_info *info = kzalloc(sizeof(*info), gfp_mask);

  if (!info)
    return NULL;

  /* work item whose handler is __call_usermodehelper */
  INIT_WORK(&info->work, __call_usermodehelper);

  info->path = path;
  info->argv = argv;
  info->envp = envp;

  info->cleanup = cleanup;
  info->init = init;
  info->data = data;

  return info;
}

初始化工作队列、并设置其回调函数，__call_usermodehelper(...)函数源码如下：

/*
 * __call_usermodehelper - work handler that spawns the helper thread.
 *
 * Runs as the work function bound by call_usermodehelper_setup(). Depending
 * on the wait mode (with UMH_KILLABLE masked off) it starts either
 * wait_for_helper() or call_helper() as a kernel thread, then frees the
 * subprocess_info (UMH_NO_WAIT) or completes the waiter.
 */
static void __call_usermodehelper(struct work_struct *work)
{
  struct subprocess_info *sub_info =
    container_of(work, struct subprocess_info, work);
  int wait = sub_info->wait & ~UMH_KILLABLE;
  pid_t pid;

  /* CLONE_VFORK: wait until the usermode helper has execve'd
   * successfully We need the data structures to stay around
   * until that is done.  */
  if (wait == UMH_WAIT_PROC)
    pid = kernel_thread(wait_for_helper, sub_info,
            CLONE_FS | CLONE_FILES | SIGCHLD);
  else {
    pid = kernel_thread(call_helper, sub_info,
            CLONE_VFORK | SIGCHLD);
    /* Worker thread stopped blocking khelper thread. */
    kmod_thread_locker = NULL;
  }

  switch (wait) {
  case UMH_NO_WAIT:
    /* nobody waits for the result, so the info is released right here */
    call_usermodehelper_freeinfo(sub_info);
    break;

  case UMH_WAIT_PROC:
    /* thread started: nothing more to do in this handler */
    if (pid > 0)
      break;
    /* FALLTHROUGH */
  case UMH_WAIT_EXEC:
    /* a negative pid is the error from kernel_thread(); report it to the waiter */
    if (pid < 0)
      sub_info->retval = pid;
    umh_complete(sub_info);
  }
}

根据wait参数，上面会创建“wait_for_helper(...)”或“call_helper(...)”两个线程之一，通过进入函数内部分析，它们都调用了____call_usermodehelper(...)函数，源码如下：

/*
 * ____call_usermodehelper - thread body that turns into the user-space helper.
 * @data: the struct subprocess_info describing the program to run
 *
 * Resets signal handlers, CPU affinity and nice level, prepares a new set
 * of credentials restricted by the usermodehelper capability masks, runs
 * the optional init callback and finally execs sub_info->path. Returns 0
 * when the exec succeeded; on any failure the error is stored in
 * sub_info->retval and the thread exits.
 */
static int ____call_usermodehelper(void *data)
{
  struct subprocess_info *sub_info = data;
  struct cred *new;
  int retval;

  spin_lock_irq(&current->sighand->siglock);
  flush_signal_handlers(current, 1);
  spin_unlock_irq(&current->sighand->siglock);

  /* We can run anywhere, unlike our parent keventd(). */
  set_cpus_allowed_ptr(current, cpu_all_mask);

  /*
   * Our parent is keventd, which runs with elevated scheduling priority.
   * Avoid propagating that into the userspace child.
   */
  set_user_nice(current, 0);

  retval = -ENOMEM;
  new = prepare_kernel_cred(current);
  if (!new)
    goto fail;

  /* narrow the capability sets under the sysctl lock */
  spin_lock(&umh_sysctl_lock);
  new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
  new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
               new->cap_inheritable);
  spin_unlock(&umh_sysctl_lock);

  if (sub_info->init) {
    retval = sub_info->init(sub_info, new);
    if (retval) {
      /* the credentials were never committed, so discard them */
      abort_creds(new);
      goto fail;
    }
  }

  commit_creds(new);

  retval = do_execve(sub_info->path,
         (const char __user *const __user *)sub_info->argv,
         (const char __user *const __user *)sub_info->envp);
  if (!retval)
    return 0;

  /* Exec failed? */
fail:
  sub_info->retval = retval;
  do_exit(0);
}

到这里我们见到了熟知的do_execve(...)函数，它的作用执行一个应用程序，这里就是执行"/sbin/hotplug"应用程序。

4.1.3.2 内核对象事件--调用用户层应用程序（/sbin/hotplug）

在4.1.3.1中详细地说明了工作队列的回调函数__call_usermodehelper(...)是如何启动用户层应用程序“/sbin/hotplug”的，这里主要分析如何将工作节点加入到khelper线程的工作队列里，call_usermodehelper_exec(...)函数如下：

/*
 * call_usermodehelper_exec - queue a prepared helper and optionally wait.
 * @sub_info: structure filled by call_usermodehelper_setup()
 * @wait: UMH_NO_WAIT, UMH_WAIT_EXEC or UMH_WAIT_PROC, optionally combined
 *        with UMH_KILLABLE
 *
 * Returns -EINVAL when no path is set, 0 without doing anything when the
 * path is empty, -EBUSY when the helper workqueue is unavailable/disabled
 * or when waiting would block the khelper thread on itself; otherwise 0 for
 * UMH_NO_WAIT, the error of an interrupted killable wait, or the helper's
 * retval. @sub_info is freed here on every path except UMH_NO_WAIT and an
 * interrupted killable wait, where the worker side owns it.
 */
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
  DECLARE_COMPLETION_ONSTACK(done);
  int retval = 0;

  helper_lock();
  if (!sub_info->path) {
    retval = -EINVAL;
    goto out;
  }

  if (sub_info->path[0] == '\0')
    goto out;

  if (!khelper_wq || usermodehelper_disabled) {
    retval = -EBUSY;
    goto out;
  }
  /*
   * Worker thread must not wait for khelper thread at below
   * wait_for_completion() if the thread was created with CLONE_VFORK
   * flag, for khelper thread is already waiting for the thread at
   * wait_for_completion() in do_fork().
   */
  if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
    retval = -EBUSY;
    goto out;
  }

  sub_info->complete = &done;
  sub_info->wait = wait;

  // khelper_wq is created at boot; queue sub_info->work on it, which wakes the worker that runs __call_usermodehelper
  queue_work(khelper_wq, &sub_info->work); 
  if (wait == UMH_NO_WAIT)  /* task has freed sub_info */
    goto unlock;

  if (wait & UMH_KILLABLE) {
    retval = wait_for_completion_killable(&done);
    if (!retval)
      goto wait_done;

    /* umh_complete() will see NULL and free sub_info */
    if (xchg(&sub_info->complete, NULL))
      goto unlock;
    /* fallthrough, umh_complete() was already called */
  }

  wait_for_completion(&done); // block until the worker signals completion
wait_done:
  retval = sub_info->retval;
out:
  call_usermodehelper_freeinfo(sub_info);
unlock:
  helper_unlock();
  return retval;
}

总结：4.1节涉及的内容较多，主要是kset内核集合“devices”的创建、注册，注册“devices”时，通过/sbin/hotplug热插拔程序通知用户层，U盘的热插拔就是这个原理。

这里还有一个问题没有解决，/sbin/hotplug内部是如何实现的？？？？？？？？？？？

4.2 内核对象kobject创建，函数kobject_create_and_add(...)

/*
 * kobject_create_and_add - allocate a kobject and add it under @parent.
 * @name: name of the new kobject
 * @parent: parent kobject, may be NULL
 *
 * Adding the object links it to @parent, which is what produces the
 * /sys/<parent>/<name> directory layout. Returns the new kobject, or NULL
 * when the allocation or the add fails (the add failure is logged and the
 * object is released).
 */
struct kobject *kobject_create_and_add(const char *name, struct kobject *parent)
{
  struct kobject *kobj = kobject_create();
  int retval;

  if (!kobj)
    return NULL;

  retval = kobject_add(kobj, parent, "%s", name);
  if (retval == 0)
    return kobj;

  printk(KERN_WARNING "%s: kobject_add error: %d\n",
         __func__, retval);
  kobject_put(kobj);
  return NULL;
}

而kobject_add(...)-->kobject_add_varg(...)-->kobject_set_name_vargs(...)和kobject_add_internal(...)这两个函数在4.1节有分析过，这里不再赘述，至此理解内核对象的创建、添加，即理解了文件系统/sys/xx下的原理！

4.3 内核对象链表创建

4.4 netlink通信机制

另外netlink/kmod通讯机制，最终都会调用/sbin/hotplug，即mdev热插拔，由于mdev水太深，另起一节介绍，我们这里只需要了解内核对象、内核集合的工作原理，它们最终调用用户层/sbin/hotplug(mdev热插拔)完成设备在/dev下的注册。

4.5 热插拔机制/sbin/hotplug(mdev)
/sbin/hotplug使用的是mdev工具，通过busybox找到mdev源码如下：

/*
 * mdev_main - entry point of the BusyBox mdev applet.
 *
 * "mdev -s" scans /sys/block and /sys/class and handles every entry found.
 * Without -s the applet acts as the kernel hotplug helper: it reads ACTION,
 * DEVPATH and optionally SEQNUM from the environment, waits (up to about
 * two seconds) for its turn according to the mdev.seq file, then creates or
 * removes the device node for "/sys" + DEVPATH. Returns 0.
 */
int mdev_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
int mdev_main(int argc UNUSED_PARAM, char **argv)
{
  RESERVE_CONFIG_BUFFER(temp, PATH_MAX + SCRATCH_SIZE);

  /* We can be called as hotplug helper */
  /* Kernel cannot provide suitable stdio fds for us, do it ourself */
#if 1
  bb_sanitize_stdio();
#else
  /* Debug code */
  /* Replace LOGFILE by other file or device name if you need */
#define LOGFILE "/dev/console"
  /* Just making sure fd 0 is not closed,
   * we don't really intend to read from it */
  xmove_fd(xopen("/", O_RDONLY), STDIN_FILENO);
  xmove_fd(xopen(LOGFILE, O_WRONLY|O_APPEND), STDOUT_FILENO);
  xmove_fd(xopen(LOGFILE, O_WRONLY|O_APPEND), STDERR_FILENO);
#endif

  xchdir("/dev"); /* all node names below are relative to /dev */

  if (argv[1] && !strcmp(argv[1], "-s")) { /* not taken when run as the hotplug helper */
    /* Scan:
     * mdev -s
     */
    struct stat st;

    xstat("/", &st);
    root_major = major(st.st_dev);
    root_minor = minor(st.st_dev);

    /* ACTION_FOLLOWLINKS is needed since in newer kernels
     * /sys/block/loop* (for example) are symlinks to dirs,
     * not real directories.
     * (kernel's CONFIG_SYSFS_DEPRECATED makes them real dirs,
     * but we can't enforce that on users) */
    recursive_action("/sys/block",
      ACTION_RECURSE | ACTION_FOLLOWLINKS,
      fileAction, dirAction, temp, 0);
    recursive_action("/sys/class",
      ACTION_RECURSE | ACTION_FOLLOWLINKS,
      fileAction, dirAction, temp, 0);
  } else {
    char *seq;
    char *action;
    char *env_path;
    char seqbuf[sizeof(int)*3 + 2];
    /* only read when seq != NULL, in which case the loop below assigns it
     * first; a real initializer avoids reading an indeterminate value */
    int seqlen = -1;

    /* Hotplug:
     * env ACTION=... DEVPATH=... [SEQNUM=...] mdev
     * ACTION can be "add" or "remove"
     * DEVPATH is like "/block/sda" or "/class/input/mice"
     */
    action = getenv("ACTION"); /* set by the kernel, see section 4.1.2 of the article */
    env_path = getenv("DEVPATH"); /* set by the kernel, see section 4.1.2 of the article */
    if (!action || !env_path)
      bb_show_usage();

    seq = getenv("SEQNUM");
    if (seq) {
      int timeout = 2000 / 32;
      do {
        /* Leave room for the terminator. The previous
         * sizeof(seqbuf-1) was the size of a pointer, which
         * truncated the read on 32-bit targets. */
        seqlen = open_read_close("mdev.seq", seqbuf, sizeof(seqbuf) - 1);
        if (seqlen < 0)
          break;
        seqbuf[seqlen] = '\0';
        if (seqbuf[0] == '\n' /* seed file? */
         || strcmp(seq, seqbuf) == 0 /* correct idx? */
        ) {
          break;
        }
        usleep(32*1000);
      } while (--timeout);
    }

    /* DEVPATH carries no "/sys" prefix; it is prepended here, giving e.g. /sys/dev/char */
    snprintf(temp, PATH_MAX, "/sys%s", env_path);
    if (!strcmp(action, "remove"))
      make_device(temp, 1);
    else if (!strcmp(action, "add")) {
      make_device(temp, 0);

      if (ENABLE_FEATURE_MDEV_LOAD_FIRMWARE) {
        char *fw = getenv("FIRMWARE");
        if (fw)
          load_firmware(fw, temp);
      }
    }

    /* let the next queued event proceed */
    if (seq && seqlen >= 0) {
      xopen_xwrite_close("mdev.seq", utoa(xatou(seq) + 1));
    }
  }

  if (ENABLE_FEATURE_CLEAN_UP)
    RELEASE_CONFIG_BUFFER(temp);

  return 0;
}
获取设备路径环境变量，在上面的4.1.2节有详细描述
    if (!action || !env_path)
      bb_show_usage();

    seq = getenv("SEQNUM");
    if (seq) {
      int timeout = 2000 / 32;
      do {
        seqlen = open_read_close("mdev.seq", seqbuf, sizeof(seqbuf-1));
        if (seqlen < 0)
          break;
        seqbuf[seqlen] = '\0';
        if (seqbuf[0] == '\n' /* seed file? */
         || strcmp(seq, seqbuf) == 0 /* correct idx? */
        ) {
          break;
        }
        usleep(32*1000);
      } while (--timeout);
    }

    snprintf(temp, PATH_MAX, "/sys%s", env_path); //这里要特别注意，之前在4.1.2节中以为加了/sys路径，其实/sys路径是在这里添加的，即/sys/dev/char
    if (!strcmp(action, "remove"))
      make_device(temp, 1);
    else if (!strcmp(action, "add")) { //成立
      make_device(temp, 0); //见下面

      if (ENABLE_FEATURE_MDEV_LOAD_FIRMWARE) {
        char *fw = getenv("FIRMWARE");
        if (fw)
          load_firmware(fw, temp);
      }
    }

    if (seq && seqlen >= 0) {
      xopen_xwrite_close("mdev.seq", utoa(xatou(seq) + 1));
    }
  }

  if (ENABLE_FEATURE_CLEAN_UP)
    RELEASE_CONFIG_BUFFER(temp);

  return 0;
}

在/dev下创建设备节点make_device(...)函数，源码如下：

/* mknod in /dev based on a path like "/sys/block/hda/hda1" */
/* NB: "mdev -s" may call us many times, do not leak memory/fds! */
/*
 * path:   sysfs directory of the device ("/sys" + DEVPATH when called from
 *         mdev_main); the buffer must have writable space after the string
 *         because "/dev" and the file contents are appended in place.
 * delete: 0 to create the node, non-zero to remove it.
 *
 * With ENABLE_FEATURE_MDEV_CONF the first matching line of /etc/mdev.conf
 * supplies owner, mode, an optional alias and an optional command.
 */
static void make_device(char *path, int delete)
{
  const char *device_name;
  int major, minor, type, len;
  int mode = 0660;
#if ENABLE_FEATURE_MDEV_CONF
  struct bb_uidgid_t ugid = { 0, 0 };
  parser_t *parser;
  char *tokens[5];
#endif
#if ENABLE_FEATURE_MDEV_EXEC
  char *command = NULL;
#endif
#if ENABLE_FEATURE_MDEV_RENAME
  char *alias = NULL;
  char aliaslink = aliaslink; /* for compiler */
#endif
  char *dev_maj_min = path + strlen(path);

  /* Force the configuration file settings exactly. */
  umask(0);

  /* Try to read major/minor string.  Note that the kernel puts \n after
   * the data, so we don't need to worry about null terminating the string
   * because sscanf() will stop at the first nondigit, which \n is.
   * We also depend on path having writeable space after it.
   */
  major = -1;
  if (!delete) { /* adding a device */
    strcpy(dev_maj_min, "/dev");
    /* path is now "<sysfs dir>/dev"; its contents ("major:minor") are
     * read into the buffer right after the '/' written above */
    len = open_read_close(path, dev_maj_min + 1, 64);
    /* cut path back to the sysfs dir; dev_maj_min now points at the data read */
    *dev_maj_min++ = '\0';
    if (len < 1) {
      if (!ENABLE_FEATURE_MDEV_EXEC)
        return;
      /* no "dev" file, so just try to run script */
      *dev_maj_min = '\0';
    } else if (sscanf(dev_maj_min, "%u:%u", &major, &minor) != 2) {
      major = -1;
    }
  }

  /* Determine device name, type, major and minor */
  device_name = bb_basename(path); /* last component of the sysfs path is the device name */
  /* http://kernel.org/doc/pending/hotplug.txt says that only
   * "/sys/block/..." is for block devices. "/sys/bus" etc is not.
   * But since 2.6.25 block devices are also in /sys/class/block.
   * We use strstr("/block/") to forestall future surprises. */
  type = S_IFCHR;
  if (strstr(path, "/block/")) /* anything under a "/block/" directory is a block device */
    type = S_IFBLK;

#if ENABLE_FEATURE_MDEV_CONF
  parser = config_open2("/etc/mdev.conf", fopen_for_read);

  /* If we have config file, look up user settings */
  while (config_read(parser, tokens, 4, 3, "# \t", PARSE_NORMAL)) {
    regmatch_t off[1 + 9*ENABLE_FEATURE_MDEV_RENAME_REGEXP];
    char *val;

    /* Fields: regex uid:gid mode [alias] [cmd] */

    /* 1st field: @<numeric maj,min>... */
    if (tokens[0][0] == '@') {
      /* @major,minor[-last] */
      /* (useful when name is ambiguous:
       * "/sys/class/usb/lp0" and
       * "/sys/class/printer/lp0") */
      int cmaj, cmin0, cmin1, sc;
      if (major < 0)
        continue; /* no dev, no match */
      sc = sscanf(tokens[0], "@%u,%u-%u", &cmaj, &cmin0, &cmin1);
      if (sc < 1 || major != cmaj
       || (sc == 2 && minor != cmin0)
       || (sc == 3 && (minor < cmin0 || minor > cmin1))
      ) {
        continue; /* no match */
      }
    } else { /* ... or regex to match device name */
      regex_t match;
      int result;

      /* Is this it? */
      xregcomp(&match, tokens[0], REG_EXTENDED);
      result = regexec(&match, device_name, ARRAY_SIZE(off), off, 0);
      regfree(&match);

      //bb_error_msg("matches:");
      //for (int i = 0; i < ARRAY_SIZE(off); i++) {
      //  if (off[i].rm_so < 0) continue;
      //  bb_error_msg("match %d: '%.*s'\n", i,
      //    (int)(off[i].rm_eo - off[i].rm_so),
      //    device_name + off[i].rm_so);
      //}

      /* If not this device, skip rest of line */
      /* (regexec returns whole pattern as "range" 0) */
      if (result || off[0].rm_so
       || ((int)off[0].rm_eo != (int)strlen(device_name))
      ) {
        continue;
      }
    }

    /* This line matches: stop parsing the file
     * after parsing the rest of fields */

    /* 2nd field: uid:gid - device ownership */
    parse_chown_usergroup_or_die(&ugid, tokens[1]);

    /* 3rd field: mode - device permissions */
    mode = strtoul(tokens[2], NULL, 8);

    val = tokens[3];
    /* 4th field (opt): >alias */
#if ENABLE_FEATURE_MDEV_RENAME
    if (!val)
      break;
    aliaslink = *val;
    if (aliaslink == '>' || aliaslink == '=') {
      char *s;
#if ENABLE_FEATURE_MDEV_RENAME_REGEXP
      char *p;
      unsigned i, n;
#endif
      char *a = val;
      s = strchr(val, ' ');
      val = (s && s[1]) ? s+1 : NULL;
#if ENABLE_FEATURE_MDEV_RENAME_REGEXP
      /* substitute %1..9 with off[1..9], if any */
      n = 0;
      s = a;
      while (*s)
        if (*s++ == '%')
          n++;

      p = alias = xzalloc(strlen(a) + n * strlen(device_name));
      s = a + 1;
      while (*s) {
        *p = *s;
        if ('%' == *s) {
          i = (s[1] - '0');
          if (i <= 9 && off[i].rm_so >= 0) {
            n = off[i].rm_eo - off[i].rm_so;
            strncpy(p, device_name + off[i].rm_so, n);
            p += n - 1;
            s++;
          }
        }
        p++;
        s++;
      }
#else
      alias = xstrdup(a + 1);
#endif
    }
#endif /* ENABLE_FEATURE_MDEV_RENAME */

#if ENABLE_FEATURE_MDEV_EXEC
    /* The rest (opt): command to run */
    if (!val)
      break;
    {
      const char *s = "@$*";
      const char *s2 = strchr(s, *val);

      if (!s2)
        bb_error_msg_and_die("bad line %u", parser->lineno);

      /* Correlate the position in the "@$*" with the delete
       * step so that we get the proper behavior:
       * @cmd: run on create
       * $cmd: run on delete
       * *cmd: run on both
       */
      if ((s2 - s + 1) /*1/2/3*/ & /*1/2*/ (1 + delete)) {
        command = xstrdup(val + 1);
      }
    }
#endif
    /* end of field parsing */
    break; /* we found matching line, stop */
  } /* end of "while line is read from /etc/mdev.conf" */

  config_close(parser);
#endif /* ENABLE_FEATURE_MDEV_CONF */

  if (!delete && major >= 0) {

    if (ENABLE_FEATURE_MDEV_RENAME)
      unlink(device_name);

    /* create the node in the current directory; an already existing node is not an error */
    if (mknod(device_name, mode | type, makedev(major, minor)) && errno != EEXIST)
      bb_perror_msg_and_die("mknod %s", device_name); /* reached only when mknod() failed */

    if (major == root_major && minor == root_minor)
      symlink(device_name, "root");

#if ENABLE_FEATURE_MDEV_CONF
    chown(device_name, ugid.uid, ugid.gid);

#if ENABLE_FEATURE_MDEV_RENAME
    if (alias) {
      alias = build_alias(alias, device_name);

      /* move the device, and optionally
       * make a symlink to moved device node */
      if (rename(device_name, alias) == 0 && aliaslink == '>')
        symlink(alias, device_name);

      free(alias);
    }
#endif
#endif
  }

#if ENABLE_FEATURE_MDEV_EXEC
  if (command) {
    /* setenv will leak memory, use putenv/unsetenv/free */
    char *s = xasprintf("MDEV=%s", device_name);
    putenv(s);
    if (system(command) == -1)
      bb_perror_msg_and_die("can't run '%s'", command);
    /* truncate "MDEV=..." to "MDEV" so it can be passed to unsetenv() */
    s[4] = '\0';
    unsetenv(s);
    free(s);
    free(command);
  }
#endif

  if (delete) {
    unlink(device_name);
    /* At creation time, device might have been moved
     * and a symlink might have been created. Undo that. */
#if ENABLE_FEATURE_MDEV_RENAME
    if (alias) {
      alias = build_alias(alias, device_name);
      unlink(alias);
      free(alias);
    }
#endif
  }
}

make_device函数最终会调用mknod(device_name, mode | type, makedev(major, minor))完成设备节点在/dev下的创建；只有当mknod失败（且错误不是EEXIST）时，才会调用bb_perror_msg_and_die("mknod %s", device_name)报错退出。

总结：

本篇涉及到的知识点比较多，包括内核集合kset、内核对象kobject，内核集合和内核对象之间的关联性，驱动在注册的时候最终会在内核层通过uevent机制调用用户层程序/sbin/hotplug(mdev)，在mdev里通过mknod()调用完成设备的注册、并在文件系统/dev下创建相应的设备节点。