1. 前言

     本博客主要分析在应用层使用ifconfig命令时所引发的系统调用、及内核路由表中的地址添加流程。

     FIB: forwarding information base(转发信息库),对应内核中的fib_info结构体

2. ifconfig命令

在应用层使用命令:ifconfig eth0 192.168.1.100 up 配置本地ip地址时将调用busybox源码中的ifconfig.c,具体流程如下

/*
 * ifconfig_main() - entry point of the busybox ifconfig applet.
 *
 * argv[0] is the applet name and is skipped.  With zero or one remaining
 * argument the function only displays interface status (or dies when status
 * display is not compiled in).  Otherwise the first argument is taken as the
 * interface name and every following argument is matched against OptArray:
 *   - a match that takes a value (ARG_MASK) consumes the next argv entry and
 *     issues the ioctl named by the corresponding Arg1Opt entry;
 *   - an unmatched argument is treated as a host address (label HOSTNAME);
 *   - flag-style options fall through to the SIOCGIFFLAGS/SIOCSIFFLAGS pair
 *     at the bottom of the loop.
 *
 * Returns 0 on success; most failures terminate the process through the
 * x*()/..._and_die() helpers.
 */
int ifconfig_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
int ifconfig_main(int argc UNUSED_PARAM, char **argv)
{
struct ifreq ifr;
struct sockaddr_in sai;
#if ENABLE_FEATURE_IFCONFIG_HW
struct sockaddr sa;
#endif
const struct arg1opt *a1op;
const struct options *op;
int sockfd; /* socket fd we use to manipulate stuff with */
int selector;
#if ENABLE_FEATURE_IFCONFIG_BROADCAST_PLUS
unsigned int mask;
unsigned int did_flags;
unsigned int sai_hostname, sai_netmask;
#else
unsigned char mask;
unsigned char did_flags;
#endif
char *p;
/*char host[128];*/
const char *host = NULL; /* make gcc happy */

did_flags = 0;
#if ENABLE_FEATURE_IFCONFIG_BROADCAST_PLUS
sai_hostname = 0;
sai_netmask = 0;
#endif

/* Running example used in the notes below: ifconfig eth0 192.168.1.100 up */

/* skip argv[0] */
++argv; /* step over the applet name "ifconfig" */

#if ENABLE_FEATURE_IFCONFIG_STATUS
/* `ifconfig -a`: argv[0] is exactly "-a" (per the original note: list all interfaces, e.g. eth0, usb0) */
if (argv[0] && (argv[0][0] == '-' && argv[0][1] == 'a' && !argv[0][2])) {
interface_opt_a = 1;
++argv;
}
#endif

if (!argv[0] || !argv[1]) { /* one or no args */
#if ENABLE_FEATURE_IFCONFIG_STATUS
return display_interfaces(argv[0] /* can be NULL */); /* status-display path of plain `ifconfig` */
#else
bb_error_msg_and_die("no support for status display");
#endif
}

/* Create a channel to the NET kernel. */
sockfd = xsocket(AF_INET, SOCK_DGRAM, 0);

/* In the running example (ifconfig eth0 192.168.1.100 up) this sets */
/* ifr.ifr_name to "eth0". */
/* get interface name */
strncpy_IFNAMSIZ(ifr.ifr_name, *argv);

/* Process the remaining arguments. */
while (*++argv != (char *) NULL) { /* first iteration: *argv is the address "192.168.1.100" */
p = *argv; /* p points at the current argument text */
mask = N_MASK;
if (*p == '-') { /* If the arg starts with '-'... */
++p; /* advance past it and */
mask = M_MASK; /* set the appropriate mask. */
}

/* The entries of OptArray correspond to the options listed by `ifconfig --help`. */

for (op = OptArray; op->name; op++) { /* Find table entry. */
if (strcmp(p, op->name) == 0) { /* If name matches... */
mask &= op->flags;
if (mask) /* set the mask and go. */
goto FOUND_ARG;
/* If we get here, there was a valid arg with an */
/* invalid '-' prefix. */
bb_error_msg_and_die("bad: '%s'", p-1);
}
}

/* We fell through, so treat as possible hostname. */
a1op = Arg1Opt + ARRAY_SIZE(Arg1Opt) - 1;
mask = op->arg_flags;
goto HOSTNAME; /* no option matched: treat the argument as a host/IP address */

FOUND_ARG:
if (mask & ARG_MASK) {
mask = op->arg_flags;
a1op = Arg1Opt + (op - OptArray);
if (mask & A_NETMASK & did_flags)
bb_show_usage();
if (*++argv == NULL) {
if (mask & A_ARG_REQ)
bb_show_usage();
--argv;
mask &= A_SET_AFTER; /* just for broadcast */
} else { /* got an arg so process it */
HOSTNAME:
did_flags |= (mask & (A_NETMASK|A_HOSTNAME));
if (mask & A_CAST_HOST_COPY) {
#if ENABLE_FEATURE_IFCONFIG_HW /* hardware (MAC) address support compiled in */
if (mask & A_CAST_RESOLVE) {
#endif
#if ENABLE_FEATURE_IPV6 /* IPv6 support compiled in */
char *prefix;
int prefix_len = 0;
#endif
/*safe_strncpy(host, *argv, (sizeof host));*/
host = *argv; /* address text, e.g. host = "192.168.1.100" */
#if ENABLE_FEATURE_IPV6
prefix = strchr(host, '/');
if (prefix) {
prefix_len = xatou_range(prefix + 1, 0, 128);
*prefix = '\0';
}
#endif
sai.sin_family = AF_INET;
sai.sin_port = 0;
if (strcmp(host, "default") == 0) { /* "default" maps to 0.0.0.0 */
/* Default is special, meaning 0.0.0.0. */
sai.sin_addr.s_addr = INADDR_ANY;
}
#if ENABLE_FEATURE_IFCONFIG_BROADCAST_PLUS
else if ((host[0] == '+' && !host[1]) && (mask & A_BROADCAST)
&& (did_flags & (A_NETMASK|A_HOSTNAME)) == (A_NETMASK|A_HOSTNAME)
) {
/* + is special, meaning broadcast is derived. */
sai.sin_addr.s_addr = (~sai_netmask) | (sai_hostname & sai_netmask);
}
#endif
else {
len_and_sockaddr *lsa;
if (strcmp(host, "inet") == 0)
continue; /* compat stuff */
lsa = xhost2sockaddr(host, 0); /* convert the host text into a socket address */
#if ENABLE_FEATURE_IPV6
if (lsa->u.sa.sa_family == AF_INET6) {
int sockfd6;
struct in6_ifreq ifr6;

memcpy((char *) &ifr6.ifr6_addr,
(char *) &(lsa->u.sin6.sin6_addr),
sizeof(struct in6_addr));

/* Create a channel to the NET kernel. */
sockfd6 = xsocket(AF_INET6, SOCK_DGRAM, 0);
xioctl(sockfd6, SIOGIFINDEX, &ifr);
ifr6.ifr6_ifindex = ifr.ifr_ifindex;
ifr6.ifr6_prefixlen = prefix_len;
ioctl_or_perror_and_die(sockfd6, a1op->selector, &ifr6, "SIOC%s", a1op->name);
if (ENABLE_FEATURE_CLEAN_UP)
free(lsa);
continue;
}
#endif
sai.sin_addr = lsa->u.sin.sin_addr;
if (ENABLE_FEATURE_CLEAN_UP)
free(lsa);
}
#if ENABLE_FEATURE_IFCONFIG_BROADCAST_PLUS
if (mask & A_HOSTNAME)
sai_hostname = sai.sin_addr.s_addr;
if (mask & A_NETMASK)
sai_netmask = sai.sin_addr.s_addr;
#endif
p = (char *) &sai;
#if ENABLE_FEATURE_IFCONFIG_HW
} else { /* A_CAST_HOST_COPY_IN_ETHER */
/* This is the "hw" arg case. */
smalluint hw_class= index_in_substrings("ether\0"
IF_FEATURE_HWIB("infiniband\0"), *argv) + 1;
if (!hw_class || !*++argv)
bb_show_usage();
/*safe_strncpy(host, *argv, sizeof(host));*/
host = *argv;
if (hw_class == 1 ? in_ether(host, &sa) : in_ib(host, &sa))
bb_error_msg_and_die("invalid hw-addr %s", host);
p = (char *) &sa;
}
#endif
/* Copy the prepared sockaddr into the ifreq field selected by a1op. */
memcpy( (((char *)&ifr) + a1op->ifr_offset),
p, sizeof(struct sockaddr));
} else {
/* FIXME: error check?? */
unsigned long i = strtoul(*argv, NULL, 0);
p = ((char *)&ifr) + a1op->ifr_offset;
#if ENABLE_FEATURE_IFCONFIG_MEMSTART_IOADDR_IRQ
if (mask & A_MAP_TYPE) {
xioctl(sockfd, SIOCGIFMAP, &ifr);
if ((mask & A_MAP_UCHAR) == A_MAP_UCHAR)
*((unsigned char *) p) = i;
else if (mask & A_MAP_USHORT)
*((unsigned short *) p) = i;
else
*((unsigned long *) p) = i;
} else
#endif
if (mask & A_CAST_CHAR_PTR)
*((caddr_t *) p) = (caddr_t) i;
else /* A_CAST_INT */
*((int *) p) = i;
}

ioctl_or_perror_and_die(sockfd, a1op->selector, &ifr, "SIOC%s", a1op->name); /* the ioctl that actually applies the value (the IP address in the running example) */
#ifdef QUESTIONABLE_ALIAS_CASE
if (mask & A_COLON_CHK) {
/*
* Don't do the set_flag() if the address is an alias with
* a '-' at the end, since it's deleted already! - Roman
*
* Should really use regex.h here, not sure though how well
* it'll go with the cross-platform support etc.
*/
char *ptr;
short int found_colon = 0;
for (ptr = ifr.ifr_name; *ptr; ptr++)
if (*ptr == ':')
found_colon++;
if (found_colon && ptr[-1] == '-')
continue;
}
#endif
}
if (!(mask & A_SET_AFTER))
continue;
mask = N_SET;
}

/* Flag-style option: read the current flags, set or clear the option's bits, write them back. */
xioctl(sockfd, SIOCGIFFLAGS, &ifr);
selector = op->selector;
if (mask & SET_MASK)
ifr.ifr_flags |= selector;
else
ifr.ifr_flags &= ~selector;
xioctl(sockfd, SIOCSIFFLAGS, &ifr);
} /* while () */

if (ENABLE_FEATURE_CLEAN_UP)
close(sockfd);
return 0;
}

假设在应用层通过ifconfig配置IP地址,命令为:ifconfig eth0 192.168.1.100 up,那么argv={"ifconfig", "eth0", "192.168.1.100", "up"}将通过形参传入ifconfig_main(),由ifconfig_main()完成对该字符串数组的解析,最后调用如下函数

ioctl_or_perror_and_die(sockfd, a1op->selector, &ifr, "SIOC%s", a1op->name);  //最后调用这里设置IP地址

其中a1op->selector=SIOCSIFADDR

/*
 * Excerpt of the Arg1Opt table.  Each entry holds a name (printed through the
 * "SIOC%s" format by the caller), the ioctl request number, and the offset of
 * the struct ifreq member that receives the value.
 */
static const struct arg1opt Arg1Opt[] = {
//...
/* Used when the argument is a host address: request SIOCSIFADDR, value stored in ifr_addr. */
{ "SIFADDR", SIOCSIFADDR, ifreq_offsetof(ifr_addr) },
};

其中ioctl_or_perror_and_die()函数如下

ioctl_or_perror_and_die(sockfd, a1op->selector, &ifr, "SIOC%s", a1op->name);  //最后调用这里设置IP地址

/*
 * Run ioctl(fd, request, argp).  When the call fails, report the
 * printf-style message described by fmt/... together with strerror(errno)
 * and terminate through xfunc_die().  On success the ioctl() result is
 * handed back to the caller.
 */
int FAST_FUNC ioctl_or_perror_and_die(int fd, unsigned request, void *argp, const char *fmt,...)
{
	va_list args;
	int rc = ioctl(fd, request, argp); /* the actual system call */

	if (rc >= 0)
		return rc;

	va_start(args, fmt);
	bb_verror_msg(fmt, args, strerror(errno));
	/* xfunc_die can actually longjmp, so be nice */
	va_end(args);
	xfunc_die();
	return rc;
}

所以最终调用ioctl、request=SIOCSIFADDR,传入内核系统调用inet_ioctl()函数

3. ifconfig在内核部分的调用流程分析

在上面的xsocket中打开的套接字类型为SOCK_DGRAM(即数据报套接字,而非流套接字),严格来说它对应的操作集是inet_dgram_ops;不过inet_dgram_ops与下面列出的inet_stream_ops一样,.ioctl成员都指向inet_ioctl(),所以最终都会调用inet_ioctl()函数

/*
 * Socket-layer operation table for PF_INET stream sockets.
 * NOTE(review): the busybox code opens its socket with SOCK_DGRAM, so the
 * table actually consulted is presumably the datagram one (inet_dgram_ops);
 * confirm against the kernel tree.  The member of interest here is .ioctl,
 * which is set to inet_ioctl.
 */
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl, /* kernel-side handler reached by the ioctl() calls issued from busybox */
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);

inet_ioctl源码

/*
 * inet_ioctl() - dispatch an ioctl issued on a PF_INET socket.
 *
 * Timestamp requests are answered from the sock, routing requests go to
 * ip_rt_ioctl(), ARP requests to arp_ioctl(), and interface address/flag
 * requests (including SIOCSIFADDR) to devinet_ioctl().  Anything else is
 * passed to the protocol's own ioctl hook when one exists, otherwise
 * -ENOIOCTLCMD is returned.
 */
int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);

	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);

	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

	/* routing table manipulation */
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCRTMSG:
		return ip_rt_ioctl(net, cmd, (void __user *)arg);

	/* ARP cache manipulation */
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
		return arp_ioctl(net, cmd, (void __user *)arg);

	/* interface address / flag requests; SIOCSIFADDR is the one sent by ifconfig */
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFPFLAGS:
	case SIOCGIFPFLAGS:
	case SIOCSIFFLAGS:
		return devinet_ioctl(net, cmd, (void __user *)arg);

	default:
		break;
	}

	/* Not handled here: defer to the protocol, if it implements ioctl. */
	if (!sk->sk_prot->ioctl)
		return -ENOIOCTLCMD;
	return sk->sk_prot->ioctl(sk, cmd, arg);
}
EXPORT_SYMBOL(inet_ioctl);

通过在busybox中调用的命令 SIOCSIFADDR,知调用接口函数devinet_ioctl()

/*
 * devinet_ioctl() - handle the IPv4 interface address/flag ioctls.
 *
 * @net: network namespace the request applies to
 * @cmd: ioctl request (SIOCGIF* / SIOCSIF* address, netmask, broadcast,
 *       destination, or SIOCSIFFLAGS); any other value yields -EINVAL
 * @arg: user-space pointer to a struct ifreq
 *
 * The ifreq is copied in, the "set" requests are permission-checked
 * (CAP_NET_ADMIN) and validated, then under rtnl_lock() the device named in
 * ifr_name and its matching in_ifaddr are looked up and the request is
 * applied.  "Get" requests copy the filled-in ifreq back to user space.
 *
 * Returns 0 or the helper's result on success, a negative errno otherwise
 * (-EFAULT, -EPERM, -EINVAL, -ENODEV, -EADDRNOTAVAIL, -ENOBUFS).
 */
int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
struct ifreq ifr;
struct sockaddr_in sin_orig;
struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
struct in_device *in_dev;
struct in_ifaddr **ifap = NULL;
struct in_ifaddr *ifa = NULL;
struct net_device *dev;
char *colon;
int ret = -EFAULT;
int tryaddrmatch = 0;

/*
* Fetch the caller's info block into kernel space
*/

if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
goto out;
ifr.ifr_name[IFNAMSIZ - 1] = 0;

/* save original address for comparison */
memcpy(&sin_orig, sin, sizeof(*sin));

/* Temporarily cut an alias label such as "eth0:1" down to the device name. */
colon = strchr(ifr.ifr_name, ':');
if (colon)
*colon = 0;

dev_load(net, ifr.ifr_name);

switch (cmd) {
case SIOCGIFADDR: /* Get interface address */
case SIOCGIFBRDADDR: /* Get the broadcast address */
case SIOCGIFDSTADDR: /* Get the destination address */
case SIOCGIFNETMASK: /* Get the netmask for the interface */
/* Note that these ioctls will not sleep,
so that we do not impose a lock.
One day we will be forced to put shlock here (I mean SMP)
*/
tryaddrmatch = (sin_orig.sin_family == AF_INET);
memset(sin, 0, sizeof(*sin));
sin->sin_family = AF_INET;
break;

case SIOCSIFFLAGS:
ret = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
break;
case SIOCSIFADDR: /* Set interface address (and family) */
case SIOCSIFBRDADDR: /* Set the broadcast address */
case SIOCSIFDSTADDR: /* Set the destination address */
case SIOCSIFNETMASK: /* Set the netmask for the interface */
ret = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
ret = -EINVAL;
if (sin->sin_family != AF_INET)
goto out;
break;
default:
ret = -EINVAL;
goto out;
}

rtnl_lock();

ret = -ENODEV;
dev = __dev_get_by_name(net, ifr.ifr_name); /* look the device up by name (e.g. "eth0") */
if (!dev)
goto done;

/* Restore the full label so alias names can be compared below. */
if (colon)
*colon = ':';

in_dev = __in_dev_get_rtnl(dev); /* IPv4 per-device state (struct in_device) hanging off the net_device */
if (in_dev) {
if (tryaddrmatch) {
/* Matthias Andree */
/* compare label and address (4.4BSD style) */
/* note: we only do this for a limited set of ioctls
and only if the original address family was AF_INET.
This is checked above. */
for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; /* walk the addresses configured on this device */
ifap = &ifa->ifa_next) {
if (!strcmp(ifr.ifr_name, ifa->ifa_label) && /* same label (e.g. "eth0") */
sin_orig.sin_addr.s_addr ==
ifa->ifa_local) { /* and same local address */
break; /* found */
}
}
}
/* we didn't get a match, maybe the application is
4.3BSD-style and passed in junk so we fall back to
comparing just the label */
if (!ifa) { /* no label+address match so far */
for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
ifap = &ifa->ifa_next)
if (!strcmp(ifr.ifr_name, ifa->ifa_label)) /* compare the label only */
break;
}
}

/* Only SIOCSIFADDR and SIOCSIFFLAGS may proceed without an existing address entry. */
ret = -EADDRNOTAVAIL;
if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
goto done;

switch (cmd) {
case SIOCGIFADDR: /* Get interface address */
sin->sin_addr.s_addr = ifa->ifa_local;
goto rarok;

case SIOCGIFBRDADDR: /* Get the broadcast address */
sin->sin_addr.s_addr = ifa->ifa_broadcast;
goto rarok;

case SIOCGIFDSTADDR: /* Get the destination address */
sin->sin_addr.s_addr = ifa->ifa_address;
goto rarok;

case SIOCGIFNETMASK: /* Get the netmask for the interface */
sin->sin_addr.s_addr = ifa->ifa_mask;
goto rarok;

case SIOCSIFFLAGS: /* flag change request (busybox issues SIOCSIFFLAGS for options such as "up") */
if (colon) {
ret = -EADDRNOTAVAIL;
if (!ifa)
break;
ret = 0;
if (!(ifr.ifr_flags & IFF_UP))
inet_del_ifa(in_dev, ifap, 1);
break;
}
ret = dev_change_flags(dev, ifr.ifr_flags); /* apply the new flags to the device */
break;

case SIOCSIFADDR: /* Set interface address (and family) */ /* path taken when ifconfig sets the IP address */
ret = -EINVAL;
if (inet_abc_len(sin->sin_addr.s_addr) < 0)
break;

if (!ifa) {
/* No entry yet for this label: allocate a fresh one. */
ret = -ENOBUFS;
ifa = inet_alloc_ifa();
if (!ifa)
break;
INIT_HLIST_NODE(&ifa->hash);
if (colon)
memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
} else {
ret = 0;
if (ifa->ifa_local == sin->sin_addr.s_addr) /* address unchanged: nothing to do */
break;
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_broadcast = 0;
ifa->ifa_scope = 0;
}

ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; /* store the new local address */

if (!(dev->flags & IFF_POINTOPOINT)) { /* device is not point-to-point */
ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
if ((dev->flags & IFF_BROADCAST) &&
ifa->ifa_prefixlen < 31)
ifa->ifa_broadcast = ifa->ifa_address |
~ifa->ifa_mask;
} else {
ifa->ifa_prefixlen = 32;
ifa->ifa_mask = inet_make_mask(32);
}
set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); /* both lifetimes set to INFINITY_LIFE_TIME */
ret = inet_set_ifa(dev, ifa); /* key step: attach the address to the device */
break;

case SIOCSIFBRDADDR: /* Set the broadcast address */
ret = 0;
if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_broadcast = sin->sin_addr.s_addr;
inet_insert_ifa(ifa);
}
break;

case SIOCSIFDSTADDR: /* Set the destination address */
ret = 0;
if (ifa->ifa_address == sin->sin_addr.s_addr)
break;
ret = -EINVAL;
if (inet_abc_len(sin->sin_addr.s_addr) < 0)
break;
ret = 0;
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_address = sin->sin_addr.s_addr;
inet_insert_ifa(ifa); /* re-insert the entry with the new destination address */
break;

case SIOCSIFNETMASK: /* Set the netmask for the interface */

/*
* The mask we set must be legal.
*/
ret = -EINVAL;
if (bad_mask(sin->sin_addr.s_addr, 0))
break;
ret = 0;
if (ifa->ifa_mask != sin->sin_addr.s_addr) {
__be32 old_mask = ifa->ifa_mask;
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_mask = sin->sin_addr.s_addr;
ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);

/* See if current broadcast address matches
* with current netmask, then recalculate
* the broadcast address. Otherwise it's a
* funny address, so don't touch it since
* the user seems to know what (s)he's doing...
*/
if ((dev->flags & IFF_BROADCAST) &&
(ifa->ifa_prefixlen < 31) &&
(ifa->ifa_broadcast ==
(ifa->ifa_local|~old_mask))) {
ifa->ifa_broadcast = (ifa->ifa_local |
~sin->sin_addr.s_addr);
}
inet_insert_ifa(ifa);
}
break;
}
done:
rtnl_unlock();
out:
return ret;
rarok:
/* "Get" requests: drop the lock, then copy the filled-in ifreq back to user space. */
rtnl_unlock();
ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
goto out;
}

在 devinet_ioctl涉及到三个重要的接口函数,分别如下

a. __dev_get_by_name() 通过接口名称(如eth0)获取其设备

dev = __dev_get_by_name(net, ifr.ifr_name); //通过名称name(如eth0)获取设备dev

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
struct net_device *dev;
struct hlist_head *head = dev_name_hash(net, name); //通过设备名称计算hash值,从而获取链表头指针

hlist_for_each_entry(dev, head, name_hlist)
if (!strncmp(dev->name, name, IFNAMSIZ)) //名称是否相同,如eth0
return dev;

return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/*
 * Map a device name to its bucket in net->dev_name_head: hash at most
 * IFNAMSIZ characters of the name and reduce the result to
 * NETDEV_HASHBITS bits.
 */
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash;

	hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return net->dev_name_head + hash_32(hash, NETDEV_HASHBITS);
}

b. __in_dev_get_rtnl() 通过设备获取struct in_device结构体

in_dev = __in_dev_get_rtnl(dev); //通过struct net_device获取其成员struct in_device结构体

/*
 * Return the struct in_device stored in dev->ip_ptr, read through
 * rtnl_dereference().
 */
static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev)
{
	struct in_device *idev = rtnl_dereference(dev->ip_ptr);

	return idev;
}

c. 通过命令SIOCSIFADDR,知将调用inet_set_ifa()接口函数

static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
{
struct in_device *in_dev = __in_dev_get_rtnl(dev);

ASSERT_RTNL();

if (!in_dev) {
inet_free_ifa(ifa); //释放其接口
return -ENOBUFS;
}
ipv4_devconf_setall(in_dev);
if (ifa->ifa_dev != in_dev) { //接口设备不一致就强制转换
WARN_ON(ifa->ifa_dev);
in_dev_hold(in_dev);
ifa->ifa_dev = in_dev;
}
if (ipv4_is_loopback(ifa->ifa_local))
ifa->ifa_scope = RT_SCOPE_HOST;
return inet_insert_ifa(ifa); //插入接口
}
/*
 * Insert an address entry with no originating netlink request: forwards to
 * __inet_insert_ifa() with a NULL header and portid 0.
 */
static int inet_insert_ifa(struct in_ifaddr *ifa)
{
	int rc = __inet_insert_ifa(ifa, NULL, 0);

	return rc;
}
/*
 * __inet_insert_ifa() - link an address entry into its device's address list.
 *
 * @ifa:    entry to insert; ifa->ifa_dev must already be set
 * @nlh:    netlink request header passed on to rtmsg_ifa(), may be NULL
 * @portid: netlink port id passed on to rtmsg_ifa()
 *
 * An entry with a zero local address is freed and 0 is returned.  An entry
 * that duplicates an existing local address is freed with -EEXIST; one that
 * shares a subnet with an existing entry but has a different scope is freed
 * with -EINVAL.  An entry sharing a subnet with an existing one is marked
 * IFA_F_SECONDARY.  After insertion the entry is hashed, the lifetime work
 * is rescheduled, an RTM_NEWADDR message is sent and the inetaddr_chain
 * notifiers are called with NETDEV_UP.
 */
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid)
{
struct in_device *in_dev = ifa->ifa_dev;
struct in_ifaddr *ifa1, **ifap, **last_primary;

ASSERT_RTNL();

if (!ifa->ifa_local) {
inet_free_ifa(ifa);
return 0;
}

ifa->ifa_flags &= ~IFA_F_SECONDARY;
last_primary = &in_dev->ifa_list;

/* Scan the existing addresses: track where primaries end and detect duplicates. */
for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
ifap = &ifa1->ifa_next) {
if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
ifa->ifa_scope <= ifa1->ifa_scope)
last_primary = &ifa1->ifa_next;
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa)) {
if (ifa1->ifa_local == ifa->ifa_local) {
inet_free_ifa(ifa);
return -EEXIST;
}
if (ifa1->ifa_scope != ifa->ifa_scope) {
inet_free_ifa(ifa);
return -EINVAL;
}
ifa->ifa_flags |= IFA_F_SECONDARY;
}
}

/* A primary goes at last_primary; a secondary goes at the list tail (ifap after the loop). */
if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
net_srandom(ifa->ifa_local);
ifap = last_primary;
}

ifa->ifa_next = *ifap;
*ifap = ifa;

inet_hash_insert(dev_net(in_dev->dev), ifa); /* per the original note: adds ifa to the inet_addr_lst[] hash lists */

cancel_delayed_work(&check_lifetime_work);
schedule_delayed_work(&check_lifetime_work, 0);

/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
listeners of netlink will know about new ifaddr */
rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); /* note: RTM_NEWADDR is the address message; RTM_NEWROUTE is the route message */
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); /* run the inetaddr_chain notifiers with NETDEV_UP */

return 0;
}

在该函数内部主要完成几个重要的功能:

a. inet_hash_insert(dev_net(in_dev->dev), ifa); //将接口ifa加入到链表inet_addr_lst[]中;

b. rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); //注意是RTM_NEWADDR,表示的是地址,而RTM_NEWROUTE表示的是路由;

关于RTM_NEWADDR命令的注册,详见第4点

c. blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); //唤醒通知链inetaddr_chain,命令为NETDEV_UP

关于该通知链的注册部分详见第5点

4. RTM_NEWADDR 消息注册

路径:ip_rt_init()->devinet_init()

/*
 * devinet_init() - boot-time setup of the IPv4 device-address layer.
 *
 * Initialises the inet_addr_lst hash heads, registers the per-namespace
 * operations, the gifconf callback, the netdevice notifier and the
 * rtnetlink address-family ops, starts the address-lifetime work, and
 * registers the rtnetlink handlers for the IPv4 address messages.
 */
void __init devinet_init(void)
{
int i;

for (i = 0; i < IN4_ADDR_HSIZE; i++)
INIT_HLIST_HEAD(&inet_addr_lst[i]); /* initialise every bucket of the inet_addr_lst hash array */

/* Register devinet_ops as a per-network-namespace subsystem. */
register_pernet_subsys(&devinet_ops);

/* Register inet_gifconf as the PF_INET gifconf callback (per the original note: stored in gifconf_list — confirm). */
register_gifconf(PF_INET, inet_gifconf);

/* Register ip_netdev_notifier as a network-device notifier (per the original note: on netdev_chain). */
register_netdevice_notifier(&ip_netdev_notifier);

/* Kick off the delayed address-lifetime work immediately (delay 0). */
schedule_delayed_work(&check_lifetime_work, 0);

/* Register inet_af_ops with rtnetlink (per the original note: on the rtnl_af_ops list). */
rtnl_af_register(&inet_af_ops);

/*
* Register the rtnetlink handlers for the PF_INET address messages:
* RTM_NEWADDR and RTM_DELADDR get "do" handlers, RTM_GETADDR gets a
* dump handler, RTM_GETNETCONF gets both.
*/
rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
inet_netconf_dump_devconf, NULL);
}

在该函数的最后部分分别注册了RTM_NEWADDR、RTM_DELADDR、RTM_GETADDR、RTM_GETNETCONF的消息处理回调函数,这里只分析RTM_NEWADDR对应的回调函数inet_rtm_newaddr():

rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); 

static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct in_ifaddr *ifa;
struct in_ifaddr *ifa_existing;
__u32 valid_lft = INFINITY_LIFE_TIME;
__u32 prefered_lft = INFINITY_LIFE_TIME;

ASSERT_RTNL();

ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft); //初始化nlh
if (IS_ERR(ifa))
return PTR_ERR(ifa);

ifa_existing = find_matching_ifa(ifa); //判定其接口是否存在
if (!ifa_existing) { //不存在
/* It would be best to check for !NLM_F_CREATE here but
* userspace alreay relies on not having to provide this.
*/
set_ifa_lifetime(ifa, valid_lft, prefered_lft); //设置ifa的时间
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); //设置接口
} else { //存在
inet_free_ifa(ifa); //到这里表示存在,释放ifa接口

if (nlh->nlmsg_flags & NLM_F_EXCL ||
!(nlh->nlmsg_flags & NLM_F_REPLACE))
return -EEXIST;
ifa = ifa_existing;
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
cancel_delayed_work(&check_lifetime_work);
schedule_delayed_work(&check_lifetime_work, 0);
rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); //发送 RTM_NEWADDR 消息
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); //唤醒通知链
}
return 0;
}

在该函数内部最后存在两种情况:第一是地址不存在时,调用__inet_insert_ifa()插入该地址;第二是地址已存在且请求带有NLM_F_REPLACE标志时,更新其生存时间,然后通过rtmsg_ifa()发送RTM_NEWADDR通知,并触发通知链inetaddr_chain(关于该通知链详见第5点)。需要注意:rtmsg_ifa()只是向监听该消息的netlink用户态进程发送通知,并不会再次回调inet_rtm_newaddr(),因此这里不存在回调函数无限嵌套的问题

 

5. inetaddr_chain通知链接收消息处理

在第3.c中最后将通过 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa)发送消息,而该消息接收部分的处理如下

/*
 * ip_fib_init() - boot-time setup of the IPv4 FIB front end: registers the
 * rtnetlink route handlers, the per-namespace FIB operations, the two FIB
 * notifier blocks, and initialises the trie allocator.
 */
void __init ip_fib_init(void)
{
/* Register the rtnetlink handlers for route messages (new, delete, get/dump). */
rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);

register_pernet_subsys(&fib_net_ops); /* per-namespace init; per the original note: RT_TABLE_LOCAL / RT_TABLE_MAIN tables and proc entries */
register_netdevice_notifier(&fib_netdev_notifier); /* FIB handler for network-device events */
register_inetaddr_notifier(&fib_inetaddr_notifier); /* FIB handler for IPv4 address events */

fib_trie_init(); /* allocator setup for the FIB trie */
}

在ip_fib_init()函数内部主要完成的功能包括:

a. RTM_NEWROUTE、RTM_DELROUTE、RTM_GETROUTE路由消息的注册

  //注册路由接口处理(新建、删除、获取路由)
rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);

在该内部会用到inet_rtm_newroute接口函数,具体下面会分析到

b. RT_TABLE_LOCAL、RT_TABLE_MAIN路由表的初始化、proc文件系统的初始化

register_pernet_subsys(&fib_net_ops);

/* Per-network-namespace hooks of the FIB: fib_net_init on setup, fib_net_exit on teardown. */
static struct pernet_operations fib_net_ops = {
.init = fib_net_init,
.exit = fib_net_exit,
};

/*
 * fib_net_init() - per-namespace FIB setup.
 *
 * Runs three initialisers in order (FIB tables, netlink FIB lookup, proc
 * entries).  If one fails, the ones that already succeeded are undone in
 * reverse order and the negative error is returned; otherwise the result
 * of the last initialiser is returned.
 */
static int __net_init fib_net_init(struct net *net)
{
	int rc;

#ifdef CONFIG_IP_ROUTE_CLASSID
	net->ipv4.fib_num_tclassid_users = 0;
#endif

	/* step 1: FIB tables */
	rc = ip_fib_net_init(net);
	if (rc < 0)
		return rc;

	/* step 2: netlink FIB lookup */
	rc = nl_fib_lookup_init(net);
	if (rc < 0) {
		ip_fib_net_exit(net);
		return rc;
	}

	/* step 3: proc entries */
	rc = fib_proc_init(net);
	if (rc < 0) {
		nl_fib_lookup_exit(net);
		ip_fib_net_exit(net);
		return rc;
	}

	return rc;
}

c. fib网络设备的通知链netdev_chain注册

register_netdevice_notifier(&fib_netdev_notifier); //fib网络设备的通知链netdev_chain注册

/*
 * Abridged excerpt of register_netdevice_notifier(): the step shown adds
 * the notifier block nb to the netdev_chain raw notifier chain.  The
 * elided parts are not reproduced here.
 */
int register_netdevice_notifier(struct notifier_block *nb)
{
//...
err = raw_notifier_chain_register(&netdev_chain, nb);
//...
}
EXPORT_SYMBOL(register_netdevice_notifier);

 

d. fib inet地址通知链inetaddr_chain注册

register_inetaddr_notifier(&fib_inetaddr_notifier); //fib inet地址通知链inetaddr_chain注册

/*
 * Add the notifier block nb to the inetaddr_chain blocking notifier chain
 * and return the registration result.
 */
int register_inetaddr_notifier(struct notifier_block *nb)
{
	int rc = blocking_notifier_chain_register(&inetaddr_chain, nb);

	return rc;
}

e. fib_trie_init()

fib_trie_init(); //fn_alias_kmem、trie_leaf_kmem TR-C算法内存初始化

/*
 * fib_trie_init() - create the two slab caches used by the FIB trie:
 * "ip_fib_alias" for struct fib_alias objects and "ip_fib_trie", sized for
 * the larger of struct leaf and struct leaf_info.  Both use SLAB_PANIC.
 */
void __init fib_trie_init(void)
{
	size_t leaf_obj_size = max(sizeof(struct leaf),
				   sizeof(struct leaf_info));

	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie", leaf_obj_size,
					   0, SLAB_PANIC, NULL);
}

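到这里已经分析了ip_fib_init()函数内部的各个模块,现在回头再来看下第5点需要分析的问题,即blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa)的处理模块。需要注意:inetaddr_chain上注册的是fib_inetaddr_notifier(回调函数为fib_inetaddr_event),而下面列出的fib_netdev_notifier(回调函数为fib_netdev_event)注册在netdev_chain上;两者在处理NETDEV_UP事件时都会调用fib_add_ifaddr(),下面以fib_netdev_event为例进行说明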

register_netdevice_notifier(&fib_netdev_notifier); //fib网络设备的通知链netdev_chain注册

/* Notifier block whose callback is fib_netdev_event; registered via register_netdevice_notifier(). */
static struct notifier_block fib_netdev_notifier = {
.notifier_call = fib_netdev_event,
};

static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = ptr;
struct in_device *in_dev;
struct net *net = dev_net(dev);

if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2);
rt_flush_dev(dev);
return NOTIFY_DONE;
}

in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return NOTIFY_DONE;

switch (event) {
case NETDEV_UP: //将调用该事件
for_ifa(in_dev) {
fib_add_ifaddr(ifa);
} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
rt_cache_flush(net);
break;
case NETDEV_DOWN:
fib_disable_ip(dev, 0);
break;
case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
rt_cache_flush(net);
break;
}
return NOTIFY_DONE;
}

本次的事件为NETDEV_UP,所以会调用上面的函数fib_add_ifaddr()

/*
 * fib_add_ifaddr() - install the FIB routes implied by one interface address.
 *
 * Always adds a /32 RTN_LOCAL route for the local address.  When the
 * device is up it additionally adds: the explicit broadcast address (if
 * set and not 255.255.255.255), the connected-network route for a primary
 * address, and the two network-specific broadcast routes when the prefix
 * is shorter than /31.  For a secondary address the primary address of
 * the same prefix is used as the route's reference entry.
 */
void fib_add_ifaddr(struct in_ifaddr *ifa)
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *prim = ifa;
	__be32 mask = ifa->ifa_mask;
	__be32 addr = ifa->ifa_local;
	__be32 prefix = ifa->ifa_address & mask;
	int net_type;

	if (ifa->ifa_flags & IFA_F_SECONDARY) {
		prim = inet_ifa_byprefix(in_dev, prefix, mask);
		if (!prim) {
			pr_warn("%s: bug: prim == NULL\n", __func__);
			return;
		}
	}

	/* host route for the local address itself */
	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);

	if (!(dev->flags & IFF_UP))
		return;

	/* Add broadcast address, if it is explicitly assigned. */
	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);

	/* Network routes are only added for a primary address on a non-zero prefix... */
	if (ipv4_is_zeronet(prefix) || (ifa->ifa_flags & IFA_F_SECONDARY))
		return;
	/* ...and not when the "network" is just the /32 address itself. */
	if (prefix == addr && ifa->ifa_prefixlen >= 32)
		return;

	if (dev->flags & IFF_LOOPBACK)
		net_type = RTN_LOCAL;
	else
		net_type = RTN_UNICAST;
	fib_magic(RTM_NEWROUTE, net_type, prefix, ifa->ifa_prefixlen, prim);

	/* Add network specific broadcasts, when it takes a sense */
	if (ifa->ifa_prefixlen < 31) {
		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
			  32, prim);
	}
}

在该函数内部将调用 如下函数

fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); //将调用RTM_NEWROUTE-RTN_LOCAL

/*
 * fib_magic() - add or delete one kernel-generated route for an address.
 *
 * @cmd:     RTM_NEWROUTE inserts the route; any other value deletes it
 * @type:    route type; RTN_UNICAST goes to RT_TABLE_MAIN, everything else
 *           to RT_TABLE_LOCAL
 * @dst:     destination prefix
 * @dst_len: prefix length
 * @ifa:     address entry supplying the preferred source and output device
 *
 * RTN_LOCAL routes get host scope, all others link scope.  Nothing is done
 * when the table cannot be obtained.
 */
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
	struct net *net = dev_net(ifa->ifa_dev->dev);
	struct fib_table *tb;
	struct fib_config cfg = {
		.fc_protocol = RTPROT_KERNEL,
		.fc_type = type,
		.fc_dst = dst,
		.fc_dst_len = dst_len,
		.fc_prefsrc = ifa->ifa_local,
		.fc_oif = ifa->ifa_dev->dev->ifindex,
		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
		.fc_nlinfo = {
			.nl_net = net,
		},
	};

	/* pick (creating it if needed) the table the route belongs to */
	tb = fib_new_table(net, type == RTN_UNICAST ? RT_TABLE_MAIN
						    : RT_TABLE_LOCAL);
	if (!tb)
		return;

	cfg.fc_table = tb->tb_id;
	cfg.fc_scope = (type == RTN_LOCAL) ? RT_SCOPE_HOST : RT_SCOPE_LINK;

	if (cmd != RTM_NEWROUTE) {
		fib_table_delete(tb, &cfg);
		return;
	}
	fib_table_insert(tb, &cfg);
}

该函数先调用fib_new_table()查找fib_table表,然后根据命令类型是添加或删除路由,我们这里是添加路由,所以会调用fib_table_insert()接口函数(详见第6点),先来分析下fib_new_table()

tb = fib_new_table(net, cfg.fc_table); //查找(不存在则新建)一个 fib_table 表

/*
 * fib_new_table() - return the FIB table with the given id, creating it
 * when it does not exist yet.
 *
 * An id of 0 means RT_TABLE_MAIN.  A newly allocated table is recorded in
 * the matching net->ipv4 shortcut pointer (local / main / default) and
 * linked into net->ipv4.fib_table_hash.  Returns NULL when allocation
 * fails.
 */
struct fib_table *fib_new_table(struct net *net, u32 id)
{
	struct fib_table *tb;
	unsigned int slot;

	if (id == 0)
		id = RT_TABLE_MAIN;

	/* already present in the hash? then reuse it */
	tb = fib_get_table(net, id);
	if (tb)
		return tb;

	/* otherwise allocate a new table */
	tb = fib_trie_table(id);
	if (!tb)
		return NULL;

	if (id == RT_TABLE_LOCAL)
		net->ipv4.fib_local = tb;
	else if (id == RT_TABLE_MAIN)
		net->ipv4.fib_main = tb;
	else if (id == RT_TABLE_DEFAULT)
		net->ipv4.fib_default = tb;

	/* publish the table in its hash bucket */
	slot = id & (FIB_TABLE_HASHSZ - 1);
	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[slot]);
	return tb;
}

/*
 * fib_get_table() - look up an existing FIB table by id.
 *
 * An id of 0 means RT_TABLE_MAIN.  The bucket selected by
 * id & (FIB_TABLE_HASHSZ - 1) in net->ipv4.fib_table_hash is walked under
 * rcu_read_lock(); the buckets are filled by fib_new_table().  Returns the
 * table whose tb_id equals id, or NULL when there is none.
 */
struct fib_table *fib_get_table(struct net *net, u32 id)
{
	struct fib_table *tb;
	struct fib_table *found = NULL;
	struct hlist_head *bucket;
	unsigned int slot;

	if (id == 0)
		id = RT_TABLE_MAIN;
	slot = id & (FIB_TABLE_HASHSZ - 1);

	rcu_read_lock();
	bucket = &net->ipv4.fib_table_hash[slot];
	hlist_for_each_entry_rcu(tb, bucket, tb_hlist) {
		if (tb->tb_id == id) {
			found = tb;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}

/*
 * fib_trie_table() - allocate a FIB table with an empty trie.
 *
 * One allocation holds the struct fib_table followed by a struct trie
 * (reached through tb->tb_data), which is zeroed.  The table id is set to
 * @id, tb_default to -1 and tb_num_default to 0.  Returns NULL when the
 * allocation fails.
 */
struct fib_table *fib_trie_table(u32 id)
{
	struct fib_table *tb;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (!tb)
		return NULL;

	tb->tb_id = id;
	tb->tb_default = -1;
	tb->tb_num_default = 0;

	/* the trie lives in the trailing data area; start it out empty */
	memset(tb->tb_data, 0, sizeof(struct trie));

	return tb;
}

 

6. fib_table_insert() fib_table路由表添加

/*
 * fib_table_insert() - add (or replace) a route in a FIB trie table.
 *
 * @tb:  table to modify; its trie lives in tb->tb_data
 * @cfg: route description (destination, prefix length, tos, type, netlink
 *       flags, ...)
 *
 * Validates the prefix, builds a fib_info for the route, then looks for an
 * existing alias with the same prefix/tos/priority.  Depending on the
 * netlink flags the route is rejected (NLM_F_EXCL, or an exact duplicate),
 * replaces the existing alias (NLM_F_REPLACE), or is added as a new alias
 * (requires NLM_F_CREATE).
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EEXIST, -ENOENT,
 * -ENOBUFS, -ENOMEM, or the error from fib_create_info()).  On the error
 * paths after fib_create_info() succeeded, the fib_info is released.
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *new_fa;
struct list_head *fa_head = NULL;
struct fib_info *fi;
int plen = cfg->fc_dst_len;
u8 tos = cfg->fc_tos;
u32 key, mask;
int err;
struct leaf *l;

if (plen > 32)
return -EINVAL;

key = ntohl(cfg->fc_dst);

pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

mask = ntohl(inet_make_mask(plen));

/* The destination must not have bits set beyond the prefix length. */
if (key & ~mask)
return -EINVAL;

key = key & mask;

fi = fib_create_info(cfg); /* obtain the struct fib_info describing this route */
if (IS_ERR(fi)) {
err = PTR_ERR(fi);
goto err;
}

l = fib_find_node(t, key); /* look up the leaf for this key */
fa = NULL;

if (l) { /* a leaf for this key already exists */
fa_head = get_fa_head(l, plen); /* alias list head for this prefix length on the leaf */
fa = fib_find_alias(fa_head, tos, fi->fib_priority); /* search that list for an alias with this tos/priority */
}

/* Now fa, if non-NULL, points to the first fib alias
* with the same keys [prefix,tos,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
* insert to the head of f.
*
* If f is NULL, no fib node matched the destination key
* and we need to allocate a new one of those as well.
*/

if (fa && fa->fa_tos == tos &&
fa->fa_info->fib_priority == fi->fib_priority) { /* an alias with the same tos and priority exists */
struct fib_alias *fa_first, *fa_match;

err = -EEXIST;
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;

/* We have 2 goals:
* 1. Find exact match for type, scope, fib_info to avoid
* duplicate routes
* 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
*/
fa_match = NULL;
fa_first = fa;
fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
list_for_each_entry_continue(fa, fa_head, fa_list) {
if (fa->fa_tos != tos)
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
if (fa->fa_type == cfg->fc_type &&
fa->fa_info == fi) {
fa_match = fa;
break;
}
}

if (cfg->fc_nlflags & NLM_F_REPLACE) { /* replace the existing alias */
struct fib_info *fi_drop;
u8 state;

fa = fa_first;
if (fa_match) {
if (fa == fa_match)
err = 0;
goto out; /* an exact match exists: nothing to replace, leave via the common exit */
}
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (new_fa == NULL)
goto out;

fi_drop = fa->fa_info;
new_fa->fa_tos = fa->fa_tos;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
new_fa->fa_state = state & ~FA_S_ACCESSED;

list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);

fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

goto succeeded;
}
/* Error if we find a perfect match which
* uses the same scope, type, and nexthop
* information.
*/
if (fa_match) /* exact duplicate: bail out with err still -EEXIST */
goto out;

if (!(cfg->fc_nlflags & NLM_F_APPEND))
fa = fa_first;
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
goto out;

err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); /* no reusable alias was found above: allocate a new one */
if (new_fa == NULL)
goto out;

/* Initialise the new fib_alias. */
new_fa->fa_info = fi; /* attach the fib_info obtained above */
/* Fill in the remaining alias fields. */
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
/*
* Insert new entry to the list.
*/

if (!fa_head) { /* no alias list exists yet for this key/prefix length */
fa_head = fib_insert_node(t, key, plen); /* core step: insert a node for key/plen into the trie and get its alias list head */
if (unlikely(!fa_head)) {
err = -ENOMEM;
goto out_free_new_fa;
}
}

if (!plen)
tb->tb_num_default++;

list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : fa_head));

rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
succeeded:
return 0;

out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
fib_release_info(fi);
err:
return err;
}

在其函数内部有两个非常重要函数要分析

a. fa_head = fib_insert_node(t, key, plen); //插入一个节点,内部的实现还未理顺(包括几个重要的函数:tkey_sub_equals() tkey_extract_bits()  tnode_get_child()),待分析中,核心部分!!!

/*
 * fib_insert_node - make room in trie @t for prefix @key/@plen and return
 * the list head onto which the caller can hang its entries.
 *
 * The function walks down from t->trie through every tnode whose key
 * matches @key (tkey_sub_equals), then handles three situations:
 *
 *   Case 1: the walk ends on a leaf whose key equals @key. A new leaf_info
 *           for @plen is linked into that leaf and its falh list head is
 *           returned directly; trie_rebalance() is skipped on this path.
 *   Case 2: the trie is non-empty and the walk ends on an empty child slot
 *           (n == NULL). A new leaf is stored in that slot of tp.
 *   Case 3: everything else (empty trie, or n is a leaf/tnode whose key
 *           differs). A new 1-bit tnode is created holding the new leaf
 *           and n as its two children; it is linked below tp, or installed
 *           as the trie root when there is no tp.
 *
 * Cases 2 and 3 finish by calling trie_rebalance(t, tp).
 *
 * Returns &li->falh of the newly created leaf_info, or NULL when
 * leaf_info_new(), leaf_new() or tnode_new() returns NULL (objects already
 * created in this call are released first via free_leaf()/free_leaf_info()).
 *
 * NOTE(review): the use of rtnl_dereference() suggests the caller is
 * expected to hold the RTNL lock - confirm against the callers.
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
struct rt_trie_node *n;
struct leaf *l;
int missbit;
struct list_head *fa_head = NULL;
struct leaf_info *li;
t_key cindex;

pos = 0;
n = rtnl_dereference(t->trie);

/* If we point to NULL, stop. Either the tree is empty and we should
* just put a new leaf in it, or we have reached an empty child slot,
* and we should just put our new leaf in that.
* If we point to a T_TNODE, check if it matches our key. Note that
* a T_TNODE might be skipping any number of bits - its 'pos' need
* not be the parent's 'pos'+'bits'!
*
* If it does match the current key, get pos/bits from it, extract
* the index from our key, push the T_TNODE and walk the tree.
*
* If it doesn't, we have to replace it with a new T_TNODE.
*
* If we point to a T_LEAF, it might or might not have the same key
* as we do. If it does, just change the value, update the T_LEAF's
* value, and return it.
* If it doesn't, we need to replace it with a T_TNODE.
*/

while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;

check_tnode(tn);

/* Descend only while the bits skipped by this tnode agree with key. */
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
tp = tn;
pos = tn->pos + tn->bits;
n = tnode_get_child(tn,
tkey_extract_bits(key,
tn->pos,
tn->bits));

BUG_ON(n && node_parent(n) != tn);
} else
break;
}

/*
* n ----> NULL, LEAF or TNODE
*
* tp is n's (parent) ----> NULL or TNODE
*/

BUG_ON(tp && IS_LEAF(tp));

/* Case 1: n is a leaf. Compare prefixes */

if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
l = (struct leaf *) n;
li = leaf_info_new(plen); // create a new leaf_info for plen

if (!li)
return NULL;

fa_head = &li->falh;
insert_leaf_info(&l->list, li); // link li (leaf_info) into the existing leaf's l->list
goto done;
}
l = leaf_new(); // create a new leaf

if (!l)
return NULL;

l->key = key;
li = leaf_info_new(plen); // create a new leaf_info for plen

if (!li) {
free_leaf(l);
return NULL;
}

fa_head = &li->falh;
insert_leaf_info(&l->list, li); // link li (leaf_info) into the new leaf's l->list

if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */

node_set_parent((struct rt_trie_node *)l, tp); // make tp the parent of the new leaf

cindex = tkey_extract_bits(key, tp->pos, tp->bits); // child index of key within tp
put_child(tp, cindex, (struct rt_trie_node *)l); // store the new leaf in tp's child slot cindex
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
* Add a new tnode here
* first tnode need some special handling
*/

if (tp)
pos = tp->pos+tp->bits;
else
pos = 0;

if (n) {
newpos = tkey_mismatch(key, pos, n->key);
tn = tnode_new(n->key, newpos, 1);
} else {
newpos = 0;
tn = tnode_new(key, newpos, 1); /* First tnode */
}

if (!tn) {
free_leaf_info(li);
free_leaf(l);
return NULL;
}

node_set_parent((struct rt_trie_node *)tn, tp);

/* The bit of key at newpos picks the slot for the new leaf; n takes the other one. */
missbit = tkey_extract_bits(key, newpos, 1);
put_child(tn, missbit, (struct rt_trie_node *)l);
put_child(tn, 1-missbit, n);

if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(tp, cindex, (struct rt_trie_node *)tn);
} else {
/* No parent: the new tnode becomes the trie root. */
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
}
}

if (tp && tp->pos + tp->bits > 32)
pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
tp, tp->pos, tp->bits, key, plen);

/* Rebalance the trie */

trie_rebalance(t, tp);
done:
return fa_head;
}

b. 路由事件RTM_NEWROUTE发送之后的接收处理

事件发送:

rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0)

事件接收(在第5.c中有提到):

  //注册路由接口处理(新建、删除、获取路由)
/*
 * Register the PF_INET handlers for the three route message types:
 * RTM_NEWROUTE -> inet_rtm_newroute, RTM_DELROUTE -> inet_rtm_delroute,
 * RTM_GETROUTE -> inet_dump_fib (passed in a different argument slot
 * than the other two handlers).
 * NOTE(review): the meaning of each rtnl_register() argument slot is not
 * visible here - confirm against its declaration.
 */
rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);

所以这里会调用 inet_rtm_newroute()接口函数,并将fib_config路由信息添加到路由表上

/*
 * inet_rtm_newroute - handler registered for PF_INET RTM_NEWROUTE messages.
 * @skb: buffer carrying the request; its socket selects the network namespace
 * @nlh: netlink header of the request
 *
 * Converts the request into a struct fib_config, looks up the target table
 * through fib_new_table() and hands the config to fib_table_insert().
 *
 * Returns the negative value from rtm_to_fib_config() if conversion fails,
 * -ENOBUFS if fib_new_table() yields NULL, otherwise whatever
 * fib_table_insert() returns.
 */
static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	struct fib_config cfg;
	struct fib_table *table;
	struct net *net = sock_net(skb->sk);
	int rc;

	/* Fill cfg (struct fib_config) from the netlink request. */
	rc = rtm_to_fib_config(net, skb, nlh, &cfg);
	if (rc < 0)
		return rc;

	/*
	 * Obtain the fib_table identified by cfg.fc_table.
	 * NOTE(review): presumably returns an existing table or creates one -
	 * confirm in fib_new_table().
	 */
	table = fib_new_table(net, cfg.fc_table);
	if (!table)
		return -ENOBUFS;

	/* Add the route described by cfg to that table. */
	return fib_table_insert(table, &cfg);
}

特别注意,在fib_table_insert()函数内部会继续调用如下的函数

  rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);

嵌套调用,这里与之前分析的地址嵌套调用是一致的。

7. 总结

      本博客涉及的一些比较核心的功能代码还未具体分析(如通过pos+bits计算地址索引),不过针对应用层ifconfig所触发的内核内部流程及机制已分析得很明确,这正是我需要的。框架流程弄明白了,具体细节就只是时间问题!

又到深夜,我家宝宝出生第19天,Happy & life.

 

问题处理:

这里有个疑问,相应的消息将通过RTM_NEWROUTE命令进行发送,而接收该消息的处理函数是前面分析到的 inet_rtm_newroute()函数,难道这里会执行多次嵌套调用???-------已经解决