2.3 协议栈注册

内核实现了网络层的ip协议,也实现了传输层的tcp协议和udp协议。这些协议对应的实现函数分别是​ip_rcv(),tcp_v4_rcv()和udp_rcv()​。和我们平时写代码的方式不一样的是,内核是通过注册的方式来实现的。Linux内核中的fs_initcall和subsys_initcall类似,也是初始化模块的入口。fs_initcall调用inet_init后开始网络协议栈注册。通过inet_init,将这些函数注册到了inet_protos(传输层协议)和ptype_base(网络/链路层协议)数据结构中了。如下图:

Linux数据报文接收发送总结6_传输层

相关代码如下

//file: net/ipv4/af_inet.c

static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};


/* Upon startup we insert all the elements in inetsw_array[] into
* the linked list inetsw.
*/
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},

{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},

{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
},

{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}
};


static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
};

static const struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.no_policy = 1,
.netns_ok = 1,
.icmp_strict_tag_validation = 1,
};

static const struct net_protocol udp_protocol = {
.early_demux = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
.netns_ok = 1,
};

static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = icmp_err,
.no_policy = 1,
.netns_ok = 1,
};


static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;

sock_skb_cb_check_size(sizeof(struct inet_skb_parm));

rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;

rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;

rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;

rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;

/*
* Tell SOCKET that we are alive...
*/

(void)sock_register(&inet_family_ops); // 协议族注册, socket函数的第一个参数

#ifdef CONFIG_SYSCTL
ip_static_sysctl_init();
#endif

/*
* Add all the base protocols.
*/

if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) //协议注册,ip层协议解析后,再向上解析传输层调用
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif

/* Register the socket-side information for inet_create. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);

for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) // 类型注册,对应socket中的第二个参数
inet_register_protosw(q);

/*
* Set the ARP module up
*/

arp_init();

/*
* Set the IP module up
*/

ip_init();

tcp_v4_init();

/* Setup TCP slab cache for open requests. */
tcp_init();

/* Setup UDP memory threshold */
udp_init();

/* Add UDP-Lite (RFC 3828) */
udplite4_register();

ping_init();

/*
* Set the ICMP layer up
*/

if (icmp_init() < 0)
panic("Failed to create the ICMP control socket.\n");

/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif

if (init_inet_pernet_ops())
pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
/*
* Initialise per-cpu ipv4 mibs
*/

if (init_ipv4_mibs())
pr_crit("%s: Cannot init ipv4 mibs\n", __func__);

ipv4_proc_init();

ipfrag_init();

dev_add_pack(&ip_packet_type); // 注册到ptype_base, 接收报文时,根据协议进行相应的处理函数

ip_tunnel_core_init();

rc = 0;
out:
return rc;
out_unregister_raw_proto:
proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
goto out;
}

fs_initcall(inet_init);

proto_register注册函数,将对应协议加到proto_list链表中。proto_list是一个全局的静态链表,inet域支持的所有协议全部在这个链表中,但这个链表在协议栈中并没有太大用途,它只是用于在/proc/net/protocols文件中输出当前系统所支持的所有协议。

inet_register_protosw注册函数,将对协议加到 inetsw 数组中,在socket函数系统调用时选择具体的协议时会用到,发送报文时会用到。

int proto_register(struct proto *prot, int alloc_slab)
{
if (alloc_slab) {
......
}

mutex_lock(&proto_list_mutex);
list_add(&prot->node, &proto_list);
assign_proto_idx(prot);
mutex_unlock(&proto_list_mutex);
return 0;
}
EXPORT_SYMBOL(proto_register);


void inet_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
int protocol = p->protocol;
struct list_head *last_perm;

spin_lock_bh(&inetsw_lock);

if (p->type >= SOCK_MAX)
goto out_illegal;

/* If we are trying to override a permanent protocol, bail. */
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
break;
if (protocol == answer->protocol)
goto out_permanent;
last_perm = lh;
}

/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
* non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
out:
spin_unlock_bh(&inetsw_lock);

return;

......
}

上面的代码中我们可以看到,udp_protocol结构体中的handler是udp_rcv,tcp_protocol结构体中的handler是tcp_v4_rcv,通过inet_add_protocol被初始化了进来。

int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol){
if (!prot->netns_ok) {
pr_err("Protocol %u is not namespace aware, cannot register.\n",
protocol);
return -EINVAL;
}

return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
NULL, prot) ? 0 : -1;
}

inet_add_protocol函数将tcp和udp对应的处理函数都注册到了 inet_protos (接收报文时,解析传输层应用)数组中了。

再看dev_add_pack(&ip_packet_type);这一行,ip_packet_type结构体中的type是协议名,func是ip_rcv函数,在dev_add_pack中会被注册到ptype_base(接收报文时,解析网络层使用)哈希表中。(上面在net_dev_init时,最多支持16个协议)

//file: net/core/dev.c
void dev_add_pack(struct packet_type *pt){
struct list_head *head = ptype_head(pt);
......
}
static inline struct list_head *ptype_head(const struct packet_type *pt){
if (pt->type == htons(ETH_P_ALL))
return &ptype_all;
else
return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

这里我们需要记住inet_protos记录着udp,tcp的处理函数地址,ptype_base存储着ip_rcv()函数的处理地址。后面我们会看到软中断中会通过ptype_base找到ip_rcv函数地址,进而将ip包正确地送到ip_rcv()中执行。在ip_rcv中将会通过inet_protos找到tcp或者udp的处理函数,再而把包转发给udp_rcv()或tcp_v4_rcv()函数。

扩展一下,如果看一下ip_rcv和udp_rcv等函数的代码能看到很多协议的处理过程。例如,ip_rcv中会处理netfilter和iptable过滤,如果你有很多或者很复杂的 netfilter 或 iptables 规则,这些规则都是在软中断的上下文中执行的,会加大网络延迟。再例如,udp_rcv中会判断socket接收队列是否满了。对应的相关内核参数是net.core.rmem_max和net.core.rmem_default。如果有兴趣,建议大家好好读一下inet_init这个函数的代码。