文章目录
- 从IP层接收数据包: udp_rcv()
- 查找数据包所属套接字
- 计算匹配分值: compute_score()
- 保存数据报到队列: udp_queue_rcv_skb()
- 接收数据到接收队列
- 接收数据到后备队列
- 唤醒用户态进程
UDP数据报的接收过程要分两部分来看:
- 网络层将数据报递交给UDP后,UDP的处理过程。该过程中,UDP需要接收数据包并对其进行校验,校验成功后将其放入接收队列中等待用户空间程序来读取;
- 用户空间程序调用read()等系统调用读取已经放入接收队列中的数据。
这篇笔记先来介绍第一部分。
从IP层接收数据包: udp_rcv()
udp_rcv()是UDP在AF_INET协议族初始化时注册给网络层的接收回调函数。
/*
 * IPv4 UDP receive entry point.
 *
 * Forwards the skb to __udp4_lib_rcv() together with the address of
 * udp_table and IPPROTO_UDP, and returns that function's result unchanged.
 */
int udp_rcv(struct sk_buff *skb)
{
return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}
@skb: 输入数据包
@udptable:已绑定端口的UDP传输控制块哈希表,将从该哈希表查找该skb属于哪个套接字
@proto:L4协议号,到这里可能是IPPROTO_UDP或者IPPROTO_UDPLITE
/*
 * Core IPv4 receive routine used for both UDP and UDP-Lite.
 *
 * @skb:      incoming packet
 * @udptable: table of bound UDP sockets that is searched for the receiver
 * @proto:    L4 protocol number (IPPROTO_UDP or IPPROTO_UDPLITE)
 *
 * Returns 0 after the skb has been queued or freed, the result of
 * __udp4_lib_mcast_deliver() for broadcast/multicast routes, or the negated
 * positive value returned by udp_queue_rcv_skb() when it asks for the input
 * to be resubmitted.
 */
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto)
{
	struct sock *sk;
	struct udphdr *uh;
	unsigned short ulen;
	struct rtable *rt = skb_rtable(skb);
	__be32 saddr, daddr;
	struct net *net = dev_net(skb->dev);

	/* Make sure the skb data area holds at least a full UDP header. */
	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto drop;		/* No space for header. */

	uh = udp_hdr(skb);
	ulen = ntohs(uh->len);

	/*
	 * Load the addresses before any length or checksum test: the
	 * short_packet and csum_error paths below print them, and they would
	 * otherwise be read uninitialized when reached from the early checks.
	 */
	saddr = ip_hdr(skb)->saddr;
	daddr = ip_hdr(skb)->daddr;

	/* The length claimed by the UDP header must not exceed the skb data. */
	if (ulen > skb->len)
		goto short_packet;

	if (proto == IPPROTO_UDP) {
		/*
		 * 1. The datagram must be at least as long as the UDP header.
		 * 2. pskb_trim_rcsum() cuts the skb down to ulen; a non-zero
		 *    result is handled like a short packet.
		 */
		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
			goto short_packet;
		/* Re-read the header pointer after the trim. */
		uh = udp_hdr(skb);
	}

	/* Checksum setup; a non-zero result is treated as a checksum error. */
	if (udp4_csum_init(skb, uh, proto))
		goto csum_error;

	/* Broadcast and multicast routes take a separate delivery path. */
	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
		return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable);

	/* Look up the receiving socket from the source and destination ports. */
	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
	if (sk != NULL) {
		/* A socket was found: hand the packet to udp_queue_rcv_skb(). */
		int ret = udp_queue_rcv_skb(sk, skb);

		sock_put(sk);

		/* a return value > 0 means to resubmit the input, but
		 * it wants the return to be -protocol, or 0
		 */
		if (ret > 0)
			return -ret;
		return 0;
	}

	/* No socket wants this packet: account for it and drop it. */

	/* IPsec (xfrm) inbound policy check. */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);

	/* No socket. Drop packet silently, if checksum is wrong */
	if (udp_lib_checksum_complete(skb))
		goto csum_error;

	/* Bump the "no port" counter and answer with ICMP port unreachable. */
	UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	/*
	 * Hmm.  We got an UDP packet to a port to which we
	 * don't wanna listen.  Ignore it.
	 */
	kfree_skb(skb);
	return 0;

short_packet:
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr,
		       ntohs(uh->source),
		       ulen,
		       skb->len,
		       &daddr,
		       ntohs(uh->dest));
	goto drop;

csum_error:
	/*
	 * RFC1122: OK.  Discards the bad packet silently (as far as
	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
	 */
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr,
		       ntohs(uh->source),
		       &daddr,
		       ntohs(uh->dest),
		       ulen);
drop:
	UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
	kfree_skb(skb);
	return 0;
}
查找数据包所属套接字
如上,非常关键的一步就是根据数据报中源端口和目的端口查找应该由谁来处理该数据包。
/*
 * Resolve the socket that should receive @skb.
 *
 * If skb_steal_sock() hands back a socket (the article notes that an earlier
 * layer may already have attached one to the skb), that socket is returned
 * directly. Otherwise a full __udp4_lib_lookup() is performed using the IP
 * header addresses, the given ports and the input interface index.
 */
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
		__be16 sport, __be16 dport, struct udp_table *udptable)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *cached = skb_steal_sock(skb);

	/* Fast path: a socket was already associated with this skb. */
	if (unlikely(cached))
		return cached;

	/* Nothing cached, so search the table. */
	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
				 iph->daddr, dport, inet_iif(skb), udptable);
}
@dif: 该数据包的输入网络设备索引
/*
 * Find the UDP socket that best matches an incoming IPv4 datagram.
 *
 * The hash slot is selected from the destination port converted to host
 * order. Every socket on that slot's chain is rated with compute_score() and
 * the first socket reaching the highest score is kept. The walk runs under
 * rcu_read_lock() and is restarted when the nulls value found at the end of
 * the chain differs from the hash, or when the chosen socket scores lower
 * once a reference has been taken on it.
 *
 * Returns the chosen socket with sk_refcnt incremented, or NULL when nothing
 * matched or the reference could not be taken.
 */
static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport,
int dif, struct udp_table *udptable)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
// The destination port is the hash key: the packet's destination port is the
// local port from this host's point of view (the article notes this mirrors bind()).
unsigned short hnum = ntohs(dport);
unsigned int hash = udp_hashfn(net, hnum);
struct udp_hslot *hslot = &udptable->hash[hash];
int score, badness;
rcu_read_lock();
begin:
// Walk the collision chain and remember the best-scoring socket in result.
result = NULL;
badness = -1;
sk_nulls_for_each_rcu(sk, node, &hslot->head) {
score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif);
// Strictly greater: on a tie the socket seen first is kept.
if (score > badness) {
result = sk;
badness = score;
}
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != hash)
goto begin;
if (result) {
// Take a reference only if the count has not already dropped to zero.
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
// Re-score now that the reference is held; if the socket matches worse
// than when it was chosen, release it and start over.
else if (unlikely(compute_score(result, net, saddr, hnum, sport,
daddr, dport, dif) < badness)) {
sock_put(result);
goto begin;
}
}
rcu_read_unlock();
return result;
}
计算匹配分值: compute_score()
该函数计算指定socket和输入参数之间的匹配分值,分值越高,表示越匹配。-1表示不匹配。
/*
 * Rate how well @sk matches an incoming datagram.
 *
 * Returns -1 when the socket cannot receive the datagram. Otherwise the
 * score starts at 1 for a PF_INET socket (0 for any other family) and gains
 * 2 for each of the following that is set on the socket and agrees with the
 * packet: local address, connected peer address, connected peer port, bound
 * device. A field that is set but disagrees rejects the socket.
 */
static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
		unsigned short hnum, __be16 sport, __be32 daddr, __be16 dport, int dif)
{
	struct inet_sock *inet;
	int score;

	/* Wrong namespace, wrong hash value or IPv6-only: not a candidate. */
	if (!net_eq(sock_net(sk), net) || sk->sk_hash != hnum || ipv6_only_sock(sk))
		return -1;

	inet = inet_sk(sk);
	score = (sk->sk_family == PF_INET) ? 1 : 0;

	/* A local address set on the socket must equal the packet destination. */
	if (inet->rcv_saddr) {
		if (inet->rcv_saddr != daddr)
			return -1;
		score += 2;
	}

	/* A peer address set on the socket must equal the packet source. */
	if (inet->daddr) {
		if (inet->daddr != saddr)
			return -1;
		score += 2;
	}

	/* A peer port set on the socket must equal the packet source port. */
	if (inet->dport) {
		if (inet->dport != sport)
			return -1;
		score += 2;
	}

	/* A bound device must be the interface the packet arrived on. */
	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;
		score += 2;
	}

	return score;
}
这里之所以没有单独比较dport,是因为hnum就是ntohs(dport),sk->sk_hash == hnum已经可以说明本地端口(即报文的目的端口)是匹配的了;而sport则是与inet->dport(套接字connect()指定的对端端口)进行比较的。
保存数据报到队列: udp_queue_rcv_skb()
找到数据所属传输控制块后,会调用该函数将数据报放入接收队列等待用户态程序读取。
/*
 * Deliver a received datagram to the socket chosen for it.
 *
 * The skb passes the inbound xfrm policy check, the optional encapsulation
 * hook and the UDP-Lite coverage checks before it is either queued through
 * __udp_queue_rcv_skb() or, when the socket is owned by a user context,
 * appended to the socket backlog.
 *
 * returns:
 * -1: error
 * 0: success
 * >0: "udp encap" protocol resubmission
 *
 * Note that in the success and error cases, the skb is assumed to
 * have either been requeued or freed.
 */
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int rc;
int is_udplite = IS_UDPLITE(sk);
// IPsec (xfrm) inbound policy check; the packet is dropped if it fails.
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto drop;
nf_reset(skb);
// Encapsulation sockets get first look at the packet (the article
// associates this path with IPsec).
if (up->encap_type) {
/*
* This is an encapsulation socket so pass the skb to
* the socket's udp_encap_rcv() hook. Otherwise, just
* fall through and pass this up the UDP socket.
* up->encap_rcv() returns the following value:
* =0 if skb was successfully passed to the encap
* handler or was discarded by it.
* >0 if skb should be passed on to UDP.
* <0 if skb should be resubmitted as proto -N
*/
/* if we're overly short, let UDP handle it */
if (skb->len > sizeof(struct udphdr) &&
up->encap_rcv != NULL) {
int ret;
ret = (*up->encap_rcv)(sk, skb);
if (ret <= 0) {
UDP_INC_STATS_BH(sock_net(sk),
UDP_MIB_INDATAGRAMS,
is_udplite);
return -ret;
}
}
/* FALLTHROUGH -- it's a UDP Packet */
}
// UDP-Lite: enforce the receiver's checksum coverage requirements.
if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
/*
* MIB statistics other than incrementing the error count are
* disabled for the following two types of errors: these depend
* on the application settings, not on the functioning of the
* protocol stack as such.
*
* RFC 3828 here recommends (sec 3.3): "There should also be a
* way ... to ... at least let the receiving application block
* delivery of packets with coverage values less than a value
* provided by the application."
*/
if (up->pcrlen == 0) { /* full coverage was set */
LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
"%d while full coverage %d requested\n",
UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
}
/* The next case involves violating the min. coverage requested
* by the receiver. This is subtle: if receiver wants x and x is
* greater than the buffersize/MTU then receiver will complain
* that it wants x while sender emits packets of smaller size y.
* Therefore the above ...()->partial_cov statement is essential.
*/
if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
LIMIT_NETDEBUG(KERN_WARNING
"UDPLITE: coverage %d too small, need min %d\n",
UDP_SKB_CB(skb)->cscov, up->pcrlen);
goto drop;
}
}
// When a socket filter is attached, complete checksum verification first so
// the filter is only ever given packets that passed it.
if (sk->sk_filter) {
if (udp_lib_checksum_complete(skb))
goto drop;
}
rc = 0;
// Take the socket lock used on the bottom-half side.
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
// The socket is not owned by a user context: queue the skb right away.
rc = __udp_queue_rcv_skb(sk, skb);
else
// The socket is currently owned by a user context: park the skb on the
// backlog instead (per the article, release_sock() later moves backlog
// entries to the receive queue). rc stays 0 on this path.
sk_add_backlog(sk, skb);
bh_unlock_sock(sk);
return rc;
drop:
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
return -1;
}
接收数据到接收队列
/*
 * Queue @skb on @sk through sock_queue_rcv_skb().
 *
 * Returns 0 when the skb was queued. On any failure the input-error counter
 * is incremented, the skb is freed and -1 is returned; an -ENOMEM failure
 * additionally bumps the receive-buffer-error counter and sk->sk_drops.
 */
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int is_udplite = IS_UDPLITE(sk);
	int rc = sock_queue_rcv_skb(sk, skb);

	if (rc >= 0)
		return 0;

	/* Note that an ENOMEM error is charged twice */
	if (rc == -ENOMEM) {
		/* The receive buffer was full: record it separately. */
		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite);
		atomic_inc(&sk->sk_drops);
	}

	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	kfree_skb(skb);
	return -1;
}
/*
 * Append @skb to the receive queue of @sk and signal that data is ready.
 *
 * Returns 0 on success, -ENOMEM when the receive buffer limit would be
 * reached, the non-zero result of sk_filter() when the filter rejects the
 * packet, or -ENOBUFS when sk_rmem_schedule() refuses the memory charge.
 * The skb is not freed here on failure.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int verdict;
	int queued_len;

	/* Refuse the packet if it would reach or exceed the receive buffer limit. */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned)sk->sk_rcvbuf)
		return -ENOMEM;

	/* Run the socket filter; a non-zero verdict is passed back unchanged. */
	verdict = sk_filter(sk, skb);
	if (verdict)
		return verdict;

	/* Memory accounting has to accept skb->truesize more bytes. */
	if (!sk_rmem_schedule(sk, skb->truesize))
		return -ENOBUFS;

	skb->dev = NULL;
	/* The socket takes ownership of the incoming skb. */
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	queued_len = skb->len;

	/* Append to the tail of the receive queue. */
	skb_queue_tail(&sk->sk_receive_queue, skb);

	/* Notify readers, unless the socket is flagged SOCK_DEAD. */
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, queued_len);

	return 0;
}
接收数据到后备队列
在下半部接收时,如果传输控制块已经被进程锁定,那么会先将数据放入到后备队列中,等进程释放传输控制块时再进行处理,这种设计可以使得软中断能够尽快的结束。
/*
 * Append @skb to the tail of the socket's backlog list.
 *
 * The per-socket spinlock must be held here (the article names it as
 * sk_lock.slock).
 */
static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff *last = sk->sk_backlog.tail;

	if (last)
		last->next = skb;		/* link behind the current tail */
	else
		sk->sk_backlog.head = skb;	/* list was empty */

	sk->sk_backlog.tail = skb;
	skb->next = NULL;
}
唤醒用户态进程
将数据放入接收队列后,需要唤醒那些正在等待数据可读的进程,这是通过上面的sk->sk_data_ready()回调实现的,对于UDP,该函数就是sock_def_readable()。
/*
 * Data-ready callback: wakes tasks sleeping on the socket and sends the
 * asynchronous notification. The article states this is the
 * sk->sk_data_ready() implementation used for UDP. @len is not used here.
 */
static void sock_def_readable(struct sock *sk, int len)
{
// Hold the callback lock for reading across the whole wake-up sequence.
read_lock(&sk->sk_callback_lock);
// Wake the tasks waiting on sk->sk_sleep, if there are any.
if (sk_has_sleeper(sk))
wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | POLLRDNORM | POLLRDBAND);
// The async wake-up is issued whether or not anyone was sleeping.
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
read_unlock(&sk->sk_callback_lock);
}
/*
 * Report whether any task is waiting on the socket's wait queue.
 *
 * Returns non-zero when sk->sk_sleep is set and waitqueue_active() reports
 * waiters on it, 0 otherwise.
 */
static inline int sk_has_sleeper(struct sock *sk)
{
/*
* We need to be sure we are in sync with the
* add_wait_queue modifications to the wait queue.
*
* This memory barrier is paired in the sock_poll_wait.
*/
smp_mb__after_lock();
// Blocked tasks wait on the sk->sk_sleep wait queue.
return sk->sk_sleep && waitqueue_active(sk->sk_sleep);
}