After the SYN request sent by the client reaches the server's NIC, it enters the network protocol stack of the server's operating system. Once the link layer and network layer have processed it, it arrives at the entry function of the TCP protocol. For TCPv4 the entry function is tcp_v4_rcv; for TCPv6 it is tcp_v6_rcv. Let's analyze tcp_v4_rcv:
Code analysis:
1961 int tcp_v4_rcv(struct sk_buff *skb)
1962 {
1963         const struct iphdr *iph;
1964         const struct tcphdr *th;
1965         struct sock *sk;
1966         int ret;
...
1975         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1976                 goto discard_it;
1977
1978         th = tcp_hdr(skb);
1979
1980         if (th->doff < sizeof(struct tcphdr) / 4) /* the TCP header length must not be below the minimum */
1981                 goto bad_packet;
1982         if (!pskb_may_pull(skb, th->doff * 4))
1983                 goto discard_it;
1984
1985         /* An explanation is required here, I think.
1986          * Packet length and doff are validated by header prediction,
1987          * provided case of th->doff==0 is eliminated.
1988          * So, we defer the checks. */
1989         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) /* verify the checksum */
1990                 goto csum_error;
1991
1992         th = tcp_hdr(skb);
1993         iph = ip_hdr(skb);
1994         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1995         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1996                                     skb->len - th->doff * 4);
1997         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1998         TCP_SKB_CB(skb)->when = 0;
1999         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2000         TCP_SKB_CB(skb)->sacked = 0;
2001
2002         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2003         if (!sk)
2004                 goto no_tcp_socket;
...
2023
2024         bh_lock_sock_nested(sk); /* take the spinlock to keep other softirqs away from this socket */
2025         ret = 0;
2026         if (!sock_owned_by_user(sk)) { /* no process has locked this socket; true for a SYN */
...
2035                 {
2036                         if (!tcp_prequeue(sk, skb)) /* try to queue on the prequeue; if no process is waiting on it, fall through to tcp_v4_do_rcv */
2037                                 ret = tcp_v4_do_rcv(sk, skb); /* enter the main handler */
2038                 }
...
2043                         goto discard_and_relse;
2044         }
2045         bh_unlock_sock(sk);
2046
2047         sock_put(sk);
2048
2049         return ret;
2050
1975: pskb_may_pull checks whether the skb's linear buffer holds at least len bytes that can be pulled; if not, it reallocates the skb and copies the data scattered in frags into the contiguous linear area. A SYN packet carries no data in frags anyway.
1994-2000: save some fields of the packet header into the skb's cb (control buffer) field for later use.
2002: __inet_lookup_skb looks up a connection in tcp_hashinfo using the packet's source/destination IP addresses and ports. It first searches ehash, the table of established connections; for a SYN request this lookup fails. It then searches listening_hash, where it finds the socket that the server put there during the listen() system call.
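To make the header checks and the seq/end_seq bookkeeping above concrete, here is a minimal user-space C sketch (an illustration, not kernel code; parse_tcp and tcp_view are invented names) that validates a raw TCP header the way lines 1975-1983 do and computes end_seq exactly as lines 1994-1996 do:

#include <arpa/inet.h>   /* ntohl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Minimal view of a TCP header; flag layout follows RFC 793 octet 13. */
struct tcp_view {
        uint32_t seq, ack_seq, end_seq;
        int syn, fin, doff_bytes;
};

/* Returns 0 on success, -1 if the segment is too short or malformed,
 * mirroring the discard_it/bad_packet paths in tcp_v4_rcv. */
static int parse_tcp(const uint8_t *buf, size_t len, struct tcp_view *v)
{
        if (len < 20)                        /* like pskb_may_pull(skb, sizeof(struct tcphdr)) */
                return -1;
        int doff = (buf[12] >> 4) * 4;       /* data offset, in bytes */
        if (doff < 20 || len < (size_t)doff) /* like the th->doff checks at 1980-1983 */
                return -1;

        uint32_t seq, ack;
        memcpy(&seq, buf + 4, 4);
        memcpy(&ack, buf + 8, 4);
        v->seq        = ntohl(seq);
        v->ack_seq    = ntohl(ack);
        v->syn        = (buf[13] >> 1) & 1;
        v->fin        =  buf[13]       & 1;
        v->doff_bytes = doff;
        /* SYN and FIN each occupy one sequence number, exactly like
         * TCP_SKB_CB(skb)->end_seq at lines 1995-1996. */
        v->end_seq = v->seq + v->syn + v->fin + (uint32_t)(len - doff);
        return 0;
}

int main(void)
{
        /* A bare 20-byte SYN: seq = 1000, doff = 5 words, SYN flag set. */
        uint8_t syn[20] = {0};
        uint32_t seq = htonl(1000);
        memcpy(syn + 4, &seq, 4);
        syn[12] = 5 << 4;
        syn[13] = 0x02;

        struct tcp_view v;
        if (parse_tcp(syn, sizeof(syn), &v) == 0)
                printf("seq=%u end_seq=%u\n", (unsigned)v.seq, (unsigned)v.end_seq); /* 1000, 1001 */
        return 0;
}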
Next, let's analyze the main handler, tcp_v4_do_rcv:
1800 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1801 {
1802         struct sock *rsk;
...
1835         if (sk->sk_state == TCP_LISTEN) {
1836                 struct sock *nsk = tcp_v4_hnd_req(sk, skb); /* look up a request sock */
1837                 if (!nsk)
1838                         goto discard;
1839
1840                 if (nsk != sk) {
1841                         sock_rps_save_rxhash(nsk, skb);
1842                         if (tcp_child_process(sk, nsk, skb)) {
1843                                 rsk = nsk;
1844                                 goto reset;
1845                         }
1846                         return 0;
1847                 }
1848         } else
1849                 sock_rps_save_rxhash(sk, skb);
1850
1851         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1852                 rsk = sk;
1853                 goto reset;
1854         }
1855         return 0;
...
The server-side socket must be in the TCP_LISTEN state, so the check at line 1835 succeeds. tcp_v4_hnd_req is used to look up a connection in the "half-established" state (SYN|ACK sent, waiting for the ACK):
1739 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1740 {
1741         struct tcphdr *th = tcp_hdr(skb);
1742         const struct iphdr *iph = ip_hdr(skb);
1743         struct sock *nsk;
1744         struct request_sock **prev;
1745         /* Find possible connection requests. */
1746         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1747                                                        iph->saddr, iph->daddr);
1748         if (req) /* for the first SYN, req is always NULL */
1749                 return tcp_check_req(sk, skb, req, prev, false);
1750
1751         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1752                         th->source, iph->daddr, th->dest, inet_iif(skb));
1753
1754         if (nsk) { /* for the first SYN, nsk is always NULL */
1755                 if (nsk->sk_state != TCP_TIME_WAIT) {
1756                         bh_lock_sock(nsk);
1757                         return nsk;
1758                 }
1759                 inet_twsk_put(inet_twsk(nsk));
1760                 return NULL;
1761         }
...
1767         return sk;
1768 }
It is worth noting that lines 1751-1752 perform another lookup in the established table. Didn't __inet_lookup_skb at line 2002 of tcp_v4_rcv already do this? Is the lookup in tcp_v4_hnd_req redundant? It is very necessary! Linux supports multiple processors; consider the following scenario: the server machine has two CPUs, and the client sends two SYNs in a row. The first SYN is handled by CPU0, which calls __inet_lookup_skb and can only find the socket in the listen table. Before CPU0 locks this socket, CPU1 receives the second SYN and completes the three-way handshake in a flash, creating an established connection; only then does CPU0 get around to locking the socket and calling tcp_v4_hnd_req, where inet_lookup_established finds the socket created by CPU1. Without the lookup at lines 1751-1752, the server's TCP would in this case mistakenly send another SYN|ACK and set up a duplicate connection. The probability of this scenario is very low, but it is not zero — hence the necessity of this "catch-up" lookup.
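The scenario above is the classic "lookup, lock, re-check" pattern. The following self-contained pthread sketch (an analogy with invented names, not kernel code) plays the two CPUs as two threads: both may miss in the unlocked first lookup, and only the second lookup under the lock keeps the connection from being created twice:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static int established; /* stands in for the ehash entry for this 4-tuple */

/* The first lookup runs without the lock, like __inet_lookup_skb(). */
static int lookup_established(void) { return established; }

static void *handle_syn(void *arg)
{
        const char *cpu = arg;

        if (lookup_established()) /* unlocked lookup: the result may be stale */
                return NULL;

        pthread_mutex_lock(&table_lock); /* like bh_lock_sock() */
        /* Re-check under the lock, like inet_lookup_established() in
         * tcp_v4_hnd_req(): the other CPU may have completed the handshake
         * between our first lookup and the lock acquisition. */
        if (!lookup_established()) {
                established = 1;
                printf("%s: created the connection\n", cpu);
        } else {
                printf("%s: connection already exists, not creating it again\n", cpu);
        }
        pthread_mutex_unlock(&table_lock);
        return NULL;
}

int main(void)
{
        pthread_t t0, t1;
        pthread_create(&t0, NULL, handle_syn, "CPU0");
        pthread_create(&t1, NULL, handle_syn, "CPU1");
        pthread_join(t0, NULL);
        pthread_join(t1, NULL);
        return 0;
}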
Normally, for the first SYN request tcp_v4_hnd_req returns at line 1767, i.e. it returns the pointer to the listen socket. Back in tcp_v4_do_rcv, the test at line 1840 is then false, and control goes straight to tcp_rcv_state_process at line 1851:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601                            const struct tcphdr *th, unsigned int len)
5602 {
...
5604         struct inet_connection_sock *icsk = inet_csk(sk);
...
5610         switch (sk->sk_state) {
...
5614         case TCP_LISTEN:
...
5621                 if (th->syn) {
5622                         if (th->fin)
5623                                 goto discard; /* a SYN must not carry the FIN flag */
5624                         if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
5625                                 return 1;
...
5644                         kfree_skb(skb);
5645                         return 0;
5646                 }
...
For TCPv4, icsk->icsk_af_ops->conn_request points to tcp_v4_conn_request:
Code analysis:
1465 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1466 {
1467         struct tcp_options_received tmp_opt;
1468         struct request_sock *req;
1469         struct inet_request_sock *ireq;
1470         struct tcp_sock *tp = tcp_sk(sk);
1471         struct dst_entry *dst = NULL;
1472         __be32 saddr = ip_hdr(skb)->saddr;
1473         __be32 daddr = ip_hdr(skb)->daddr;
1474         __u32 isn = TCP_SKB_CB(skb)->when;
1475         bool want_cookie = false;
1476         struct flowi4 fl4;
1477         struct tcp_fastopen_cookie foc = { .len = -1 };
1478         struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1479         struct sk_buff *skb_synack;
1480         int do_fastopen;
...
1486         /* TW buckets are converted to open requests without
1487          * limitations, they conserve resources and peer is
1488          * evidently real one.
1489          */
1490         if (inet_csk_reqsk_queue_is_full(sk) && !isn) { /* the request queue is full and the SYN did not hit a TIME_WAIT socket */
1491                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); /* decide whether to answer with a SYN cookie */
1492                 if (!want_cookie) /* no cookie should be sent: drop */
1493                         goto drop;
1494         }
1495
1496         /* Accept backlog is full. If we have already queued enough
1497          * of warm entries in syn queue, drop request. It is better than
1498          * clogging syn queue with openreqs with exponentially increasing
1499          * timeout.
1500          */
1501         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1502                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1503                 goto drop; /* the accept backlog is full and more than one young (not yet retransmitted) request is pending: drop */
1504         }
1505
1506         req = inet_reqsk_alloc(&tcp_request_sock_ops); /* allocate a request_sock */
1507         if (!req)
1508                 goto drop;
1509
...
1514         tcp_clear_options(&tmp_opt);
1515         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1516         tmp_opt.user_mss = tp->rx_opt.user_mss;
1517         tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1518
1519         if (want_cookie && !tmp_opt.saw_tstamp)
1520                 tcp_clear_options(&tmp_opt);
1521
1522         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1523         tcp_openreq_init(req, &tmp_opt, skb);
1524
1525         ireq = inet_rsk(req);
1526         ireq->loc_addr = daddr;
1527         ireq->rmt_addr = saddr;
1528         ireq->no_srccheck = inet_sk(sk)->transparent;
1529         ireq->opt = tcp_v4_save_options(skb);
...
1536
1537         if (want_cookie) {
1538                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1539                 req->cookie_ts = tmp_opt.tstamp_ok;
1540         } else if (!isn) {
1541                 /* VJ's idea. We save last timestamp seen
1542                  * from the destination in peer table, when entering
1543                  * state TIME-WAIT, and check against it before
1544                  * accepting new connection request.
1545                  *
1546                  * If "isn" is not zero, this request hit alive
1547                  * timewait bucket, so that all the necessary checks
1548                  * are made in the function processing timewait state.
1549                  */
1550                 if (tmp_opt.saw_tstamp && /* the SYN carries the timestamp option */
1551                     tcp_death_row.sysctl_tw_recycle && /* recycling of TIME_WAIT sockets is enabled */
1552                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && /* a route to the destination was found */
1553                     fl4.daddr == saddr) { /* the route's destination IP matches the SYN's source IP */
1554                         if (!tcp_peer_is_proven(req, dst, true)) { /* the time_wait bucket has expired; use the timestamp to check the new request against the old connection */
1555                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1556                                 goto drop_and_release;
1557                         }
1558                 }
1559                 /* Kill the following clause, if you dislike this way. */
1560                 else if (!sysctl_tcp_syncookies && /* SYN cookies are disabled */
1561                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1562                           (sysctl_max_syn_backlog >> 2)) && /* this one socket's pending requests already occupy more than 3/4 of the maximum SYN backlog */
1563                          !tcp_peer_is_proven(req, dst, false)) { /* the peer cannot be proven to be alive */
...
1573                         goto drop_and_release; /* very likely a SYN flood attack: drop */
1574                 }
1575
1576                 isn = tcp_v4_init_sequence(skb); /* generate a random initial sequence number */
1577         }
1578         tcp_rsk(req)->snt_isn = isn;
...
1598         skb_synack = tcp_make_synack(sk, dst, req,
1599                 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); /* build a SYN|ACK packet */
1600
1601         if (skb_synack) {
1602                 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr); /* compute the checksum */
1603                 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1604         } else
1605                 goto drop_and_free;
1606
1607         if (likely(!do_fastopen)) {
1608                 int err;
1609                 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1610                         ireq->rmt_addr, ireq->opt);
1611                 err = net_xmit_eval(err); /* drop the request unless the result is "success" or "congestion notification" */
1612                 if (err || want_cookie)
1613                         goto drop_and_free;
1614
1615                 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1616                 tcp_rsk(req)->listener = NULL;
1617                 /* Add the request_sock to the SYN table */
1618                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
...
1633         return 0;
1634 }
1490 and 1501: limit the number of pending connection requests, preventing half-open connections from consuming too much memory.
1506: allocate a request_sock structure to hold the connection information. A request_sock is used instead of a full socket because it is much smaller, which greatly reduces memory consumption.
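As a rough illustration of the two admission checks at lines 1490 and 1501, here is a hedged user-space sketch; the listener struct and its fields are invented stand-ins, not the kernel's actual data structures:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the listener state tcp_v4_conn_request consults. */
struct listener {
        int syn_queue_len;    /* half-open requests (SYN received, no ACK yet) */
        int syn_queue_max;
        int young;            /* requests whose SYN|ACK was never retransmitted */
        int accept_queue_len; /* established connections waiting for accept() */
        int accept_queue_max;
        bool syncookies;      /* whether SYN cookies may be used as a fallback */
};

/* Mirrors the two drop conditions: a full SYN queue with no cookie
 * fallback (line 1490), or a full accept queue while fresh requests
 * keep arriving (line 1501). */
static bool syn_should_drop(const struct listener *l)
{
        if (l->syn_queue_len >= l->syn_queue_max && !l->syncookies)
                return true;
        if (l->accept_queue_len >= l->accept_queue_max && l->young > 1)
                return true;
        return false;
}

int main(void)
{
        struct listener l = { .syn_queue_len = 256, .syn_queue_max = 256,
                              .accept_queue_len = 10, .accept_queue_max = 128,
                              .syncookies = false };
        printf("drop SYN? %s\n", syn_should_drop(&l) ? "yes" : "no"); /* yes */
        return 0;
}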
1523: tcp_openreq_init saves the information carried in the SYN request:
1075 static inline void tcp_openreq_init(struct request_sock *req,
1076                                     struct tcp_options_received *rx_opt,
1077                                     struct sk_buff *skb)
1078 {
1079         struct inet_request_sock *ireq = inet_rsk(req);
1080
1081         req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
1082         req->cookie_ts = 0;
1083         tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
1084         tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
1085         tcp_rsk(req)->snt_synack = 0;
1086         req->mss = rx_opt->mss_clamp;
1087         req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
1088         ireq->tstamp_ok = rx_opt->tstamp_ok;
1089         ireq->sack_ok = rx_opt->sack_ok;
1090         ireq->snd_wscale = rx_opt->snd_wscale;
1091         ireq->wscale_ok = rx_opt->wscale_ok;
1092         ireq->acked = 0;
1093         ireq->ecn_ok = 0;
1094         ireq->rmt_port = tcp_hdr(skb)->source;
1095         ireq->loc_port = tcp_hdr(skb)->dest;
1096 }
1084: rcv_nxt is the sequence number of the next byte expected from the peer, which here must be the SYN's seq + 1; when an ACK segment is sent, rcv_nxt is used as the acknowledgment number. For example, if the client's initial sequence number is 1000, rcv_nxt is set to 1001 and the SYN|ACK will carry ack_seq = 1001.
Back to tcp_v4_conn_request:
1526-1529: save the addresses, options, and other information for later use when the established connection is created.
1609-1610: fill in the IP header of the SYN|ACK and send it. The SYN|ACK that was sent is not kept in any queue, which avoids wasting memory during a SYN flood attack.
1618: add this request_sock to the SYN hash table and set a timer; when the timer expires, the SYN|ACK is retransmitted or the request_sock is deleted.
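The timer here follows the usual exponential backoff: the timeout starts at TCP_TIMEOUT_INIT (commonly 1 second in kernels of this vintage) and doubles on every retry until the retry limit (tcp_synack_retries, default 5) is exhausted, after which the request_sock is dropped. A small sketch of that schedule, assuming those default values:

#include <stdio.h>

int main(void)
{
        /* Assumed values: TCP_TIMEOUT_INIT of 1 s and 5 retries
         * (the common default of tcp_synack_retries). */
        unsigned int timeout = 1, elapsed = 0;
        for (int retry = 1; retry <= 5; retry++) {
                elapsed += timeout;
                printf("retry %d: SYN|ACK retransmitted at t=%us (timeout was %us)\n",
                       retry, elapsed, timeout);
                timeout *= 2; /* exponential backoff */
        }
        /* Total wait: 1+2+4+8+16 = 31 s, after which the half-open
         * connection is abandoned. */
        return 0;
}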
Now let's look at how the SYN|ACK is built and sent. The SYN|ACK is constructed by tcp_make_synack:
Code analysis:
2654 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2655                                 struct request_sock *req,
2656                                 struct tcp_fastopen_cookie *foc)
2657 {
2658         struct tcp_out_options opts;
2659         struct inet_request_sock *ireq = inet_rsk(req);
2660         struct tcp_sock *tp = tcp_sk(sk);
2661         struct tcphdr *th;
2662         struct sk_buff *skb;
2663         struct tcp_md5sig_key *md5;
2664         int tcp_header_size;
2665         int mss;
2666
2667         skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); /* allocate the skb */
2668         if (unlikely(!skb)) {
2669                 dst_release(dst);
2670                 return NULL;
2671         }
2672         /* Reserve space for headers. */
2673         skb_reserve(skb, MAX_TCP_HEADER);
2674
2675         skb_dst_set(skb, dst);
...
2682         if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2683                 __u8 rcv_wscale;
2684                 /* Set this up on the first call only */
2685                 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2686
2687                 /* limit the window selection if the user enforce a smaller rx buffer */
2688                 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2689                     (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2690                         req->window_clamp = tcp_full_space(sk);
2691
2692                 /* tcp_full_space because it is guaranteed to be the first packet */
2693                 tcp_select_initial_window(tcp_full_space(sk),
2694                         mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2695                         &req->rcv_wnd,
2696                         &req->window_clamp,
2697                         ireq->wscale_ok,
2698                         &rcv_wscale,
2699                         dst_metric(dst, RTAX_INITRWND));
2700                 ireq->rcv_wscale = rcv_wscale;
2701         }
...
2710         tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
2711                                              foc) + sizeof(*th);
2712
2713         skb_push(skb, tcp_header_size);
2714         skb_reset_transport_header(skb);
2715
2716         th = tcp_hdr(skb);
2717         memset(th, 0, sizeof(struct tcphdr));
2718         th->syn = 1;
2719         th->ack = 1;
2720         TCP_ECN_make_synack(req, th);
2721         th->source = ireq->loc_port;
2722         th->dest = ireq->rmt_port;
...
2726         tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2727                              TCPHDR_SYN | TCPHDR_ACK);
2728
2729         th->seq = htonl(TCP_SKB_CB(skb)->seq);
2730         /* XXX data is queued and acked as is. No buffer/window check */
2731         th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2732
2733         /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2734         th->window = htons(min(req->rcv_wnd, 65535U));
2735         tcp_options_write((__be32 *)(th + 1), tp, &opts);
2736         th->doff = (tcp_header_size >> 2);
...
2747         return skb;
2748 }
2693-2700: initialize the advertised window size.
2710-2711: build the options field.
2716-2736: fill in the fields of the TCP header. Note line 2731: the SYN|ACK's ack_seq is set from the rcv_nxt saved by tcp_openreq_init, which is the SYN packet's seq + 1.
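Tying lines 2718-2736 together, here is a user-space sketch of how the key SYN|ACK fields relate; the struct is a simplified stand-in for struct tcphdr, and the sequence numbers are made-up example values:

#include <arpa/inet.h> /* htonl, htons, ntohl, ntohs */
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the fields tcp_make_synack fills in. */
struct synack_hdr {
        uint32_t seq;     /* network byte order */
        uint32_t ack_seq; /* network byte order */
        uint16_t window;  /* network byte order; never scaled in a SYN|ACK */
        uint8_t  doff;    /* header length in 32-bit words */
        uint8_t  syn, ack;
};

int main(void)
{
        uint32_t client_isn = 1000;  /* seq of the received SYN */
        uint32_t server_isn = 5000;  /* e.g. produced by tcp_v4_init_sequence() */
        uint32_t rcv_wnd    = 14600; /* example initial window */
        int      hdr_len    = 40;    /* 20 bytes base header + 20 bytes of options */

        struct synack_hdr th = {
                .seq     = htonl(server_isn),     /* line 2729 */
                .ack_seq = htonl(client_isn + 1), /* line 2731: rcv_nxt = SYN seq + 1 */
                /* RFC 1323: the window in SYN and SYN|ACK segments is never
                 * scaled, so it is clamped to 65535 (line 2734). */
                .window  = htons(rcv_wnd < 65535 ? rcv_wnd : 65535),
                .doff    = hdr_len >> 2,          /* line 2736 */
                .syn     = 1,
                .ack     = 1,
        };

        printf("seq=%u ack_seq=%u win=%u doff=%u\n",
               (unsigned)ntohl(th.seq), (unsigned)ntohl(th.ack_seq),
               (unsigned)ntohs(th.window), (unsigned)th.doff);
        return 0;
}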
At this point the SYN|ACK is fully built; constructing the rest of the packet and sending it is done by ip_build_and_send_pkt:
129 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
130                           __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
131 {
132         struct inet_sock *inet = inet_sk(sk);
133         struct rtable *rt = skb_rtable(skb);
134         struct iphdr *iph;
135
136         /* Build the IP header. */
137         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
138         skb_reset_network_header(skb);
139         iph = ip_hdr(skb);
140         iph->version = 4; /* version */
141         iph->ihl = 5; /* header length */
142         iph->tos = inet->tos; /* TOS: type of service */
143         if (ip_dont_fragment(sk, &rt->dst))
144                 iph->frag_off = htons(IP_DF); /* don't fragment */
145         else
146                 iph->frag_off = 0;
147         iph->ttl = ip_select_ttl(inet, &rt->dst); /* TTL: time to live */
148         iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149         iph->saddr = saddr;
150         iph->protocol = sk->sk_protocol; /* transport-layer protocol number */
151         ip_select_ident(iph, &rt->dst, sk);
152
153         if (opt && opt->opt.optlen) {
154                 iph->ihl += opt->opt.optlen >> 2;
155                 ip_options_build(skb, &opt->opt, daddr, rt, 0); /* build the options field */
156         }
157
158         skb->priority = sk->sk_priority;
159         skb->mark = sk->sk_mark;
160
161         /* Send it out. */
162         return ip_local_out(skb);
163 }
To summarize, on receiving a SYN request the server-side TCP does the following:
1. Creates a relatively small data structure, a request_sock, and saves the connection information in it.
2. Adds the request_sock to the SYN table so that the connection information can be found when the ACK arrives.
3. Builds a SYN|ACK packet and sends it out, setting a retransmission timer in case the SYN|ACK is lost.
Having sent the SYN|ACK, the server waits for the ACK that the client sends after processing the SYN|ACK. How will the client handle the SYN|ACK? And how will the server then process the ACK to complete the three-way handshake?