TCP握手分为三个阶段,在握手开始之前,客户端套接字的状态为“TCP_CLOSE”,服务端监听套接字的状态为“TCP_LISTEN”,以下是这三个阶段:
(1)客户端发送一个标志位中SYN位为1的报文给服务端,并设套接字状态为“TCP_SYN_SENT”
(2)服务端接到SYN报文,设套接字状态为“TCP_SYN_RECV”,并回送一个SYN位和ACK位均为1的报文
(3)客户端接到SYN+ACK报文,回送一个ACK位为1的报文,设套接字状态为“TCP_ESTABLISHED”,服务端接到ACK报文后,同样设置为“TCP_ESTABLISHED”
第一阶段
第一阶段由客户端调用connect函数完成,connect系统调用实际上调用了内核中的__sys_connect函数。
以下代码是文件net/socket.c中connect系统调用的定义,由此可以看出,__sys_connect函数就是connect在内核中的实现。
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
int, addrlen)
{
return __sys_connect(fd, uservaddr, addrlen);
}
从__sys_connect函数开始进入三次握手的第一阶段,以下是部分代码:
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
...
sock = sockfd_lookup_light(fd, &err, &fput_needed);
...
err = move_addr_to_kernel(uservaddr, addrlen, &address);
...
err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
sock->file->f_flags);
...
}
对于TCP套接字,代码中的sock->ops->connect指向inet_stream_connect函数,它经由__inet_stream_connect调用sk->sk_prot->connect,也就是tcp_v4_connect函数,现在转到tcp_v4_connect函数:
1 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
2 {
3 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
4 struct inet_sock *inet = inet_sk(sk);
5 struct tcp_sock *tp = tcp_sk(sk);
6 __be16 orig_sport, orig_dport;
7 __be32 daddr, nexthop;
8 struct flowi4 *fl4;
9 struct rtable *rt;
10 int err;
11 struct ip_options_rcu *inet_opt;
12 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
13
14 if (addr_len < sizeof(struct sockaddr_in))
15 return -EINVAL;
16
17 if (usin->sin_family != AF_INET)
18 return -EAFNOSUPPORT;
19
20 nexthop = daddr = usin->sin_addr.s_addr;
21 inet_opt = rcu_dereference_protected(inet->inet_opt,
22 lockdep_sock_is_held(sk));
23 if (inet_opt && inet_opt->opt.srr) {
24 if (!daddr)
25 return -EINVAL;
26 nexthop = inet_opt->opt.faddr;
27 }
28
29 orig_sport = inet->inet_sport;
30 orig_dport = usin->sin_port;
31 fl4 = &inet->cork.fl.u.ip4;
32 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
33 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
34 IPPROTO_TCP,
35 orig_sport, orig_dport, sk);
36 if (IS_ERR(rt)) {
37 err = PTR_ERR(rt);
38 if (err == -ENETUNREACH)
39 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
40 return err;
41 }
42
43 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
44 ip_rt_put(rt);
45 return -ENETUNREACH;
46 }
47
48 if (!inet_opt || !inet_opt->opt.srr)
49 daddr = fl4->daddr;
50
51 if (!inet->inet_saddr)
52 inet->inet_saddr = fl4->saddr;
53 sk_rcv_saddr_set(sk, inet->inet_saddr);
54
55 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
56 /* Reset inherited state */
57 tp->rx_opt.ts_recent = 0;
58 tp->rx_opt.ts_recent_stamp = 0;
59 if (likely(!tp->repair))
60 tp->write_seq = 0;
61 }
62
63 inet->inet_dport = usin->sin_port;
64 sk_daddr_set(sk, daddr);
65
66 inet_csk(sk)->icsk_ext_hdr_len = 0;
67 if (inet_opt)
68 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
69
70 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
71
72 /* Socket identity is still unknown (sport may be zero).
73 * However we set state to SYN-SENT and not releasing socket
74 * lock select source port, enter ourselves into the hash tables and
75 * complete initialization after this.
76 */
77 tcp_set_state(sk, TCP_SYN_SENT);
78 err = inet_hash_connect(tcp_death_row, sk);
79 if (err)
80 goto failure;
81
82 sk_set_txhash(sk);
83
84 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
85 inet->inet_sport, inet->inet_dport, sk);
86 if (IS_ERR(rt)) {
87 err = PTR_ERR(rt);
88 rt = NULL;
89 goto failure;
90 }
91 /* OK, now commit destination to socket. */
92 sk->sk_gso_type = SKB_GSO_TCPV4;
93 sk_setup_caps(sk, &rt->dst);
94 rt = NULL;
95
96 if (likely(!tp->repair)) {
97 if (!tp->write_seq)
98 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
99 inet->inet_daddr,
100 inet->inet_sport,
101 usin->sin_port);
102 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
103 inet->inet_saddr,
104 inet->inet_daddr);
105 }
106
107 inet->inet_id = tp->write_seq ^ jiffies;
108
109 if (tcp_fastopen_defer_connect(sk, &err))
110 return err;
111 if (err)
112 goto failure;
113
114 err = tcp_connect(sk);
115
116 if (err)
117 goto failure;
118
119 return 0;
120
121 failure:
122 /*
123 * This unhashes the socket and releases the local port,
124 * if necessary.
125 */
126 tcp_set_state(sk, TCP_CLOSE);
127 ip_rt_put(rt);
128 sk->sk_route_caps = 0;
129 inet->inet_dport = 0;
130 return err;
131 }
在tcp_v4_connect函数中为套接字填充一些变量,将套接字的状态修改为“TCP_SYN_SENT”,然后进入tcp_connect函数。
1 int tcp_connect(struct sock *sk)
2 {
3 struct tcp_sock *tp = tcp_sk(sk);
4 struct sk_buff *buff;
5 int err;
6
7 tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
8
9 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
10 return -EHOSTUNREACH; /* Routing failure or similar. */
11
12 tcp_connect_init(sk);
13
14 if (unlikely(tp->repair)) {
15 tcp_finish_connect(sk, NULL);
16 return 0;
17 }
18
19 buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
20 if (unlikely(!buff))
21 return -ENOBUFS;
22
23 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
24 tcp_mstamp_refresh(tp);
25 tp->retrans_stamp = tcp_time_stamp(tp);
26 tcp_connect_queue_skb(sk, buff);
27 tcp_ecn_send_syn(sk, buff);
28 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
29
30 /* Send off SYN; include data in Fast Open. */
31 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
32 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
33
39 ...47
48 /* Timer for repeating the SYN until an answer. */
49 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
50 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
51 return 0;
52 }
通过调用tcp_transmit_skb函数构造SYN报文并发送出去,并设立一个定时器。
这一阶段函数的调用栈:
__sys_connect -> inet_stream_connect -> __inet_stream_connect -> tcp_v4_connect -> tcp_connect -> tcp_transmit_skb
第二阶段
这一阶段从服务端通过tcp_v4_rcv函数从IP层接收SYN报文开始,以下是tcp_v4_rcv的部分代码:
1 int tcp_v4_rcv(struct sk_buff *skb)
2 {
3 ...
4
5 if (sk->sk_state == TCP_LISTEN) {
6 ret = tcp_v4_do_rcv(sk, skb);
7 goto put_and_return;
8 }
9
10 ...
11
12 put_and_return:
13 if (refcounted)
14 sock_put(sk);
15
16 return ret;
17 ...
18 }
由于当前套接字状态为“TCP_LISTEN”,进入tcp_v4_do_rcv函数执行
1 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
2 {
3 ...
4
5 if (sk->sk_state == TCP_LISTEN) {
6
7 if (tcp_rcv_state_process(sk, skb)) {
8 rsk = sk;
9 goto reset;
10 }
11 return 0;
12 ...
13 }
tcp_rcv_state_process函数专门用来处理套接字状态的转换,先贴出一张状态转换图:
1 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
2 {
3 ...
4
5 switch (sk->sk_state) {
6 case TCP_LISTEN:
7 if (th->ack)
8 return 1;
9
10 if (th->rst)
11 goto discard;
12
13 if (th->syn) {
14 if (th->fin)
15 goto discard;
16 /* It is possible that we process SYN packets from backlog,
17 * so we need to make sure to disable BH and RCU right there.
18 */
19 rcu_read_lock();
20 local_bh_disable();
21 acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
22 local_bh_enable();
23 rcu_read_unlock();
24
25 if (!acceptable)
26 return 1;
27 consume_skb(skb);
28 return 0;
29 }
30 goto discard;
31 ...
32 }
这是tcp_rcv_state_process在“TCP_LISTEN”状态下执行的代码,核心在于21行的icsk->icsk_af_ops->conn_request,在此处依次执行tcp_v4_conn_request, tcp_conn_request。
以下是tcp_conn_request的部分代码:
1 if (fastopen_sk) {
2 af_ops->send_synack(fastopen_sk, dst, &fl, req,
3 &foc, TCP_SYNACK_FASTOPEN);
4 /* Add the child socket directly into the accept queue */
5 inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
6 sk->sk_data_ready(sk);
7 bh_unlock_sock(fastopen_sk);
8 sock_put(fastopen_sk);
9 } else {
10 tcp_rsk(req)->tfo_listener = false;
11 if (!want_cookie)
12 inet_csk_reqsk_queue_hash_add(sk, req,
13 tcp_timeout_init((struct sock *)req));
14 af_ops->send_synack(sk, dst, &fl, req, &foc,
15 !want_cookie ? TCP_SYNACK_NORMAL :
16 TCP_SYNACK_COOKIE);
17 if (want_cookie) {
18 reqsk_free(req);
19 return 0;
20 }
21 }
主要执行了send_synack函数,send_synack函数用于将SYN+ACK报文发送出去。
这一阶段函数的调用栈:
tcp_v4_rcv -> tcp_v4_do_rcv -> tcp_rcv_state_process -> tcp_v4_conn_request -> tcp_conn_request -> tcp_v4_send_synack
第三阶段
同上一阶段一样,客户端从IP层接收到SYN+ACK报文后一路执行tcp_v4_rcv, tcp_v4_do_rcv,进入tcp_rcv_state_process函数:
1 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
2 const struct tcphdr *th, unsigned int len)
3 {
4 ...
5 switch (sk->sk_state) {
6 case TCP_SYN_SENT:
7 //进入到synack报文的处理流程
8 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
9 if (queued >= 0)
10 return queued;
11
12 /* Do step6 onward by hand. */
13 tcp_urg(sk, skb, th);
14 __kfree_skb(skb);
15 tcp_data_snd_check(sk);
16 return 0;
17 }
18 ...
19 }
在tcp_rcv_synsent_state_process函数中,收到合法的SYN+ACK报文后会先调用tcp_finish_connect函数,再调用tcp_send_ack函数,总共做了三件事:
(1)在tcp_finish_connect函数中将套接字状态设置为"TCP_ESTABLISHED"
(2)tcp_finish_connect返回后,调用tcp_send_ack函数发送一个ACK包
(3)在tcp_finish_connect函数中初始化一些参数
tcp_send_ack函数又调用tcp_transmit_skb将ACK报文从网络上发出去。
最后是服务端接收到ACK报文,依次执行tcp_v4_rcv,tcp_v4_do_rcv,tcp_rcv_state_process函数,将套接字的状态设置为"TCP_ESTABLISHED",至此,三次握手过程结束。
这一阶段函数的调用栈:
客户端:tcp_v4_rcv -> tcp_v4_do_rcv -> tcp_rcv_state_process -> tcp_rcv_synsent_state_process -> tcp_send_ack -> tcp_transmit_skb
服务端:tcp_v4_rcv -> tcp_v4_do_rcv -> tcp_rcv_state_process