内核中的TCP的追踪分析-17-TCP(IPV4)的客户端与服务器端socket连接过程-4
我们昨天跟踪到了tcp_v4_conn_request()函数,我们分段来看这个函数
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct inet_request_sock *ireq;
struct tcp_options_received tmp_opt;
struct request_sock *req;
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif
/* Never answer to SYNs send to broadcast or multicast */
if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
} else
#endif
goto drop;
}
/* Accept backlog is full. If we have already queued enough
* of warm entries in syn queue, drop request. It is better than
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
req = inet_reqsk_alloc(&tcp_request_sock_ops);
if (!req)
goto drop;
#ifdef CONFIG_TCP_MD5SIG
tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif
我们只围绕重要的核心部分,代码中的英文注释部分我们就不看了,我们从上面看到在上面的代码中分配了一个struct request_sock 结构变量req,这个结构在第六节
http://blog.chinaunix.net/u2/64681/showart.php?id=1404050
中讲到了这个结构是用来代表连接请求的。所以通过inet_reqsk_alloc()函数分配了一个连接请求来代表有客户端的连接到来。我们接着看代码
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = 536;
tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
tcp_parse_options(skb, &tmp_opt, 0);
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
/* Some OSes (unknown ones, but I see them on web server, which
* contains information interesting only for windows'
* users) do not send their stamp in SYN. It is easy case.
* We simply do not advertise TS support.
*/
tmp_opt.saw_tstamp = 0;
tmp_opt.tstamp_ok = 0;
}
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
ireq->opt = tcp_v4_save_options(sk, skb);
在上面这部分的代码中有一个数据结构
struct tcp_options_received {
/* PAWS/RTTM data */
long ts_recent_stamp;/* Time we stored ts_recent (for aging) */
u32 ts_recent; /* Time stamp to echo next */
u32 rcv_tsval; /* Time stamp value */
u32 rcv_tsecr; /* Time stamp echo reply */
u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */
tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */
dsack : 1, /* D-SACK is scheduled */
wscale_ok : 1, /* Wscale seen on SYN packet */
sack_ok : 4, /* SACK seen on SYN packet */
snd_wscale : 4, /* Window scaling received from sender */
rcv_wscale : 4; /* Window scaling to send to receiver */
/* SACKs data */
u8 eff_sacks; /* Size of SACK array to send with next packet */
u8 num_sacks; /* Number of SACK blocks */
u16 user_mss; /* mss requested by user in ioctl */
u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
};
这个结构用来记录客户端也就是发送方的一些tcp的信息,我们看到在上面的代码中首先是根据数据包来初始化设置struct tcp_options_received结构变量tmp_opt,关于其的操作我们就不细看了,这里还有一个代表INET的sock的请求连接结构体
struct inet_request_sock {
struct request_sock req;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
u16 inet6_rsk_offset;
/* 2 bytes hole, try to pack */
#endif
__be32 loc_addr;
__be32 rmt_addr;
__be16 rmt_port;
u16 snd_wscale : 4,
rcv_wscale : 4,
tstamp_ok : 1,
sack_ok : 1,
wscale_ok : 1,
ecn_ok : 1,
acked : 1;
struct ip_options *opt;
};
我们也把他列在上面了供将来分析使用,代码中也根据数据包对其进行了初始化设置,除了目标地址和来源地址的设置外,还对其内部的opt这个关于ip选项的结构进行了设置
ireq->opt = tcp_v4_save_options(sk, skb);
这是调用tcp_v4_save_options函数
static struct ip_options *tcp_v4_save_options(struct sock *sk,
struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
struct ip_options *dopt = NULL;
if (opt && opt->optlen) {
int opt_size = optlength(opt);
dopt = kmalloc(opt_size, GFP_ATOMIC);
if (dopt) {
if (ip_options_echo(dopt, skb)) {
kfree(dopt);
dopt = NULL;
}
}
}
return dopt;
}
这个函数是根据数据包中的ip_options结构内容来分配一个新的结构变量并通过ip_options_echo()进步的初始化。请朋友们自行阅读ip_options_echo()函数,因为他大多是一些初始化的操作我们就不跟进了。我们继续看tcp_v4_conn_request()函数的代码
if (!want_cookie)
TCP_ECN_create_request(req, tcp_hdr(skb));
if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
syn_flood_warning(skb);
req->cookie_ts = tmp_opt.tstamp_ok;
#endif
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
} else if (!isn) {
struct inet_peer *peer = NULL;
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
* state TIME-WAIT, and check against it before
* accepting new connection request.
*
* If "isn" is not zero, this request hit alive
* timewait bucket, so that all the necessary checks
* are made in the function processing timewait state.
*/
if (tmp_opt.saw_tstamp &&
tcp_death_row.sysctl_tw_recycle &&
(dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
if (get_seconds() peer->tcp_ts_stamp + TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
TCP_PAWS_WINDOW) {
NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
}
/* Kill the following clause, if you dislike this way. */
else if (!sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk)
(sysctl_max_syn_backlog >> 2)) &&
(!peer || !peer->tcp_ts_stamp) &&
(!dst || !dst_metric(dst, RTAX_RTT))) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
* It means that we continue to communicate
* to destinations, already remembered
* to the moment of synflood.
*/
LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
"request from " NIPQUAD_FMT "/%u\n",
NIPQUAD(saddr),
ntohs(tcp_hdr(skb)->source));
goto drop_and_release;
}
isn = tcp_v4_init_sequence(skb);
}
tcp_rsk(req)->snt_isn = isn;
if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
return 0;
drop_and_release:
dst_release(dst);
drop_and_free:
reqsk_free(req);
drop:
return 0;
}
关于CONFIG_SYN_COOKIES的选项不是我们的重点,所以到达代码的尾部,我们看到首先会向客户端回应一个答复ack,以告诉客户端服务器已经接收到了连接请求,这是通过__tcp_v4_send_synack()函数实现的,这也是协议上所称的“第二次握手”,也就代表我们客户端的连接请求到这里是“次握手”我们暂时把它放在下一节中分析。现在还是集中精力看接下来调用的inet_csk_reqsk_queue_hash_add()函数
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
inet_csk_reqsk_queue_added(sk, timeout);
}
函数中首先是调用reqsk_queue_hash_req()将我们这里的客户端请求结构req链入到了INET的专用接收连接队列下监听结构内的syn_table这个hash数组中。我们就不看这个函数代码了,接下来就要调用inet_csk_reqsk_queue_added()函数增加接收队列中的监听结构体struct listen_sock的相应的计数器并调整定时器。
static inline void inet_csk_reqsk_queue_added(struct sock *sk,
const unsigned long timeout)
{
if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0)
inet_csk_reset_keepalive_timer(sk, timeout);
}
然后函数就层层返回了。按照tcp协议中的介绍第三次握手又会来到,关于tcp的握手协议的详细内容请朋友们看
http://baike.baidu.com/view/1003841.htm
,第二次握手也就是服务器“答复”发送数据给客户端,我们下一节中讲解,这里假设客户端接受到服务器的ack后,再次发送握手的数据到来所以又会执行到上一节我们看到的
http://blog.chinaunix.net/u2/64681/showart.php?id=1656780
tcp_v4_do_rcv()函数中,我们上一节没有看tcp_v4_hnd_req()这个函数,在上一节我们没有看这个函数是因为他还没有创建请求request_sock并链入到队列中。
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
iph->saddr, iph->daddr);
if (req)
return tcp_check_req(sk, skb, req, prev);
nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
th->source, iph->daddr, th->dest, inet_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}
#ifdef CONFIG_SYN_COOKIES
if (!th->rst && !th->syn && th->ack)
sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
return sk;
}
我们看到首先是调用inet_csk_search_req()来查找我们上面看到的刚刚挂入到接收队列中的请求结构体request_sock,接着进入tcp_check_req()函数,这个函数的英文注释非常多
struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev)
{
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
int paws_reject = 0;
struct tcp_options_received tmp_opt;
struct sock *child;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
tcp_parse_options(skb, &tmp_opt, 0);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
/* We do not store true stamp, but it is not required,
* it can be estimated (approximately)
* from another data.
*/
tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)req->retrans);
paws_reject = tcp_paws_check(&tmp_opt, th->rst);
}
}
/* Check for pure retransmitted SYN. */
if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
flg == TCP_FLAG_SYN &&
!paws_reject) {
/*
* RFC793 draws (Incorrectly! It was fixed in RFC1122)
* this case on figure 6 and figure 8, but formal
* protocol description says NOTHING.
* To be more exact, it says that we should send ACK,
* because this segment (at least, if it has no data)
* is out of window.
*
* CONCLUSION: RFC793 (even with RFC1122) DOES NOT
* describe SYN-RECV state. All the description
* is wrong, we cannot believe to it and should
* rely only on common sense and implementation
* experience.
*
* Enforce "SYN-ACK" according to figure 8, figure 6
* of RFC793, fixed by RFC1122.
*/
req->rsk_ops->rtx_syn_ack(sk, req);
return NULL;
}
/* Further reproduces section "SEGMENT ARRIVES"
for state SYN-RECEIVED of RFC793.
It is broken, however, it does not work only
when SYNs are crossed.
You would think that SYN crossing is impossible here, since
we should have a SYN_SENT socket (from connect()) on our end,
but this is not true if the crossed SYNs were sent to both
ends by a malicious third party. We must defend against this,
and to do that we first verify the ACK (as per RFC793, page
36) and reset if it is invalid. Is this a true full defense?
To convince ourselves, let us consider a way in which the ACK
test can still pass in this 'malicious crossed SYNs' case.
Malicious sender sends identical SYNs (and thus identical sequence
numbers) to both A and B:
A: gets SYN, seq=7
B: gets SYN, seq=7
By our good fortune, both A and B select the same initial
send sequence number of seven :-)
A: sends SYN|ACK, seq=7, ack_seq=8
B: sends SYN|ACK, seq=7, ack_seq=8
So we are now A eating this SYN|ACK, ACK test passes. So
does sequence test, SYN is truncated, and thus we consider
it a bare ACK.
If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
bare ACK. Otherwise, we create an established connection. Both
ends (listening sockets) accept the new incoming connection and try
to talk to each other. 8-)
Note: This case is both harmless, and rare. Possibility is about the
same as us discovering intelligent life on another plant tomorrow.
But generally, we should (RFC lies!) to accept ACK
from SYNACK both here and in tcp_rcv_state_process().
tcp_rcv_state_process() does not, hence, we do not too.
Note that the case is absolutely generic:
we cannot optimize anything here without
violating protocol. All the checks must be made
before attempt to create socket.
*/
/* RFC793 page 36: "If the connection is in any non-synchronized state ...
* and the incoming segment acknowledges something not yet
* sent (the segment carries an unacceptable ACK) ...
* a reset is sent."
*
* Invalid ACK: reset will be sent by listening socket
*/
if ((flg & TCP_FLAG_ACK) &&
(TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
return sk;
/* Also, it would be not so bad idea to check rcv_tsecr, which
* is essentially ACK extension and too early or too late values
* should cause reset in unsynchronized states.
*/
/* RFC793: "first check sequence number". */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_ack(skb, req);
if (paws_reject)
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
return NULL;
}
/* In sequence, PAWS is OK. */
if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
req->ts_recent = tmp_opt.rcv_tsval;
if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
/* Truncate SYN, it is out of window starting
at tcp_rsk(req)->rcv_isn + 1. */
flg &= ~TCP_FLAG_SYN;
}
/* RFC793: "second check the RST bit" and
* "fourth, check the SYN bit"
*/
if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
goto embryonic_reset;
}
/* ACK sequence verified above, just make sure ACK is
* set. If ACK not set, just silently drop the packet.
*/
if (!(flg & TCP_FLAG_ACK))
return NULL;
/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
return NULL;
}
/* OK, ACK is valid, create big socket and
* feed this segment to it. It will repeat all
* the tests. THIS SEGMENT MUST MOVE SOCKET TO
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*/
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
req, NULL);
if (child == NULL)
goto listen_overflow;
#ifdef CONFIG_TCP_MD5SIG
else {
/* Copy over the MD5 key from the original socket */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
key = tp->af_specific->md5_lookup(sk, child);
if (key != NULL) {
/*
* We're using one, so create a matching key on the
* newsk structure. If we fail to get memory then we
* end up not copying the key across. Shucks.
*/
char *newkey = kmemdup(key->key, key->keylen,
GFP_ATOMIC);
if (newkey) {
if (!tcp_alloc_md5sig_pool())
BUG();
tp->af_specific->md5_add(child, child,
newkey,
key->keylen);
}
}
}
#endif
inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);
return child;
listen_overflow:
if (!sysctl_tcp_abort_on_overflow) {
inet_rsk(req)->acked = 1;
return NULL;
}
embryonic_reset:
NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_reset(sk, skb);
inet_csk_reqsk_queue_drop(sk, req, prev);
return NULL;
}
很多代码都是关于tcp 中的协议要求的,也包括 SYN同步序列编号(Synchronize Sequence Numbers)的检测,我们只关心与我们的主线密切的部分,其余部分的代码将来结合tcp协议,详细讲解,我们目标向上继续对接,代码尾部是将从接收队列中摘除我们前面创建的request_sock结构并修改接收队列中的监听计数器,然后就进入inet_csk_reqsk_queue_add()函数
static inline void inet_csk_reqsk_queue_add(struct sock *sk,
struct request_sock *req,
struct sock *child)
{
reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child);
}
static inline void reqsk_queue_add(struct request_sock_queue *queue,
struct request_sock *req,
struct sock *parent,
struct sock *child)
{
req->sk = child;
sk_acceptq_added(parent);
if (queue->rskq_accept_head == NULL)
queue->rskq_accept_head = req;
else
queue->rskq_accept_tail->dl_next = req;
queue->rskq_accept_tail = req;
req->dl_next = NULL;
}
非常明显将我们的request_sock代表客户端连接的数据挂入到了服务器的sock中的icsk_accept_queue接收队列中了,这里可以与我们在第7节
http://blog.chinaunix.net/u2/64681/showart.php?id=1404746
接收服务器连接的章节对接了,但是我们应该注意在这里将req中的sock挂入的是child,这个参数是从tcp_check_req()函数中传进来的
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
那么我们首先了解这个函数的过程,因为在服务器接收连接请求的时候调用reqsk_queue_get_child()函数会再次取得这个child指针。在那篇服务器连接的文章中我们也说了inet_sock()转换sock的问题,这个问题已经在前边的分析过程中解释了,也就是“滑动窗口”的协议。我们不重复了。我们这里关注一下这个child的创建过程,从
http://blog.chinaunix.net/u2/64681/showart.php?id=1656780
上一节中我们看到ipv4_specific结构中看到,这里是执行的钩子函数tcp_v4_syn_recv_sock()
.syn_recv_sock = tcp_v4_syn_recv_sock,
我们进入这个函数看一下
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
#endif
if (sk_acceptq_is_full(sk))
goto exit_overflow;
if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
goto exit;
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit;
newsk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(newsk, dst);
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
newinet->daddr = ireq->rmt_addr;
newinet->rcv_saddr = ireq->loc_addr;
newinet->saddr = ireq->loc_addr;
newinet->opt = ireq->opt;
ireq->opt = NULL;
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (newinet->opt)
inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
newinet->id = newtp->write_seq ^ jiffies;
tcp_mtup_init(newsk);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
/*
* We're using one, so create a matching key
* on the newsk structure. If we fail to get
* memory, then we end up not copying the key
* across. Shucks.
*/
char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
if (newkey != NULL)
tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
newkey, key->keylen);
}
#endif
__inet_hash_nolisten(newsk);
__inet_inherit_port(sk, newsk);
return newsk;
exit_overflow:
NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
dst_release(dst);
return NULL;
}
很明显在这里创建了一个新的sock结构,初始设置后返回给上面的 child指针。这个函数中调用了tcp_create_openreq_child()函数
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
if (newsk != NULL) {
const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct tcp_sock *newtp;
/* Now setup tcp_sock */
newtp = tcp_sk(newsk);
newtp->pred_flags = 0;
newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
tcp_prequeue_init(newtp);
tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
newtp->srtt = 0;
newtp->mdev = TCP_TIMEOUT_INIT;
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
newtp->retrans_out = 0;
newtp->sacked_out = 0;
newtp->fackets_out = 0;
newtp->snd_ssthresh = 0x7fffffff;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
newtp->snd_cwnd = 2;
newtp->snd_cwnd_cnt = 0;
newtp->bytes_acked = 0;
newtp->frto_counter = 0;
newtp->frto_highmark = 0;
newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
skb_queue_head_init(&newtp->out_of_order_queue);
newtp->write_seq = treq->snt_isn + 1;
newtp->pushed_seq = newtp->write_seq;
newtp->rx_opt.saw_tstamp = 0;
newtp->rx_opt.dsack = 0;
newtp->rx_opt.eff_sacks = 0;
newtp->rx_opt.num_sacks = 0;
newtp->urg_data = 0;
if (sock_flag(newsk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(newsk,
keepalive_time_when(newtp));
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
if (sysctl_tcp_fack)
tcp_enable_fack(newtp);
}
newtp->window_clamp = req->window_clamp;
newtp->rcv_ssthresh = req->rcv_wnd;
newtp->rcv_wnd = req->rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
} else {
newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
newtp->window_clamp = min(newtp->window_clamp, 65535U);
}
newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window)
newtp->rx_opt.snd_wscale);
newtp->max_window = newtp->snd_wnd;
if (newtp->rx_opt.tstamp_ok) {
newtp->rx_opt.ts_recent = req->ts_recent;
newtp->rx_opt.ts_recent_stamp = get_seconds();
newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
} else {
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
if (newtp->af_specific->md5_lookup(sk, newsk))
newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
TCP_ECN_openreq_child(newtp, req);
TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
}
return newsk;
}
这个函数克隆了一个新的sock结构并将其的状态设置为了TCP_SYN_RECV,还会根据我们的requst_sock来设置,一系列的初始化设置后,tcp_v4_syn_recv_sock()函数为我们的服务器端创建了一个代表客户端的sock结构。具体的过程我们不详细叙述了,有兴趣的朋友可以自己读一下。
文章来源CU社区:内核中的TCP的追踪分析-17-TCP(IPV4)的客户端与服务器端socket连接过程-4
相关文章