内核中的TCP的追踪分析－6-TCP（IPV4)的socket的监听

2020-05-25 00:00:00 函数代码变量结构监听

我们来看一下这个结构中的listen钩子
/*
 * Excerpt of the inet_stream_ops operations table. Only the listen hook is
 * shown here; the other members are elided by the article.
 */
const struct proto_ops inet_stream_ops = {
。。。。。。
.listen = inet_listen, /* listen() on this socket type dispatches to inet_listen() */
。。。。。。
};
很明显，挂入的是inet_listen函数，这个函数在/net/ipv4/af_inet.c中的194行处
/*
 * inet_listen - put a stream socket into the listening state.
 * @sock:    socket the listen() request was issued on
 * @backlog: value stored into sk->sk_max_ack_backlog on success
 *
 * Runs with the socket locked. Returns 0 on success, -EINVAL when the
 * socket is not an unconnected SOCK_STREAM socket or its sk_state is
 * neither CLOSE nor LISTEN, or the error from inet_csk_listen_start().
 */
int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char old_state;
	int err;

	lock_sock(sk);

	err = -EINVAL;
	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
		goto out;

	old_state = sk->sk_state;
	/* Convert the state number into its bit so it can be masked
	 * against the TCPF_* flags.
	 */
	if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
		goto out;

	/* Really, if the socket is already in listen state
	 * we can only allow the backlog to be adjusted.
	 */
	if (old_state != TCP_LISTEN) {
		err = inet_csk_listen_start(sk, backlog);
		if (err)
			goto out;
	}
	sk->sk_max_ack_backlog = backlog;
	err = 0;

out:
	release_sock(sk);
	return err;
}
函数中首先是对socket的状态进行检测，然后如果没有处于监听状态则进入inet_csk_listen_start（）函数中
/*
 * inet_csk_listen_start - allocate the request queue and start listening.
 * @sk:               socket being moved to TCP_LISTEN
 * @nr_table_entries: requested table size, forwarded to reqsk_queue_alloc()
 *
 * Returns 0 on success, the reqsk_queue_alloc() error if the queue cannot
 * be allocated, or -EADDRINUSE when get_port() rejects the local port.
 */
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	int rc;

	rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
	if (rc)
		return rc;

	sk->sk_ack_backlog = 0;
	sk->sk_max_ack_backlog = 0;
	inet_csk_delack_init(sk);

	/* There is race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters to hash table only
	 * after validation is complete.
	 */
	sk->sk_state = TCP_LISTEN;

	if (sk->sk_prot->get_port(sk, inet->num)) {
		/* Port validation failed: roll the state back and drop
		 * the queue allocated above.
		 */
		sk->sk_state = TCP_CLOSE;
		__reqsk_queue_destroy(&icsk->icsk_accept_queue);
		return -EADDRINUSE;
	}

	/* Port accepted: record it and publish the socket. */
	inet->sport = htons(inet->num);
	sk_dst_reset(sk);
	sk->sk_prot->hash(sk);
	return 0;
}
这个函数在/net/ipv4/inet_connection_sock.c中的562行处，我们来看一下，首先是通过inet_sk（）将我们的socket转换成TCP的socket结构，并用inet指针指向他，接着用inet_csk（）将socket转换成struct inet_connection_sock结构指针并用icsk指向。这个结构是专门用来连接用的结构
/*
 * struct inet_connection_sock - per-socket state for connection-oriented
 * inet sockets. It embeds struct inet_sock as its first member and adds the
 * accept queue, timers, congestion-control hooks, delayed-ACK bookkeeping
 * (icsk_ack) and MTU-probing state (icsk_mtup).
 */
struct inet_connection_sock {
/* inet_sock has to be the first member! */
struct inet_sock icsk_inet;
/* Request queue; initialised by reqsk_queue_alloc() when listening starts */
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
__u32 icsk_rto;
__u32 icsk_pmtu_cookie;
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state;
__u8 icsk_retransmits;
__u8 icsk_pending;
__u8 icsk_backoff;
__u8 icsk_syn_retries;
__u8 icsk_probes_out;
__u16 icsk_ext_hdr_len;
/* Delayed-ACK state */
struct {
__u8 pending; /* ACK is pending */
__u8 quick; /* Scheduled number of quick acks */
__u8 pingpong; /* The session is interactive */
__u8 blocked; /* Delayed ACK was blocked by socket lock */
__u32 ato; /* Predicted tick of soft clock */
unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet */
__u16 last_seg_size; /* Size of last incoming segment */
__u16 rcv_mss; /* MSS used for delayed ACK decisions */
} icsk_ack;
/* MTU probing state */
struct {
int enabled;
/* Range of MTUs to search */
int search_high;
int search_low;
/* Information on the current probe. */
int probe_size;
} icsk_mtup;
/* Scratch area of ICSK_CA_PRIV_SIZE bytes; presumably private to the
 * congestion-control module behind icsk_ca_ops - confirm against users.
 */
u32 icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
};
还是我们那句话“先混个面熟”，用时再说具体的变量作用，因为我们下面要用到这个结构所以这里全部贴出。我们再看inet_csk_listen_start（）函数，这里需要我们看一下在以前练习中提到的程序有一句代码：
listen(server_sockfd, 5);
这两个参数我们不用多介绍了，nr_table_entries就是传递过来的数值5。所以我们在阅读代码时一定要注意，这个数值用于限定总共允许多少个客户端的socket连接，如果超过了这个数目，客户端的socket只好睡眠等待了，这些过程我们在unix的socket那些章节中已经详细论述了。我们看到reqsk_queue_alloc（）函数，这个函数就是为了保证我们上面所述的功能
/*
 * reqsk_queue_alloc - allocate and attach the listen table of a request queue.
 * @queue:            request queue to initialise
 * @nr_table_entries: requested number of table slots
 *
 * The request is capped at sysctl_max_syn_backlog, raised to at least 8,
 * and then (value + 1) is rounded up to a power of two. The table is
 * zero-allocated with kzalloc(), or with __vmalloc() when it would exceed
 * one page.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
int reqsk_queue_alloc(struct request_sock_queue *queue,
		      unsigned int nr_table_entries)
{
	size_t lopt_size = sizeof(struct listen_sock);
	struct listen_sock *lopt;

	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
	nr_table_entries = max_t(u32, nr_table_entries, 8);
	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
	lopt_size += nr_table_entries * sizeof(struct request_sock *);

	if (lopt_size > PAGE_SIZE)
		lopt = __vmalloc(lopt_size,
				 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
				 PAGE_KERNEL);
	else
		lopt = kzalloc(lopt_size, GFP_KERNEL);
	if (lopt == NULL)
		return -ENOMEM;

	/* Smallest exponent (never below 3) whose power of two is not
	 * smaller than the table size.
	 */
	for (lopt->max_qlen_log = 3;
	     (1 << lopt->max_qlen_log) < nr_table_entries;
	     lopt->max_qlen_log++)
		;

	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
	rwlock_init(&queue->syn_wait_lock);
	queue->rskq_accept_head = NULL;
	lopt->nr_table_entries = nr_table_entries;

	/* Publish the fully initialised table under the queue's write lock. */
	write_lock_bh(&queue->syn_wait_lock);
	queue->listen_opt = lopt;
	write_unlock_bh(&queue->syn_wait_lock);

	return 0;
}
我是无名小卒，转载请注明出处
http://qinjiana0786.cublog.cn

我们在这里对比一下unix的socket的监听，那里非常的简单只是检查一下连接数并设置一下相应的状态为监听状态就完成了。而TCP的监听过程相对比较复杂了。我们看到上面的函数中又出现了一种新的结构struct listen_sock，这个结构是专用于监听状态作用
/*
 * struct listen_sock - table of pending connection requests attached to a
 * listening socket's request_sock_queue. It is allocated by
 * reqsk_queue_alloc() with nr_table_entries trailing syn_table[] slots.
 */
struct listen_sock {
u8 max_qlen_log; /* exponent computed by the loop in reqsk_queue_alloc(), starting at 3 */
/* 3 bytes hole, try to use */
int qlen;
int qlen_young;
int clock_hand;
u32 hash_rnd; /* filled with random bytes by reqsk_queue_alloc() */
u32 nr_table_entries; /* number of syn_table[] slots */
struct request_sock *syn_table[0]; /* trailing array, sized at allocation time */
};
代码中求小值的宏min_t（）引用了全局变量 int sysctl_max_syn_backlog = 256; 后面求大值的max_t（）也非常容易理解。接着看到roundup_pow_of_two宏，在/include/linux/log2.h文件中
/*
 * roundup_pow_of_two - round n up to the nearest power of two.
 *
 * For a compile-time constant n the result is computed with ilog2() so it
 * can be folded by the compiler; otherwise the work is done at run time by
 * __roundup_pow_of_two().
 *
 * (The excerpt listed this definition twice verbatim; it is given once.)
 */
#define roundup_pow_of_two(n) \
( \
	__builtin_constant_p(n) ? ( \
		((n) == 1) ? 1 : \
		(1UL << (ilog2((n) - 1) + 1)) \
	) : \
	__roundup_pow_of_two(n) \
)
/*
 * __roundup_pow_of_two - run-time helper behind roundup_pow_of_two().
 * @n: value to round up
 *
 * Returns 1UL shifted left by fls_long(n - 1).
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
	return 1UL << fls_long(n - 1);
}
其中__builtin_constant_p(n)是gcc编译器用来检查n是否为编译期常数的内建函数。我们介绍一下__roundup_pow_of_two函数，这个函数的主要作用是先用fls_long(n - 1)求出n - 1最高有效位的位置，再把1UL向左移动这么多位，相当于把n向上取整到2的次幂；而fls_long()的代码纯粹是判断long类型是32位还是64位，从而选择调用fls()还是fls64()
/*
 * fls_long - width-aware wrapper for an unsigned long argument.
 * @l: value to examine
 *
 * Calls fls() when unsigned long is 4 bytes wide and fls64() otherwise,
 * and returns that result.
 */
static inline unsigned fls_long(unsigned long l)
{
	return (sizeof(l) == 4) ? fls(l) : fls64(l);
}
经过max_t取大值之后，我们的数值已经变成了8，所以这里会进入
(1UL << (ilog2((n) - 1) + 1))
/*
 * ilog2 - base-2 logarithm of n, rounded down.
 *
 * For a compile-time constant n the chain of bit tests below selects the
 * index of the highest set bit, so the whole expression can be folded by
 * the compiler; a constant smaller than 1 expands to ____ilog2_NaN().
 * For a non-constant n the macro dispatches on sizeof(n) to
 * __ilog2_u32() or __ilog2_u64().
 */
#define ilog2(n) \
( \
	__builtin_constant_p(n) ? ( \
		(n) < 1 ? ____ilog2_NaN() : \
		(n) & (1ULL << 63) ? 63 : \
		(n) & (1ULL << 62) ? 62 : \
		(n) & (1ULL << 61) ? 61 : \
		(n) & (1ULL << 60) ? 60 : \
		(n) & (1ULL << 59) ? 59 : \
		(n) & (1ULL << 58) ? 58 : \
		(n) & (1ULL << 57) ? 57 : \
		(n) & (1ULL << 56) ? 56 : \
		(n) & (1ULL << 55) ? 55 : \
		(n) & (1ULL << 54) ? 54 : \
		(n) & (1ULL << 53) ? 53 : \
		(n) & (1ULL << 52) ? 52 : \
		(n) & (1ULL << 51) ? 51 : \
		(n) & (1ULL << 50) ? 50 : \
		(n) & (1ULL << 49) ? 49 : \
		(n) & (1ULL << 48) ? 48 : \
		(n) & (1ULL << 47) ? 47 : \
		(n) & (1ULL << 46) ? 46 : \
		(n) & (1ULL << 45) ? 45 : \
		(n) & (1ULL << 44) ? 44 : \
		(n) & (1ULL << 43) ? 43 : \
		(n) & (1ULL << 42) ? 42 : \
		(n) & (1ULL << 41) ? 41 : \
		(n) & (1ULL << 40) ? 40 : \
		(n) & (1ULL << 39) ? 39 : \
		(n) & (1ULL << 38) ? 38 : \
		(n) & (1ULL << 37) ? 37 : \
		(n) & (1ULL << 36) ? 36 : \
		(n) & (1ULL << 35) ? 35 : \
		(n) & (1ULL << 34) ? 34 : \
		(n) & (1ULL << 33) ? 33 : \
		(n) & (1ULL << 32) ? 32 : \
		(n) & (1ULL << 31) ? 31 : \
		(n) & (1ULL << 30) ? 30 : \
		(n) & (1ULL << 29) ? 29 : \
		(n) & (1ULL << 28) ? 28 : \
		(n) & (1ULL << 27) ? 27 : \
		(n) & (1ULL << 26) ? 26 : \
		(n) & (1ULL << 25) ? 25 : \
		(n) & (1ULL << 24) ? 24 : \
		(n) & (1ULL << 23) ? 23 : \
		(n) & (1ULL << 22) ? 22 : \
		(n) & (1ULL << 21) ? 21 : \
		(n) & (1ULL << 20) ? 20 : \
		(n) & (1ULL << 19) ? 19 : \
		(n) & (1ULL << 18) ? 18 : \
		(n) & (1ULL << 17) ? 17 : \
		(n) & (1ULL << 16) ? 16 : \
		(n) & (1ULL << 15) ? 15 : \
		(n) & (1ULL << 14) ? 14 : \
		(n) & (1ULL << 13) ? 13 : \
		(n) & (1ULL << 12) ? 12 : \
		(n) & (1ULL << 11) ? 11 : \
		(n) & (1ULL << 10) ? 10 : \
		(n) & (1ULL << 9) ? 9 : \
		(n) & (1ULL << 8) ? 8 : \
		(n) & (1ULL << 7) ? 7 : \
		(n) & (1ULL << 6) ? 6 : \
		(n) & (1ULL << 5) ? 5 : \
		(n) & (1ULL << 4) ? 4 : \
		(n) & (1ULL << 3) ? 3 : \
		(n) & (1ULL << 2) ? 2 : \
		(n) & (1ULL << 1) ? 1 : \
		(n) & (1ULL << 0) ? 0 : \
		____ilog2_NaN() \
	) : \
	(sizeof(n) <= 4) ? \
	__ilog2_u32(n) : \
	__ilog2_u64(n) \
)
最后我们的连接个数（nr_table_entries）被确定为了16：max_t之后的数值是8，加1后再经roundup_pow_of_two向上取整到2的次幂，得到的是16而不是8。接下来我们看到了一个新的数据结构struct request_sock，它是用来代表socket连接请求用的数据结构
/*
 * struct request_sock - one pending connection request. Pointers to these
 * objects fill listen_sock::syn_table[] and form the rskq_accept_head /
 * rskq_accept_tail list of struct request_sock_queue.
 */
struct request_sock {
struct request_sock *dl_next; /* Must be first member! */
u16 mss;
u8 retrans;
u8 cookie_ts; /* syncookie: encode tcpopts in timestamp */
/* The following two fields can be easily recomputed I think -AK */
u32 window_clamp; /* window clamp at creation time */
u32 rcv_wnd; /* rcv_wnd offered first time */
u32 ts_recent;
unsigned long expires;
const struct request_sock_ops *rsk_ops;
struct sock *sk;
u32 secid;
u32 peer_secid;
};
这个结构指针的大小与我们的连接数一起确定了要分配给struct listen_sock 结构变量指针lopt的内存大小（不超过一页时用kzalloc分配，否则用__vmalloc分配）。我们看到分配成功后将lopt的nr_table_entries 连接数设置为我们上面计算得到的数值16（8 + 1向上取整到2的次幂），然后将inet_csk_listen_start函数中的inet_connection_sock结构变量icsk中的icsk_accept_queue与这里新分配的listen_sock建起关联，icsk_accept_queue是一个struct request_sock_queue结构
/*
 * struct request_sock_queue - queue of connection requests owned by a
 * listening socket (the icsk_accept_queue member of struct
 * inet_connection_sock).
 */
struct request_sock_queue {
struct request_sock *rskq_accept_head; /* reset to NULL by reqsk_queue_alloc() */
struct request_sock *rskq_accept_tail;
rwlock_t syn_wait_lock; /* held for writing while listen_opt is installed */
u8 rskq_defer_accept;
/* 3 bytes hole, try to pack */
struct listen_sock *listen_opt; /* table allocated by reqsk_queue_alloc() */
};
这个结构是专门用于请求连接的socket所使用的队列结构。我们看到在代码中
queue->listen_opt = lopt;
这句代码将icsk中的icsk_accept_queue->listen_opt与这里的lopt挂上钩了。我们再回到inet_csk_listen_start（）函数中，继续往下看，接着看到调用了inet_csk_delack_init（）函数将TCP的sock结构中转化为inet_connection_sock结构指针然后初始化他内部的结构变量icsk_ack为0。
/* Excerpt: the icsk_ack member of struct inet_connection_sock. */
struct {
__u8 pending; /* ACK is pending */
__u8 quick; /* Scheduled number of quick acks */
__u8 pingpong; /* The session is interactive */
__u8 blocked; /* Delayed ACK was blocked by socket lock */
__u32 ato; /* Predicted tick of soft clock */
unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet */
__u16 last_seg_size; /* Size of last incoming segment */
__u16 rcv_mss; /* MSS used for delayed ACK decisions */
} icsk_ack;
这个结构变量是为了连接中的“应答”使用的。我是无名小卒，本文系原创难免有误，欢迎朋友们批评指正。然后我们看到
sk->sk_state = TCP_LISTEN;
将TCP的socket状态设置为了TCP_LISTEN。我们接下来看到是对端口的操作，这部分内容已经在我的博客文章
　中详细讲到了，这里就不再论述了。此后会进入sk->sk_prot->hash(sk)代码处执行
　那里的struct proto tcp_prot结构变量可以看到
.hash = inet_hash
很显然是执行的钩子函数inet_hash这个函数在/net/ipv4/inet_hashtables.c中的379行处

/*
 * inet_hash - hash a socket unless it is in the TCP_CLOSE state.
 * @sk: socket to insert
 *
 * __inet_hash() is called with bottom halves disabled.
 */
void inet_hash(struct sock *sk)
{
	if (sk->sk_state == TCP_CLOSE)
		return;

	local_bh_disable();
	__inet_hash(sk);
	local_bh_enable();
}
接着又进入
/*
 * __inet_hash - insert @sk into the hash tables of its protocol.
 *
 * Sockets that are not in TCP_LISTEN are handed to __inet_hash_nolisten().
 * A listening socket is added to the listening_hash chain selected by
 * inet_sk_listen_hashfn(), the protocol's in-use counter is incremented,
 * and waiters on lhash_wait are woken.
 */
static void __inet_hash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_head *list;
rwlock_t *lock;
if (sk->sk_state != TCP_LISTEN) {
__inet_hash_nolisten(sk);
return;
}
/* The socket must not already be hashed at this point. */
BUG_TRAP(sk_unhashed(sk));
list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
lock = &hashinfo->lhash_lock;
/* NOTE(review): inet_listen_wlock() presumably takes lhash_lock for
 * writing, paired with the write_unlock(lock) below - confirm against
 * its definition.
 */
inet_listen_wlock(hashinfo);
__sk_add_node(sk, list);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock(lock);
wake_up(&hashinfo->lhash_wait);
}
这函数主要是将sock挂入与已经初始化的TCP的hash表中，关于sk->sk_prot->h.hashinfo的TCP的hash表的初始化请看我在TCP的socket地址绑定中的分析部分
在那里是将tcp_hashinfo挂入的
/*
 * tcp_hashinfo - the inet_hashinfo instance used by TCP. Only the
 * listening-hash lock, its user count and its wait queue are initialised
 * statically here.
 */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
.lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
.lhash_users = ATOMIC_INIT(0),
.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
这个结构是inet_hashinfo数据结构类型，所以在其内部有一个专用于listen的hash队列
struct hlist_head listening_hash[INET_LHTABLE_SIZE];
这里取得hash链头后，通过__sk_add_node（）将sock挂入到hash队列中。接着根据全局的网络空间结构变量init_net取得当前cpu的结构信息中的关于协议的使用计数，对其加1操作。后唤醒在hash表队列中等待进程，这是通过wake_up(&hashinfo->lhash_wait)来实现的。我们以前看到过unix的唤醒过程，这里就不看了。

文章来源CU社区：内核中的TCP的追踪分析－6-TCP（IPV4)的socket的监听

相关文章