Tracing TCP in the Kernel - 15 - The TCP (IPv4) Client and Server Socket Connection Process - 2
Continuing from the previous installment.
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct iphdr *iph;
	u32 len;

	/* When the interface is in promisc. mode, drop all the crap
	 * that it receives, do not try to analyse it.
	 */
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto drop;

	IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);

	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
		goto out;
	}

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;

	iph = ip_hdr(skb);

	/*
	 * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 * Is the datagram acceptable?
	 *
	 * 1. Length at least the size of an ip header
	 * 2. Version of 4
	 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 * 4. Doesn't have a bogus length
	 */

	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;

	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;

	iph = ip_hdr(skb);

	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto inhdr_error;

	len = ntohs(iph->tot_len);
	if (skb->len < len) {
		IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))
		goto inhdr_error;

	/* Our transport medium may have padded the buffer out. Now we know it
	 * is IP we can trim to the true length of the frame.
	 * Note this now means skb->len holds ntohs(iph->tot_len).
	 */
	if (pskb_trim_rcsum(skb, len)) {
		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Remove any debris in the socket control block */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

	return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
		       ip_rcv_finish);

inhdr_error:
	IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
	kfree_skb(skb);
out:
	return NET_RX_DROP;
}
Most of the front half of this function is IP-level sanity checking: header length, version, checksum, and total length. We skip over those checks here. At the end, the function hands the packet to the NF_HOOK macro at the NF_INET_PRE_ROUTING hook point, which (absent a contrary netfilter verdict) continues into ip_rcv_finish().
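A quick aside on NF_HOOK before moving on: it is the netfilter entry macro. What follows is only a conceptual sketch of what the PF_INET/NF_INET_PRE_ROUTING invocation amounts to; it ignores the netfilter-disabled build and the queue/stolen verdicts, and nf_run_hooks_sketch() is a hypothetical stand-in for the real hook traversal, not a kernel function:

/* Conceptual model of NF_HOOK(), not the real macro. */
static inline int nf_hook_sketch(int pf, int hooknum, struct sk_buff *skb,
				 struct net_device *in, struct net_device *out,
				 int (*okfn)(struct sk_buff *))
{
	/* Run every hook registered for (pf, hooknum), e.g. iptables
	 * PREROUTING rules; nf_run_hooks_sketch() is hypothetical. */
	int verdict = nf_run_hooks_sketch(pf, hooknum, skb, in, out);

	if (verdict == NF_ACCEPT)	/* all hooks accepted the packet */
		return okfn(skb);	/* continue: here, ip_rcv_finish() */

	kfree_skb(skb);			/* NF_DROP etc.: packet goes no further */
	return -EPERM;
}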
static int ip_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct rtable *rt;

	/*
	 * Initialise the virtual path cache for the packet. It describes
	 * how the packet travels inside Linux networking.
	 */
	if (skb->dst == NULL) {
		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
					 skb->dev);
		if (unlikely(err)) {
			if (err == -EHOSTUNREACH)
				IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
			else if (err == -ENETUNREACH)
				IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
			goto drop;
		}
	}

#ifdef CONFIG_NET_CLS_ROUTE
	if (unlikely(skb->dst->tclassid)) {
		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
		u32 idx = skb->dst->tclassid;
		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes += skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes += skb->len;
	}
#endif

	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;

	rt = skb->rtable;
	if (rt->rt_type == RTN_MULTICAST)
		IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
	else if (rt->rt_type == RTN_BROADCAST)
		IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);

	return dst_input(skb);

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}
Because the packet has only just been received, it carries no routing information yet (no dst_entry is attached), so the function above first routes the packet; this is done by ip_route_input().
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;

	net = dev_net(dev);
	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (((rth->fl.fl4_dst ^ daddr) |
		     (rth->fl.fl4_src ^ saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    rth->rt_genid == atomic_read(&rt_genid)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->rtable = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!ipv4_is_local_multicast(daddr) &&
				IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
The function first searches the kernel's route cache, rt_hash_table[]: on a hit, the cached entry is attached to the packet with skb->rtable = rth (the branch-free key comparison used in the lookup loop is unpacked in the sketch below the link); on a miss, it falls through to ip_route_input_slow(), the "slower" full lookup. We already examined route creation in an earlier installment, when we covered ip_route_output_flow():
http://blog.chinaunix.net/u2/64681/showart_1408613.html
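One idiom from the lookup loop above is worth calling out: instead of five short-circuit comparisons, the kernel ORs together the XORs of the key fields; since (a ^ b) is zero only when a == b, the whole expression is zero only when every field matches, with no conditional branches. (Note also that rth->fl.oif is OR-ed in without an XOR: an input-route cache entry must have oif == 0.) A minimal, standalone illustration of the trick:

#include <stdio.h>

/* Branch-free multi-field equality, as used in the route-cache lookup:
 * OR-ing the XORs of all key pairs yields 0 only if every pair matches. */
static int keys_match(unsigned int dst1, unsigned int dst2,
		      unsigned int src1, unsigned int src2,
		      int iif1, int iif2)
{
	return ((dst1 ^ dst2) | (src1 ^ src2) | (iif1 ^ iif2)) == 0;
}

int main(void)
{
	/* same daddr/saddr/iif: match */
	printf("%d\n", keys_match(0x0a000001, 0x0a000001,
				  0xc0a80001, 0xc0a80001, 2, 2)); /* 1 */
	/* one field differs: no match */
	printf("%d\n", keys_match(0x0a000001, 0x0a000002,
				  0xc0a80001, 0xc0a80001, 2, 2)); /* 0 */
	return 0;
}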
so we will not analyze ip_route_input_slow() in detail again here. The key lines to notice are:
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	......
	rth->u.dst.input = ip_local_deliver;
	......
	err = rt_intern_hash(hash, rth, &skb->rtable);
	......
}
The important point is that the route's input hook is set to the ip_local_deliver() function. The newly built route entry is then stored into the packet's route pointer (skb->rtable) by rt_intern_hash(), which also inserts it into the global route cache rt_hash_table[]. Returning to ip_rcv_finish(): once the packet's route is set, control passes to dst_input().
static inline int dst_input(struct sk_buff *skb)
{
	int err;

	for (;;) {
		err = skb->dst->input(skb);

		if (likely(err == 0))
			return err;
		/* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
		if (unlikely(err != NET_XMIT_BYPASS))
			return err;
	}
}
This clearly dispatches, through skb->dst->input, into the ip_local_deliver() function that ip_route_input_slow() installed above; a toy model of this function-pointer dispatch follows below.
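This input hook is how one call site serves both local delivery and forwarding: the routing code stores ip_local_deliver for locally addressed packets and ip_forward for transit packets, and dst_input() simply calls through the pointer. A toy model with hypothetical stand-in types (not the real kernel structures):

#include <stdio.h>

struct sk_buff_toy;				/* stand-in for struct sk_buff */
typedef int (*input_fn)(struct sk_buff_toy *skb);

struct dst_entry_toy { input_fn input; };	/* stand-in for dst_entry */
struct sk_buff_toy { struct dst_entry_toy *dst; };

static int local_deliver_toy(struct sk_buff_toy *skb) { puts("deliver locally"); return 0; }
static int forward_toy(struct sk_buff_toy *skb)       { puts("forward");         return 0; }

/* Mirrors dst_input(): dispatch through whatever the routing code installed. */
static int dst_input_toy(struct sk_buff_toy *skb) { return skb->dst->input(skb); }

int main(void)
{
	struct dst_entry_toy local = { local_deliver_toy };
	struct dst_entry_toy transit = { forward_toy };
	struct sk_buff_toy skb1 = { &local }, skb2 = { &transit };

	dst_input_toy(&skb1);	/* prints "deliver locally" */
	dst_input_toy(&skb2);	/* prints "forward" */
	return 0;
}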
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 * Reassemble IP fragments.
	 */

	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
		if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
			return 0;
	}

	return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
The function first checks whether the arriving packet is an IP fragment and, if so, hands it to ip_defrag() for reassembly (the frag_off test is unpacked in the sketch below); a complete datagram then passes through the NF_INET_LOCAL_IN hook into ip_local_deliver_finish().
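The condition `ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)` deserves a second look: frag_off packs the flag bits and the 13-bit fragment offset into one 16-bit network-order field, so a packet belongs to a fragmented datagram if either the More-Fragments bit is set (first and middle fragments) or the offset is nonzero (middle and last fragments). A standalone illustration using the kernel's constants:

#include <stdio.h>
#include <arpa/inet.h>	/* htons() */

#define IP_MF     0x2000	/* "more fragments" flag (host order) */
#define IP_OFFSET 0x1FFF	/* mask for the 13-bit fragment offset */

/* True if this packet is any piece of a fragmented datagram:
 * MF set (not the last piece) or offset nonzero (not the first piece). */
static int is_fragment(unsigned short frag_off_net) /* field as on the wire */
{
	return (frag_off_net & htons(IP_MF | IP_OFFSET)) != 0;
}

int main(void)
{
	printf("%d\n", is_fragment(htons(0)));		/* 0: whole datagram  */
	printf("%d\n", is_fragment(htons(IP_MF)));	/* 1: first fragment  */
	printf("%d\n", is_fragment(htons(IP_MF | 185)));/* 1: middle fragment */
	printf("%d\n", is_fragment(htons(185)));	/* 1: last fragment   */
	return 0;
}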
static int ip_local_deliver_finish(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);

	__skb_pull(skb, ip_hdrlen(skb));

	/* Point into the IP datagram, just past the header. */
	skb_reset_transport_header(skb);

	rcu_read_lock();
	{
		int protocol = ip_hdr(skb)->protocol;
		int hash, raw;
		struct net_protocol *ipprot;

	resubmit:
		raw = raw_local_deliver(skb, protocol);

		hash = protocol & (MAX_INET_PROTOS - 1);
		ipprot = rcu_dereference(inet_protos[hash]);
		if (ipprot != NULL && (net == &init_net || ipprot->netns_ok)) {
			int ret;

			if (!ipprot->no_policy) {
				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					kfree_skb(skb);
					goto out;
				}
				nf_reset(skb);
			}
			ret = ipprot->handler(skb);
			if (ret < 0) {
				protocol = -ret;
				goto resubmit;
			}
			IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
		} else {
			if (!raw) {
				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
					icmp_send(skb, ICMP_DEST_UNREACH,
						  ICMP_PROT_UNREACH, 0);
				}
			} else
				IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
			kfree_skb(skb);
		}
	}
out:
	rcu_read_unlock();
	return 0;
}
The function first reads the protocol number from the IP header, masks it into an array index (see the worked example below), and uses that index to fetch the corresponding protocol's struct net_protocol pointer from the inet_protos[] array.
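A quick worked example of the masking. In this kernel MAX_INET_PROTOS is 256, and since the IP protocol field is only 8 bits, the mask is effectively the identity mapping, so there are no collisions:

#include <stdio.h>

#define MAX_INET_PROTOS 256	/* size of inet_protos[], a power of two */
#define IPPROTO_ICMP 1
#define IPPROTO_TCP  6
#define IPPROTO_UDP  17

int main(void)
{
	/* hash = protocol & (MAX_INET_PROTOS - 1) */
	printf("ICMP -> %d\n", IPPROTO_ICMP & (MAX_INET_PROTOS - 1)); /* 1  */
	printf("TCP  -> %d\n", IPPROTO_TCP  & (MAX_INET_PROTOS - 1)); /* 6  */
	printf("UDP  -> %d\n", IPPROTO_UDP  & (MAX_INET_PROTOS - 1)); /* 17 */
	return 0;
}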
struct net_protocol {
	int		(*handler)(struct sk_buff *skb);
	void		(*err_handler)(struct sk_buff *skb, u32 info);
	int		(*gso_send_check)(struct sk_buff *skb);
	struct sk_buff *(*gso_segment)(struct sk_buff *skb,
				       int features);
	unsigned int	no_policy:1,
			netns_ok:1;
};
This structure is used by every registered protocol. The inet_protos[] array is populated during startup, in the initialization function inet_init():
static int __init inet_init(void)
{
	......
	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
	......
}
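For completeness, inet_add_protocol() is a thin registration helper: it masks the protocol number into the same index used on the receive side and claims the slot if it is free. A sketch from memory of the 2.6-era shape (exact locking details may differ from the real net/ipv4/protocol.c):

/* Sketch of inet_add_protocol(): claim inet_protos[hash] for this
 * net_protocol if nobody registered that protocol number yet. */
int inet_add_protocol(struct net_protocol *prot, unsigned char protocol)
{
	int hash = protocol & (MAX_INET_PROTOS - 1);
	int ret;

	spin_lock_bh(&inet_proto_lock);
	if (inet_protos[hash]) {
		ret = -1;		/* slot already taken */
	} else {
		inet_protos[hash] = prot; /* handler now reachable from
					   * ip_local_deliver_finish() */
		ret = 0;
	}
	spin_unlock_bh(&inet_proto_lock);
	return ret;
}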
Since we are discussing IPv4 TCP, the protocol structure of interest is:
static struct net_protocol tcp_protocol = {
	.handler	= tcp_v4_rcv,
	.err_handler	= tcp_v4_err,
	.gso_send_check	= tcp_v4_gso_send_check,
	.gso_segment	= tcp_tso_segment,
	.no_policy	= 1,
	.netns_ok	= 1,
};
So in ip_local_deliver_finish() the ipprot variable resolves to the tcp_protocol structure, and the call
ret = ipprot->handler(skb);
therefore enters the tcp_v4_rcv() hook function, which is where we pick up in the next installment.