作者:[email protected]
博客:linuxfocus.blog.chinaunix.net
在前面的分析数据包接收流程的博文中,都是针对的UDP协议。随着这个流程的贯通,对我来讲一个基本的TCP/IP协议栈的主干已经创建了。后面的学习过程就是从这个主干不断地延伸分支,最后形成一棵完整的TCP/IP协议栈的流程图。
今天时间不多,就延伸一个分支:L4层的TCP如何选择正确的socket来接收数据。在inet_init()中,通过tcp_protocol注册了TCP数据包的处理函数tcp_v4_rcv。下面就从它开始:
/*
 * tcp_v4_rcv - entry point for a received IPv4 TCP segment (excerpt).
 *
 * Only the part up to the socket lookup is shown; the remainder of the
 * function is elided at the end of this listing.  The visible part drops
 * packets whose pkt_type is not PACKET_HOST, validates the header length
 * and checksum, fills in TCP_SKB_CB(skb), and looks up the owning socket.
 */
- int tcp_v4_rcv(struct sk_buff *skb)
- {
- const struct iphdr *iph;
- struct tcphdr *th;
- struct sock *sk;
- int ret;
- struct net *net = dev_net(skb->dev);
/*
 * TCP is connection-oriented, i.e. end-to-end.  A packet whose type is
 * not PACKET_HOST cannot be valid here, so it is dropped right away.
 */
- if (skb->pkt_type != PACKET_HOST)
- goto discard_it;
- /* Count it even if it's bad */
- /* Update the statistics */
- TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
- if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
- goto discard_it;
/* Fetch the TCP header */
- th = tcp_hdr(skb);
/* Sanity check: the data offset must cover at least the fixed TCP header */
- if (th->doff < sizeof(struct tcphdr) / 4)
- goto bad_packet;
- if (!pskb_may_pull(skb, th->doff * 4))
- goto discard_it;
- /* An explanation is required here, I think.
- * Packet length and doff are validated by header prediction,
- * provided case of th->doff==0 is eliminated.
- * So, we defer the checks. */
- /* Verify the checksum */
- if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
- goto bad_packet;
/*
 * Record the TCP seq, ack, etc. in the skb control block.
 * NOTE(review): th is fetched again here, presumably because the
 * pskb_may_pull() call above may have moved the header -- confirm.
 */
- th = tcp_hdr(skb);
- iph = ip_hdr(skb);
- TCP_SKB_CB(skb)->seq = ntohl(th->seq);
/* end_seq = seq + one for SYN + one for FIN + payload length (skb->len minus the TCP header) */
- TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
- skb->len - th->doff * 4);
- TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
- TCP_SKB_CB(skb)->when = 0;
- TCP_SKB_CB(skb)->flags = iph->tos;
- TCP_SKB_CB(skb)->sacked = 0;
/* Look up the matching socket */
- sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
- if (!sk)
- goto no_tcp_socket;
/* The rest of the function is not of interest for now */
- ...... ......
- }
tcp_v4_rcv通过__inet_lookup_skb进入__inet_lookup:
/*
 * __inet_lookup - find the socket for a (saddr, sport, daddr, dport) tuple.
 *
 * The established-connection lookup is tried first; the listener lookup is
 * used only when that returns NULL.  dport arrives in network byte order
 * and is converted to host order (hnum) before either lookup is called.
 *
 * Returns whatever the successful lookup returned, or NULL if both fail.
 */
- static inline struct sock *__inet_lookup(struct net *net,
- struct inet_hashinfo *hashinfo,
- const __be32 saddr, const __be16 sport,
- const __be32 daddr, const __be16 dport,
- const int dif)
- {
- u16 hnum = ntohs(dport);
- /* First try to find a socket with an established connection */
- struct sock *sk = __inet_lookup_established(net, hashinfo,
- saddr, sport, daddr, hnum, dif);
/* If no established socket was found, search the sockets in the listen state (GNU "a ?: b" shorthand) */
- return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
- }
先看__inet_lookup_established
/*
 * __inet_lookup_established - look up a socket in the established hash.
 *
 * Hashes the address/port tuple to select one ehash bucket, then, under
 * rcu_read_lock(), walks that bucket's chain and after it the bucket's
 * twchain.  A candidate is returned only after its refcount has been
 * raised with atomic_inc_not_zero() and the match has been checked again.
 *
 * sport is in network byte order, hnum is the local port in host order.
 *
 * Returns the matching socket with a reference held, or NULL.
 */
- struct sock * __inet_lookup_established(struct net *net,
- struct inet_hashinfo *hashinfo,
- const __be32 saddr, const __be16 sport,
- const __be32 daddr, const u16 hnum,
- const int dif)
- {
- INET_ADDR_COOKIE(acookie, saddr, daddr)
- const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
- struct sock *sk;
- const struct hlist_nulls_node *node;
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
- unsigned int slot = hash & hashinfo->ehash_mask;
/* Get the hash bucket that holds the sockets in the established state */
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
- rcu_read_lock();
- begin:
- sk_nulls_for_each_rcu(sk, node, &head->chain) {
- if (INET_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif)) {
- /* Addresses, ports, etc. all match */
/* NOTE(review): per the article author's quick read of the TIME_WAIT entry path, a socket entering TIME_WAIT is not removed from ehash, so the TIME_WAIT chain may need to be checked -- confirm against the kernel source. */
- if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
- goto begintw; /* refcount is already 0: go and search the TIME_WAIT chain instead */
/*
 * Why match twice: the first INET_MATCH runs before the socket is held.
 * Only a successful atomic_inc_not_zero() holds the sk, so the fields
 * compared earlier could have changed in between; the match is therefore
 * repeated once the reference is held.
 * NOTE(review): presumably the socket memory can be reused for another
 * connection while only rcu_read_lock() is held -- confirm against the
 * kernel's hlist_nulls / RCU documentation.
 */
- if (unlikely(!INET_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif))) {
- sock_put(sk);
- goto begin;
- }
/* Found the socket */
- goto out;
- }
- }
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
- begintw:
- /* Must check for a TIME_WAIT'er before going to listener hash. */
- /* Search the TIME_WAIT chain of the same bucket for a matching socket */
- sk_nulls_for_each_rcu(sk, node, &head->twchain) {
- if (INET_TW_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif)) {
- if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
- /*
- A refcount of 0 means nobody is using this socket any more, so it is treated as invalid.
- */
- sk = NULL;
- goto out;
- }
/* Second match after taking the reference, for the same reason as in the first loop */
- if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif))) {
- sock_put(sk);
- goto begintw;
- }
- goto out;
- }
- }
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begintw;
- sk = NULL;
- out:
- rcu_read_unlock();
- return sk;
- }
以上是对已建立连接的socket的查找,下面是对处于listen状态的socket的查找。
/*
 * __inet_lookup_listener - pick the best-matching listening socket.
 *
 * Selects one listening_hash bucket with inet_lhashfn() on the port, walks
 * it under rcu_read_lock(), scores every socket with compute_score() and
 * keeps the one with the highest score.
 *
 * Returns that socket with its refcount raised, or NULL when no socket
 * scored above -1 or the winner's refcount had already dropped to zero.
 */
- struct sock *__inet_lookup_listener(struct net *net,
- struct inet_hashinfo *hashinfo,
- const __be32 daddr, const unsigned short hnum,
- const int dif)
- {
- struct sock *sk, *result;
- struct hlist_nulls_node *node;
- unsigned int hash = inet_lhashfn(net, hnum);
- struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore;
- rcu_read_lock();
- begin:
- result = NULL;
- hiscore = -1;
- /* Similar to the UDP lookup: compute a match score per socket and keep the best match */
- sk_nulls_for_each_rcu(sk, node, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr, dif);
- if (score > hiscore) {
- result = sk;
- hiscore = score;
- }
- }
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
- goto begin;
- if (result) {
- /* If the socket is no longer in use (refcount already 0), give it up */
- if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
- result = NULL;
- else if (unlikely(compute_score(result, net, hnum, daddr,
- dif) < hiscore)) { /* Re-score now that the reference is held: the first score was computed before the socket was held */
- sock_put(result);
- goto begin;
- }
- }
- rcu_read_unlock();
- return result;
- }
到此,TCP数据包选择对应socket的过程已经完成。
今天学习的东西看上去很简单,但是居然遇到了问题——为什么TCP的socket查找中,要有这种二次重复匹配呢?睡了一晚上,终于想明白了,原因已经更新到上面代码的注释中。