I never quite understood how the three queues involved in TCP receive processing (the prequeue, the backlog, and sk_receive_queue) relate to each other.

Over the past few days, with the help of my colleague Qiu, I finally sorted it out, so I'm writing it down here as a note.

1. Adding packets to the queues in softirq context

tcp_v4_rcv() is the entry point for packet reception at the TCP layer.

int tcp_v4_rcv(struct sk_buff *skb)
{
    const struct iphdr *iph;
    struct tcphdr *th;
    struct sock *sk;
    int ret;
    struct net *net = dev_net(skb->dev);

    if (skb->pkt_type != PACKET_HOST)
        goto discard_it;

    /* Count it even if it's bad */
    TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

    if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
        goto discard_it;

    th = tcp_hdr(skb);

    if (th->doff < sizeof(struct tcphdr) / 4)
        goto bad_packet;
    if (!pskb_may_pull(skb, th->doff * 4))
        goto discard_it;

    /* An explanation is required here, I think.
     * Packet length and doff are validated by header prediction,
     * provided case of th->doff==0 is eliminated.
     * So, we defer the checks. */
    if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
        goto bad_packet;

    th = tcp_hdr(skb);
    iph = ip_hdr(skb);
    TCP_SKB_CB(skb)->seq = ntohl(th->seq);
    TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                skb->len - th->doff * 4);
    TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
    TCP_SKB_CB(skb)->when = 0;
    TCP_SKB_CB(skb)->flags = iph->tos;
    TCP_SKB_CB(skb)->sacked = 0;

    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
    if (!sk)
        goto no_tcp_socket;

process:
    if (sk->sk_state == TCP_TIME_WAIT)
        goto do_time_wait;

    if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
        NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
        goto discard_and_relse;
    }

    if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
        goto discard_and_relse;
    nf_reset(skb);

    if (sk_filter(sk, skb))
        goto discard_and_relse;

    skb->dev = NULL;

    bh_lock_sock_nested(sk);
    ret = 0;
    if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
        struct tcp_sock *tp = tcp_sk(sk);
        if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
            tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
        if (tp->ucopy.dma_chan)
            ret = tcp_v4_do_rcv(sk, skb);
        else
#endif
        {
            if (!tcp_prequeue(sk, skb))         /* first try to put the skb on the prequeue */
                ret = tcp_v4_do_rcv(sk, skb);   /* otherwise process it now; the data ends up on sk_receive_queue */
        }
    } else if (unlikely(sk_add_backlog(sk, skb))) {     /* socket owned by a process: add to the backlog */
        bh_unlock_sock(sk);
        NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
        goto discard_and_relse;
    }
    bh_unlock_sock(sk);

    sock_put(sk);

    return ret;

no_tcp_socket:
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
        goto discard_it;

    if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
        TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
    } else {
        tcp_v4_send_reset(NULL, skb);
    }

discard_it:
    /* Discard frame. */
    kfree_skb(skb);
    return 0;

discard_and_relse:
    sock_put(sk);
    goto discard_it;

do_time_wait:
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
        inet_twsk_put(inet_twsk(sk));
        goto discard_it;
    }

    if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
        TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
        inet_twsk_put(inet_twsk(sk));
        goto discard_it;
    }
    switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
    case TCP_TW_SYN: {
        struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                &tcp_hashinfo,
                                                iph->daddr, th->dest,
                                                inet_iif(skb));
        if (sk2) {
            inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
            inet_twsk_put(inet_twsk(sk));
            sk = sk2;
            goto process;
        }
        /* Fall through to ACK */
    }
    case TCP_TW_ACK:
        tcp_v4_timewait_ack(sk, skb);
        break;
    case TCP_TW_RST:
        goto no_tcp_socket;
    case TCP_TW_SUCCESS:;
    }
    goto discard_it;
}
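The part of this function that matters for the three queues is the dispatch near the bottom: prequeue, immediate processing (which normally ends with the data on sk_receive_queue), or backlog. To keep the decision straight I wrote a tiny self-contained toy model of it; the names and the program are mine, not kernel code:

#include <stdbool.h>
#include <stdio.h>

enum rx_queue { RX_PREQUEUE, RX_RECEIVE_QUEUE, RX_BACKLOG };

/* socket_owned:   a process currently owns the socket lock (it is inside
 *                 tcp_recvmsg() or some other socket call).
 * reader_waiting: tp->ucopy.task is set, i.e. a reader sleeps in tcp_recvmsg().
 * low_latency:    the net.ipv4.tcp_low_latency sysctl is enabled. */
static enum rx_queue dispatch(bool socket_owned, bool reader_waiting,
                              bool low_latency)
{
    if (socket_owned)
        return RX_BACKLOG;          /* sk_add_backlog() */
    if (!low_latency && reader_waiting)
        return RX_PREQUEUE;         /* tcp_prequeue() accepted the skb */
    return RX_RECEIVE_QUEUE;        /* tcp_v4_do_rcv() processes it right away */
}

int main(void)
{
    printf("%d\n", dispatch(true,  false, false));  /* -> backlog */
    printf("%d\n", dispatch(false, true,  false));  /* -> prequeue */
    printf("%d\n", dispatch(false, false, false));  /* -> sk_receive_queue */
    return 0;
}

So, roughly: nobody owns the socket and a reader is waiting (with tcp_low_latency off) → prequeue; nobody owns it but no reader is waiting → processed immediately, landing on sk_receive_queue; a process owns the socket → backlog.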

Let's trace into tcp_prequeue():

static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (sysctl_tcp_low_latency || !tp->ucopy.task)
        return 0;

    __skb_queue_tail(&tp->ucopy.prequeue, skb);
    tp->ucopy.memory += skb->truesize;
    if (tp->ucopy.memory > sk->sk_rcvbuf) {     /* the prequeue has outgrown the receive buffer */
        struct sk_buff *skb1;

        BUG_ON(sock_owned_by_user(sk));

        while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {   /* walk the prequeue */
            sk_backlog_rcv(sk, skb1);   /* process each packet, i.e. move its data to sk_receive_queue */
            NET_INC_STATS_BH(sock_net(sk),
                             LINUX_MIB_TCPPREQUEUEDROPPED);
        }

        tp->ucopy.memory = 0;
    } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
        wake_up_interruptible_sync_poll(sk->sk_sleep,
                                        POLLIN | POLLRDNORM | POLLRDBAND);
        if (!inet_csk_ack_scheduled(sk))
            inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                      (3 * tcp_rto_min(sk)) / 4,
                                      TCP_RTO_MAX);
    }
    return 1;
}
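Note the early return 0: when net.ipv4.tcp_low_latency is set, or when no reader is currently blocked in tcp_recvmsg() (tp->ucopy.task is NULL), the prequeue is skipped entirely. A quick userspace check of that sysctl (my own helper; it assumes the usual /proc/sys/net/ipv4/tcp_low_latency path of kernels of this era):

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/sys/net/ipv4/tcp_low_latency", "r");
    int val = -1;

    if (f) {
        if (fscanf(f, "%d", &val) != 1)
            val = -1;
        fclose(f);
    }
    if (val == 1)
        printf("tcp_low_latency=1: tcp_prequeue() returns 0, the prequeue is never used\n");
    else if (val == 0)
        printf("tcp_low_latency=0: the prequeue is used whenever a reader is waiting\n");
    else
        printf("could not read tcp_low_latency\n");
    return 0;
}

Both the overflow branch above (prequeue grown past sk_rcvbuf) and the backlog draining we will see later funnel through sk_backlog_rcv():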
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
    return sk->sk_backlog_rcv(sk, skb);     /* for TCP this callback is tcp_v4_do_rcv() */
}
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
    /*
     * We really want to reject the packet as early as possible
     * if:
     *  o We're expecting an MD5'd packet and this is no MD5 tcp option
     *  o There is an MD5 option and we're not expecting one
     */
    if (tcp_v4_inbound_md5_hash(sk, skb))
        goto discard;
#endif

    if (sk->sk_state == TCP_ESTABLISHED) {  /* Fast path: the connection is already established */
        TCP_CHECK_TIMER(sk);
        if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {     /* the main processing function */
            rsk = sk;
            goto reset;
        }
        TCP_CHECK_TIMER(sk);
        return 0;
    }

    if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
        goto csum_err;

    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
        if (!nsk)
            goto discard;

        if (nsk != sk) {
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    }

    TCP_CHECK_TIMER(sk);
    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    TCP_CHECK_TIMER(sk);
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * but you have been warned.
     */
    return 0;

csum_err:
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
    goto discard;
}
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        struct tcphdr *th, unsigned len)
{
    struct tcp_sock *tp = tcp_sk(sk);
    int res;

    /*
     * Header prediction.
     * The code loosely follows the one in the famous
     * "30 instruction TCP receive" Van Jacobson mail.
     *
     * Van's trick is to deposit buffers into socket queue
     * on a device interrupt, to call tcp_recv function
     * on the receive process context and checksum and copy
     * the buffer to user space. smart...
     *
     * Our current scheme is not silly either but we take the
     * extra cost of the net_bh soft interrupt processing...
     * We do checksum and copy also but from device to kernel.
     */

    tp->rx_opt.saw_tstamp = 0;

    /* pred_flags is 0xS?10 << 16 + snd_wnd
     * if header_prediction is to be made
     * 'S' will always be tp->tcp_header_len >> 2
     * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
     *     turn it off (when there are holes in the receive
     *     space for instance)
     * PSH flag is ignored.
     */

    if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
        TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
        !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
        int tcp_header_len = tp->tcp_header_len;

        /* Timestamp header prediction: tcp_header_len
         * is automatically equal to th->doff*4 due to pred_flags
         * match.
         */

        /* Check timestamp */
        if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
            /* No? Slow path! */
            if (!tcp_parse_aligned_timestamp(tp, th))
                goto slow_path;

            /* If PAWS failed, check it more carefully in slow path */
            if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
                goto slow_path;

            /* DO NOT update ts_recent here, if checksum fails
             * and timestamp was corrupted part, it will result
             * in a hung connection since we will drop all
             * future packets due to the PAWS test.
             */
        }

        if (len <= tcp_header_len) {
            /* Bulk data transfer: sender */
            if (len == tcp_header_len) {
                /* Predicted packet is in window by definition.
                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                 * Hence, check seq<=rcv_wup reduces to:
                 */
                if (tcp_header_len ==
                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                    tp->rcv_nxt == tp->rcv_wup)
                    tcp_store_ts_recent(tp);

                /* We know that such packets are checksummed
                 * on entry.
                 */
                tcp_ack(sk, skb, 0);
                __kfree_skb(skb);
                tcp_data_snd_check(sk);
                return 0;
            } else { /* Header too small */
                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
                goto discard;
            }
        } else {
            int eaten = 0;
            int copied_early = 0;

            if (tp->copied_seq == tp->rcv_nxt &&
                len - tcp_header_len <= tp->ucopy.len) {
#ifdef CONFIG_NET_DMA
                if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
                    copied_early = 1;
                    eaten = 1;
                }
#endif
                if (tp->ucopy.task == current &&
                    sock_owned_by_user(sk) && !copied_early) {
                    __set_current_state(TASK_RUNNING);

                    if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
                        eaten = 1;
                }
                if (eaten) {
                    /* Predicted packet is in window by definition.
                     * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                     * Hence, check seq<=rcv_wup reduces to:
                     */
                    if (tcp_header_len ==
                        (sizeof(struct tcphdr) +
                         TCPOLEN_TSTAMP_ALIGNED) &&
                        tp->rcv_nxt == tp->rcv_wup)
                        tcp_store_ts_recent(tp);

                    tcp_rcv_rtt_measure_ts(sk, skb);

                    __skb_pull(skb, tcp_header_len);
                    tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
                }
                if (copied_early)
                    tcp_cleanup_rbuf(sk, skb->len);
            }
            if (!eaten) {
                if (tcp_checksum_complete_user(sk, skb))
                    goto csum_error;

                /* Predicted packet is in window by definition.
                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                 * Hence, check seq<=rcv_wup reduces to:
                 */
                if (tcp_header_len ==
                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                    tp->rcv_nxt == tp->rcv_wup)
                    tcp_store_ts_recent(tp);

                tcp_rcv_rtt_measure_ts(sk, skb);

                if ((int)skb->truesize > sk->sk_forward_alloc)
                    goto step5;

                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);

                /* Bulk data transfer: receiver */
                __skb_pull(skb, tcp_header_len);
                __skb_queue_tail(&sk->sk_receive_queue, skb);   /* this is where the payload lands on sk_receive_queue,
                                                                 * whether we got here directly from the softirq or via
                                                                 * prequeue/backlog processing */
                skb_set_owner_r(skb, sk);
                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
            }

            tcp_event_data_recv(sk, skb);

            if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
                /* Well, only one small jumplet in fast path... */
                tcp_ack(sk, skb, FLAG_DATA);
                tcp_data_snd_check(sk);
                if (!inet_csk_ack_scheduled(sk))
                    goto no_ack;
            }

            if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
                __tcp_ack_snd_check(sk, 0);
no_ack:
#ifdef CONFIG_NET_DMA
            if (copied_early)
                __skb_queue_tail(&sk->sk_async_wait_queue, skb);
            else
#endif
            if (eaten)
                __kfree_skb(skb);
            else
                sk->sk_data_ready(sk, 0);
            return 0;
        }
    }

slow_path:
    if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
        goto csum_error;

    /*
     * Standard slow path.
     */

    res = tcp_validate_incoming(sk, skb, th, 1);
    if (res <= 0)
        return -res;

step5:
    if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
        goto discard;

    tcp_rcv_rtt_measure_ts(sk, skb);

    /* Process urgent data. */
    tcp_urg(sk, skb, th);

    /* step 7: process the segment text */
    tcp_data_queue(sk, skb);

    tcp_data_snd_check(sk);
    tcp_ack_snd_check(sk);
    return 0;

csum_error:
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);

discard:
    __kfree_skb(skb);
    return 0;
}
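One detail worth spelling out: leaving the NET_DMA early-copy case aside, the fast path copies the payload straight into the reader's iovec ("eaten") only when tp->ucopy.task == current and the socket is owned by the user, i.e. when tcp_rcv_established() is being run on behalf of the reader itself while it drains the prequeue or the backlog in process context. On the plain softirq path the payload goes onto sk_receive_queue instead. A toy model of that condition (my own simplification, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Toy model (my own names) of the fast-path decision above: is the payload
 * "eaten", i.e. copied straight into the waiting reader's iovec, or queued
 * on sk_receive_queue? */
static bool copied_straight_to_user(bool in_order,        /* copied_seq == rcv_nxt    */
                                    bool fits_in_iovec,   /* payload <= tp->ucopy.len */
                                    bool reader_is_owner) /* ucopy.task == current &&
                                                             sock_owned_by_user(sk)   */
{
    return in_order && fits_in_iovec && reader_is_owner;
}

int main(void)
{
    /* reader_is_owner is only true when tcp_rcv_established() runs on behalf
     * of the reader itself, i.e. during prequeue/backlog processing. */
    printf("%d\n", copied_straight_to_user(true, true, true));   /* 1: eaten */
    printf("%d\n", copied_straight_to_user(true, true, false));  /* 0: sk_receive_queue */
    return 0;
}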

2. In process context

The application-level receive call recvmsg() eventually ends up in the kernel's tcp_recvmsg().
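For orientation, the userspace side of this path is nothing special: an ordinary blocking read on a connected TCP socket. The standalone example below is mine (the 127.0.0.1:8000 endpoint is made up for illustration); while the thread sleeps inside recv(), tp->ucopy.task points at it, which is exactly what tcp_prequeue() tested above.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    char buf[4096];
    struct sockaddr_in addr;
    ssize_t n;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8000);                    /* made-up test port */
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("connect");
        return 1;
    }

    /* recv() enters the kernel and, for a TCP socket, ends up in tcp_recvmsg() */
    n = recv(fd, buf, sizeof(buf), 0);
    if (n > 0)
        printf("read %zd bytes\n", n);

    close(fd);
    return 0;
}

Now the kernel side: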

/*
 *  This routine copies from a sock struct into the user buffer.
 *
 *  Technical note: in 2.3 we work on _locked_ socket, so that
 *  tricks with *seq access order and skb->users are not required.
 *  Probably, code can be easily improved even more.
 */

int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t len, int nonblock, int flags, int *addr_len)
{
    struct tcp_sock *tp = tcp_sk(sk);
    int copied = 0;
    u32 peek_seq;
    u32 *seq;
    unsigned long used;
    int err;
    int target;     /* Read at least this many bytes */
    long timeo;
    struct task_struct *user_recv = NULL;
    int copied_early = 0;
    struct sk_buff *skb;
    u32 urg_hole = 0;

    lock_sock(sk);

    TCP_CHECK_TIMER(sk);

    err = -ENOTCONN;
    if (sk->sk_state == TCP_LISTEN)
        goto out;

    timeo = sock_rcvtimeo(sk, nonblock);

    /* Urgent data needs to be handled specially. */
    if (flags & MSG_OOB)
        goto recv_urg;

    seq = &tp->copied_seq;
    if (flags & MSG_PEEK) {
        peek_seq = tp->copied_seq;
        seq = &peek_seq;
    }

    target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

#ifdef CONFIG_NET_DMA
    tp->ucopy.dma_chan = NULL;
    preempt_disable();
    skb = skb_peek_tail(&sk->sk_receive_queue);
    {
        int available = 0;

        if (skb)
            available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
        if ((available < target) &&
            (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
            !sysctl_tcp_low_latency &&
            dma_find_channel(DMA_MEMCPY)) {
            preempt_enable_no_resched();
            tp->ucopy.pinned_list =
                    dma_pin_iovec_pages(msg->msg_iov, len);
        } else {
            preempt_enable_no_resched();
        }
    }
#endif

    do {
        u32 offset;

        /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
        if (tp->urg_data && tp->urg_seq == *seq) {
            if (copied)
                break;
            if (signal_pending(current)) {
                copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
                break;
            }
        }

        /* Next get a buffer. */

        skb_queue_walk(&sk->sk_receive_queue, skb) {    /* walk sk_receive_queue for the next segment to read */
            /* Now that we have two receive queues this
             * shouldn't happen.
             */
            if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
                     KERN_INFO "recvmsg bug: copied %X "
                               "seq %X rcvnxt %X fl %X\n", *seq,
                               TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
                               flags))
                break;

            offset = *seq - TCP_SKB_CB(skb)->seq;
            if (tcp_hdr(skb)->syn)
                offset--;
            if (offset < skb->len)
                goto found_ok_skb;
            if (tcp_hdr(skb)->fin)
                goto found_fin_ok;
            WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
                    "copied %X seq %X rcvnxt %X fl %X\n",
                    *seq, TCP_SKB_CB(skb)->seq,
                    tp->rcv_nxt, flags);
        }

        /* Well, if we have backlog, try to process it now yet. */

        if (copied >= target && !sk->sk_backlog.tail)   /* we copied enough AND the backlog is empty */
            break;      /* if the backlog is not empty we keep looping, so that
                         * release_sock() further down gets a chance to drain it */

        if (copied) {
            if (sk->sk_err ||
                sk->sk_state == TCP_CLOSE ||
                (sk->sk_shutdown & RCV_SHUTDOWN) ||
                !timeo ||
                signal_pending(current))
                break;
        } else {
            if (sock_flag(sk, SOCK_DONE))
                break;

            if (sk->sk_err) {
                copied = sock_error(sk);
                break;
            }

            if (sk->sk_shutdown & RCV_SHUTDOWN)
                break;

            if (sk->sk_state == TCP_CLOSE) {
                if (!sock_flag(sk, SOCK_DONE)) {
                    /* This occurs when user tries to read
                     * from never connected socket.
                     */
                    copied = -ENOTCONN;
                    break;
                }
                break;
            }

            if (!timeo) {
                copied = -EAGAIN;
                break;
            }

            if (signal_pending(current)) {
                copied = sock_intr_errno(timeo);
                break;
            }
        }

        tcp_cleanup_rbuf(sk, copied);

        if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
            /* Install new reader */
            if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
                user_recv = current;
                tp->ucopy.task = user_recv;
                tp->ucopy.iov = msg->msg_iov;
            }

            tp->ucopy.len = len;

            WARN_ON(tp->copied_seq != tp->rcv_nxt &&
                    !(flags & (MSG_PEEK | MSG_TRUNC)));

            /* Ugly... If prequeue is not empty, we have to
             * process it before releasing socket, otherwise
             * order will be broken at second iteration.
             * More elegant solution is required!!!
             *
             * Look: we have the following (pseudo)queues:
             *
             * 1. packets in flight
             * 2. backlog
             * 3. prequeue
             * 4. receive_queue
             *
             * Each queue can be processed only if the next ones
             * are empty. At this point we have empty receive_queue.
             * But prequeue _can_ be not empty after 2nd iteration,
             * when we jumped to start of loop because backlog
             * processing added something to receive_queue.
             * We cannot release_sock(), because backlog contains
             * packets arrived _after_ prequeued ones.
             *
             * Shortly, algorithm is clear --- to process all
             * the queues in order. We could make it more directly,
             * requeueing packets from backlog to prequeue, if
             * is not empty. It is more elegant, but eats cycles,
             * unfortunately.
             */
            if (!skb_queue_empty(&tp->ucopy.prequeue))
                goto do_prequeue;       /* drain the prequeue */

            /* __ Set realtime policy in scheduler __ */
        }

#ifdef CONFIG_NET_DMA
        if (tp->ucopy.dma_chan)
            dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
#endif
        if (copied >= target) {
            /* Do not sleep, just process backlog. */
            release_sock(sk);
            lock_sock(sk);
        } else
            sk_wait_data(sk, &timeo);

#ifdef CONFIG_NET_DMA
        tcp_service_net_dma(sk, false);  /* Don't block */
        tp->ucopy.wakeup = 0;
#endif

        if (user_recv) {
            int chunk;

            /* __ Restore normal policy in scheduler __ */

            if ((chunk = len - tp->ucopy.len) != 0) {
                NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
                len -= chunk;
                copied += chunk;
            }

            if (tp->rcv_nxt == tp->copied_seq &&
                !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
                tcp_prequeue_process(sk);       /* this is what empties the prequeue */

                if ((chunk = len - tp->ucopy.len) != 0) {
                    NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
                    len -= chunk;
                    copied += chunk;
                }
            }
        }
        if ((flags & MSG_PEEK) &&
            (peek_seq - copied - urg_hole != tp->copied_seq)) {
            if (net_ratelimit())
                printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
                       current->comm, task_pid_nr(current));
            peek_seq = tp->copied_seq;
        }
        continue;

    found_ok_skb:
        /* Ok so how much can we use? */
        used = skb->len - offset;
        if (len < used)
            used = len;

        /* Do we have urgent data here? */
        if (tp->urg_data) {
            u32 urg_offset = tp->urg_seq - *seq;
            if (urg_offset < used) {
                if (!urg_offset) {
                    if (!sock_flag(sk, SOCK_URGINLINE)) {
                        ++*seq;
                        urg_hole++;
                        offset++;
                        used--;
                        if (!used)
                            goto skip_copy;
                    }
                } else
                    used = urg_offset;
            }
        }

        if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
            if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);

            if (tp->ucopy.dma_chan) {
                tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
                        tp->ucopy.dma_chan, skb, offset,
                        msg->msg_iov, used,
                        tp->ucopy.pinned_list);

                if (tp->ucopy.dma_cookie < 0) {

                    printk(KERN_ALERT "dma_cookie < 0\n");

                    /* Exception. Bailout! */
                    if (!copied)
                        copied = -EFAULT;
                    break;
                }

                dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);

                if ((offset + used) == skb->len)
                    copied_early = 1;

            } else
#endif
            {
                err = skb_copy_datagram_iovec(skb, offset,
                                              msg->msg_iov, used);
                if (err) {
                    /* Exception. Bailout! */
                    if (!copied)
                        copied = -EFAULT;
                    break;
                }
            }
        }

        *seq += used;
        copied += used;
        len -= used;

        tcp_rcv_space_adjust(sk);

skip_copy:
        if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
            tp->urg_data = 0;
            tcp_fast_path_check(sk);
        }
        if (used + offset < skb->len)
            continue;

        if (tcp_hdr(skb)->fin)
            goto found_fin_ok;
        if (!(flags & MSG_PEEK)) {
            sk_eat_skb(sk, skb, copied_early);
            copied_early = 0;
        }
        continue;

    found_fin_ok:
        /* Process the FIN. */
        ++*seq;
        if (!(flags & MSG_PEEK)) {
            sk_eat_skb(sk, skb, copied_early);
            copied_early = 0;
        }
        break;
    } while (len > 0);

    if (user_recv) {
        if (!skb_queue_empty(&tp->ucopy.prequeue)) {
            int chunk;

            tp->ucopy.len = copied > 0 ? len : 0;

            tcp_prequeue_process(sk);

            if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
                NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
                len -= chunk;
                copied += chunk;
            }
        }

        tp->ucopy.task = NULL;
        tp->ucopy.len = 0;
    }

#ifdef CONFIG_NET_DMA
    tcp_service_net_dma(sk, true);  /* Wait for queue to drain */
    tp->ucopy.dma_chan = NULL;

    if (tp->ucopy.pinned_list) {
        dma_unpin_iovec_pages(tp->ucopy.pinned_list);
        tp->ucopy.pinned_list = NULL;
    }
#endif

    /* According to UNIX98, msg_name/msg_namelen are ignored
     * on connected socket. I was just happy when found this 8) --ANK
     */

    /* Clean up data we have read: This will do ACK frames. */
    tcp_cleanup_rbuf(sk, copied);

    TCP_CHECK_TIMER(sk);
    release_sock(sk);
    return copied;

out:
    TCP_CHECK_TIMER(sk);
    release_sock(sk);   /* release_sock() is also what drains the backlog */
    return err;

recv_urg:
    err = tcp_recv_urg(sk, msg, len, flags);
    goto out;
}
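One userspace knob that feeds straight into this function is SO_RCVLOWAT: sock_rcvlowat() turns it into the `target` above, the "read at least this many bytes" threshold that decides when the loop may stop. A small sketch using only the standard socket API (the 4 KB value is just an example):

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int lowat = 4096, got = 0;
    socklen_t optlen = sizeof(got);

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    /* SO_RCVLOWAT is what sock_rcvlowat() turns into `target`: a blocking
     * recv() will try to wait until at least this many bytes are available
     * (subject to EOF, signals and timeouts). */
    if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat)) < 0)
        perror("setsockopt(SO_RCVLOWAT)");
    if (getsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &got, &optlen) == 0)
        printf("SO_RCVLOWAT is now %d\n", got);

    close(fd);
    return 0;
}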

Now let's trace tcp_prequeue_process():

static void tcp_prequeue_process(struct sock *sk)
{
    struct sk_buff *skb;
    struct tcp_sock *tp = tcp_sk(sk);

    NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);

    /* RX process wants to run with disabled BHs, though it is not
     * necessary */
    local_bh_disable();
    while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)  /* walk the prequeue */
        sk_backlog_rcv(sk, skb);    /* process each packet; its data is merged into sk_receive_queue
                                     * (or copied straight to the reader's iovec) */
    local_bh_enable();

    /* Clear memory counter. */
    tp->ucopy.memory = 0;
}

Finally, let's look at release_sock():

void release_sock(struct sock *sk)
{
    /*
     * The sk_lock has mutex_unlock() semantics:
     */
    mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

    spin_lock_bh(&sk->sk_lock.slock);
    if (sk->sk_backlog.tail)    /* the backlog is not empty */
        __release_sock(sk);     /* this is where it actually gets drained */
    sk->sk_lock.owned = 0;
    if (waitqueue_active(&sk->sk_lock.wq))
        wake_up(&sk->sk_lock.wq);
    spin_unlock_bh(&sk->sk_lock.slock);
}
static void __release_sock(struct sock *sk)
{
    struct sk_buff *skb = sk->sk_backlog.head;  /* remember the current head of the backlog */

    do {
        sk->sk_backlog.head = sk->sk_backlog.tail = NULL;   /* detach the list: head and tail become empty */
        bh_unlock_sock(sk);     /* drop the socket spinlock (relevant on SMP) */

        do {
            struct sk_buff *next = skb->next;

            skb->next = NULL;
            sk_backlog_rcv(sk, skb);    /* calls tcp_v4_do_rcv() -> tcp_rcv_established(),
                                         * which merges the backlog packet's data into
                                         * sk_receive_queue */

            /*
             * We are in process context here with softirqs
             * disabled, use cond_resched_softirq() to preempt.
             * This is safe to do because we've taken the backlog
             * queue private:
             */
            cond_resched_softirq();

            skb = next;                 /* move on to the next packet */
        } while (skb != NULL);          /* inner loop: walk the detached list */

        bh_lock_sock(sk);
    } while ((skb = sk->sk_backlog.head) != NULL);  /* this outer loop took me a while to understand */

    /* How to read the outer loop: __release_sock() runs in process context,
     * while sk_add_backlog() runs in softirq context, and softirqs win.
     * So while __release_sock() is busy processing packets, new packets may
     * keep being appended to the backlog, leaving sk->sk_backlog.head
     * non-NULL again. Only when nothing new has arrived does the outer loop
     * finally stop.
     */

    /*
     * Doing the zeroing here guarantee we can not loop forever
     * while a wild producer attempts to flood us.
     */
    sk->sk_backlog.len = 0;
}
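To convince myself that the outer loop really terminates the way the comments describe, I wrote a tiny single-threaded model of the "detach the list, drain it, look again" pattern. It is purely my own illustration; in the kernel the producer is the softirq calling sk_add_backlog():

#include <stdio.h>
#include <stdlib.h>

/* Toy backlog: a singly linked list with head/tail, as in sk->sk_backlog. */
struct pkt { int id; struct pkt *next; };
static struct pkt *head, *tail;

static void backlog_add(int id)             /* stands in for sk_add_backlog() */
{
    struct pkt *p = malloc(sizeof(*p));
    p->id = id;
    p->next = NULL;
    if (tail)
        tail->next = p;
    else
        head = p;
    tail = p;
}

int main(void)
{
    int produced = 0, rounds = 0;

    backlog_add(produced++);
    backlog_add(produced++);

    /* The __release_sock() pattern: detach the whole list, process it,
     * then look again -- new packets may have arrived in the meantime. */
    do {
        struct pkt *p = head;

        head = tail = NULL;             /* take the queue private */
        rounds++;
        while (p) {
            struct pkt *next = p->next;
            printf("round %d: processing pkt %d\n", rounds, p->id);
            free(p);
            /* simulate the softirq sneaking in more packets
             * while we were busy processing */
            if (produced < 5)
                backlog_add(produced++);
            p = next;
        }
    } while (head != NULL);             /* stop only when nothing new arrived */

    return 0;
}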