I never quite understood how the three queues involved in TCP receive processing (the prequeue, the backlog, and sk_receive_queue) relate to each other.

Over the past few days, with the help of my colleague Qiu, I finally sorted it out, so I'm writing it down here as a note.

1. Adding packets to the queues in softirq context

tcp_v4_rcv() is the entry point for packet reception at the TCP layer.

int tcp_v4_rcv(struct sk_buff *skb)
{
    const struct iphdr *iph;
    struct tcphdr *th;
    struct sock *sk;
    int ret;
    struct net *net = dev_net(skb->dev);

    if (skb->pkt_type != PACKET_HOST)
        goto discard_it;

    /* Count it even if it's bad */
    TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

    if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
        goto discard_it;

    th = tcp_hdr(skb);

    if (th->doff < sizeof(struct tcphdr) / 4)
        goto bad_packet;
    if (!pskb_may_pull(skb, th->doff * 4))
        goto discard_it;

    /* An explanation is required here, I think.
     * Packet length and doff are validated by header prediction,
     * provided case of th->doff==0 is eliminated.
     * So, we defer the checks. */
    if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
        goto bad_packet;

    th = tcp_hdr(skb);
    iph = ip_hdr(skb);
    TCP_SKB_CB(skb)->seq = ntohl(th->seq);
    TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                skb->len - th->doff * 4);
    TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
    TCP_SKB_CB(skb)->when = 0;
    TCP_SKB_CB(skb)->flags = iph->tos;
    TCP_SKB_CB(skb)->sacked = 0;

    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
    if (!sk)
        goto no_tcp_socket;

process:
    if (sk->sk_state == TCP_TIME_WAIT)
        goto do_time_wait;

    if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
        NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
        goto discard_and_relse;
    }

    if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
        goto discard_and_relse;
    nf_reset(skb);

    if (sk_filter(sk, skb))
        goto discard_and_relse;

    skb->dev = NULL;

    bh_lock_sock_nested(sk);
    ret = 0;
    if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
        struct tcp_sock *tp = tcp_sk(sk);
        if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
            tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
        if (tp->ucopy.dma_chan)
            ret = tcp_v4_do_rcv(sk, skb);
        else
#endif
        {
            if (!tcp_prequeue(sk, skb))         /* first try to put the skb on the prequeue */
                ret = tcp_v4_do_rcv(sk, skb);   /* otherwise process it now; the data ends up on sk_receive_queue */
        }
    } else if (unlikely(sk_add_backlog(sk, skb))) {     /* socket owned by a process: add to the backlog */
        bh_unlock_sock(sk);
        NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
        goto discard_and_relse;
    }
    bh_unlock_sock(sk);

    sock_put(sk);

    return ret;

no_tcp_socket:
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
        goto discard_it;

    if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
        TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
    } else {
        tcp_v4_send_reset(NULL, skb);
    }

discard_it:
    /* Discard frame. */
    kfree_skb(skb);
    return 0;

discard_and_relse:
    sock_put(sk);
    goto discard_it;

do_time_wait:
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
        inet_twsk_put(inet_twsk(sk));
        goto discard_it;
    }

    if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
        TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
        inet_twsk_put(inet_twsk(sk));
        goto discard_it;
    }
    switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
    case TCP_TW_SYN: {
        struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                &tcp_hashinfo,
                                                iph->daddr, th->dest,
                                                inet_iif(skb));
        if (sk2) {
            inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
            inet_twsk_put(inet_twsk(sk));
            sk = sk2;
            goto process;
        }
        /* Fall through to ACK */
    }
    case TCP_TW_ACK:
        tcp_v4_timewait_ack(sk, skb);
        break;
    case TCP_TW_RST:
        goto no_tcp_socket;
    case TCP_TW_SUCCESS:;
    }
    goto discard_it;
}
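The part of this function that matters for the three queues is the dispatch near the bottom: prequeue, immediate processing (which normally ends with the data on sk_receive_queue), or backlog. To keep the decision straight I wrote a tiny self-contained toy model of it; the names and the program are mine, not kernel code:

#include <stdbool.h>
#include <stdio.h>

enum rx_queue { RX_PREQUEUE, RX_RECEIVE_QUEUE, RX_BACKLOG };

/* socket_owned:   a process currently owns the socket lock (it is inside
 *                 tcp_recvmsg() or some other socket call).
 * reader_waiting: tp->ucopy.task is set, i.e. a reader sleeps in tcp_recvmsg().
 * low_latency:    the net.ipv4.tcp_low_latency sysctl is enabled. */
static enum rx_queue dispatch(bool socket_owned, bool reader_waiting,
                              bool low_latency)
{
    if (socket_owned)
        return RX_BACKLOG;          /* sk_add_backlog() */
    if (!low_latency && reader_waiting)
        return RX_PREQUEUE;         /* tcp_prequeue() accepted the skb */
    return RX_RECEIVE_QUEUE;        /* tcp_v4_do_rcv() processes it right away */
}

int main(void)
{
    printf("%d\n", dispatch(true,  false, false));  /* -> backlog */
    printf("%d\n", dispatch(false, true,  false));  /* -> prequeue */
    printf("%d\n", dispatch(false, false, false));  /* -> sk_receive_queue */
    return 0;
}

So, roughly: nobody owns the socket and a reader is waiting (with tcp_low_latency off) → prequeue; nobody owns it but no reader is waiting → processed immediately, landing on sk_receive_queue; a process owns the socket → backlog.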

Let's trace into tcp_prequeue():

static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (sysctl_tcp_low_latency || !tp->ucopy.task)
        return 0;

    __skb_queue_tail(&tp->ucopy.prequeue, skb);
    tp->ucopy.memory += skb->truesize;
    if (tp->ucopy.memory > sk->sk_rcvbuf) {     /* the prequeue has outgrown the receive buffer */
        struct sk_buff *skb1;

        BUG_ON(sock_owned_by_user(sk));

        while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {   /* walk the prequeue */
            sk_backlog_rcv(sk, skb1);   /* process each packet, i.e. move its data to sk_receive_queue */
            NET_INC_STATS_BH(sock_net(sk),
                             LINUX_MIB_TCPPREQUEUEDROPPED);
        }

        tp->ucopy.memory = 0;
    } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
        wake_up_interruptible_sync_poll(sk->sk_sleep,
                                        POLLIN | POLLRDNORM | POLLRDBAND);
        if (!inet_csk_ack_scheduled(sk))
            inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                      (3 * tcp_rto_min(sk)) / 4,
                                      TCP_RTO_MAX);
    }
    return 1;
}
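Note the early return 0: when net.ipv4.tcp_low_latency is set, or when no reader is currently blocked in tcp_recvmsg() (tp->ucopy.task is NULL), the prequeue is skipped entirely. A quick userspace check of that sysctl (my own helper; it assumes the usual /proc/sys/net/ipv4/tcp_low_latency path of kernels of this era):

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/sys/net/ipv4/tcp_low_latency", "r");
    int val = -1;

    if (f) {
        if (fscanf(f, "%d", &val) != 1)
            val = -1;
        fclose(f);
    }
    if (val == 1)
        printf("tcp_low_latency=1: tcp_prequeue() returns 0, the prequeue is never used\n");
    else if (val == 0)
        printf("tcp_low_latency=0: the prequeue is used whenever a reader is waiting\n");
    else
        printf("could not read tcp_low_latency\n");
    return 0;
}

Both the overflow branch above (prequeue grown past sk_rcvbuf) and the backlog draining we will see later funnel through sk_backlog_rcv():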
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
    return sk->sk_backlog_rcv(sk, skb);     /* for TCP this callback is tcp_v4_do_rcv() */
}
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
    /*
     * We really want to reject the packet as early as possible
     * if:
     *  o We're expecting an MD5'd packet and this is no MD5 tcp option
     *  o There is an MD5 option and we're not expecting one
     */
    if (tcp_v4_inbound_md5_hash(sk, skb))
        goto discard;
#endif

    if (sk->sk_state == TCP_ESTABLISHED) {  /* Fast path: the connection is already established */
        TCP_CHECK_TIMER(sk);
        if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {     /* the main processing function */
            rsk = sk;
            goto reset;
        }
        TCP_CHECK_TIMER(sk);
        return 0;
    }

    if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
        goto csum_err;

    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
        if (!nsk)
            goto discard;

        if (nsk != sk) {
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    }

    TCP_CHECK_TIMER(sk);
    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    TCP_CHECK_TIMER(sk);
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * but you have been warned.
     */
    return 0;

csum_err:
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
    goto discard;
}
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        struct tcphdr *th, unsigned len)
{
    struct tcp_sock *tp = tcp_sk(sk);
    int res;

    /*
     * Header prediction.
     * The code loosely follows the one in the famous
     * "30 instruction TCP receive" Van Jacobson mail.
     *
     * Van's trick is to deposit buffers into socket queue
     * on a device interrupt, to call tcp_recv function
     * on the receive process context and checksum and copy
     * the buffer to user space. smart...
     *
     * Our current scheme is not silly either but we take the
     * extra cost of the net_bh soft interrupt processing...
     * We do checksum and copy also but from device to kernel.
     */

    tp->rx_opt.saw_tstamp = 0;

    /* pred_flags is 0xS?10 << 16 + snd_wnd
     * if header_prediction is to be made
     * 'S' will always be tp->tcp_header_len >> 2
     * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
     *     turn it off (when there are holes in the receive
     *     space for instance)
     * PSH flag is ignored.
     */

    if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
        TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
        !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
        int tcp_header_len = tp->tcp_header_len;

        /* Timestamp header prediction: tcp_header_len
         * is automatically equal to th->doff*4 due to pred_flags
         * match.
         */

        /* Check timestamp */
        if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
            /* No? Slow path! */
            if (!tcp_parse_aligned_timestamp(tp, th))
                goto slow_path;

            /* If PAWS failed, check it more carefully in slow path */
            if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
                goto slow_path;

            /* DO NOT update ts_recent here, if checksum fails
             * and timestamp was corrupted part, it will result
             * in a hung connection since we will drop all
             * future packets due to the PAWS test.
             */
        }

        if (len <= tcp_header_len) {
            /* Bulk data transfer: sender */
            if (len == tcp_header_len) {
                /* Predicted packet is in window by definition.
                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                 * Hence, check seq<=rcv_wup reduces to:
                 */
                if (tcp_header_len ==
                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                    tp->rcv_nxt == tp->rcv_wup)
                    tcp_store_ts_recent(tp);

                /* We know that such packets are checksummed
                 * on entry.
                 */
                tcp_ack(sk, skb, 0);
                __kfree_skb(skb);
                tcp_data_snd_check(sk);
                return 0;
            } else { /* Header too small */
                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
                goto discard;
            }
        } else {
            int eaten = 0;
            int copied_early = 0;

            if (tp->copied_seq == tp->rcv_nxt &&
                len - tcp_header_len <= tp->ucopy.len) {
#ifdef CONFIG_NET_DMA
                if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
                    copied_early = 1;
                    eaten = 1;
                }
#endif
                if (tp->ucopy.task == current &&
                    sock_owned_by_user(sk) && !copied_early) {
                    __set_current_state(TASK_RUNNING);

                    if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
                        eaten = 1;
                }
                if (eaten) {
                    /* Predicted packet is in window by definition.
                     * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                     * Hence, check seq<=rcv_wup reduces to:
                     */
                    if (tcp_header_len ==
                        (sizeof(struct tcphdr) +
                         TCPOLEN_TSTAMP_ALIGNED) &&
                        tp->rcv_nxt == tp->rcv_wup)
                        tcp_store_ts_recent(tp);

                    tcp_rcv_rtt_measure_ts(sk, skb);

                    __skb_pull(skb, tcp_header_len);
                    tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
                }
                if (copied_early)
                    tcp_cleanup_rbuf(sk, skb->len);
            }
            if (!eaten) {
                if (tcp_checksum_complete_user(sk, skb))
                    goto csum_error;

                /* Predicted packet is in window by definition.
                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                 * Hence, check seq<=rcv_wup reduces to:
                 */
                if (tcp_header_len ==
                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                    tp->rcv_nxt == tp->rcv_wup)
                    tcp_store_ts_recent(tp);

                tcp_rcv_rtt_measure_ts(sk, skb);

                if ((int)skb->truesize > sk->sk_forward_alloc)
                    goto step5;

                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);

                /* Bulk data transfer: receiver */
                __skb_pull(skb, tcp_header_len);
                __skb_queue_tail(&sk->sk_receive_queue, skb);   /* this is where the payload lands on sk_receive_queue,
                                                                 * whether we got here directly from the softirq or via
                                                                 * prequeue/backlog processing */
                skb_set_owner_r(skb, sk);
                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
            }

            tcp_event_data_recv(sk, skb);

            if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
                /* Well, only one small jumplet in fast path... */
                tcp_ack(sk, skb, FLAG_DATA);
                tcp_data_snd_check(sk);
                if (!inet_csk_ack_scheduled(sk))
                    goto no_ack;
            }

            if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
                __tcp_ack_snd_check(sk, 0);
no_ack:
#ifdef CONFIG_NET_DMA
            if (copied_early)
                __skb_queue_tail(&sk->sk_async_wait_queue, skb);
            else
#endif
            if (eaten)
                __kfree_skb(skb);
            else
                sk->sk_data_ready(sk, 0);
            return 0;
        }
    }

slow_path:
    if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
        goto csum_error;

    /*
     * Standard slow path.
     */

    res = tcp_validate_incoming(sk, skb, th, 1);
    if (res <= 0)
        return -res;

step5:
    if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
        goto discard;

    tcp_rcv_rtt_measure_ts(sk, skb);

    /* Process urgent data. */
    tcp_urg(sk, skb, th);

    /* step 7: process the segment text */
    tcp_data_queue(sk, skb);

    tcp_data_snd_check(sk);
    tcp_ack_snd_check(sk);
    return 0;

csum_error:
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);

discard:
    __kfree_skb(skb);
    return 0;
}
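One detail worth spelling out: leaving the NET_DMA early-copy case aside, the fast path copies the payload straight into the reader's iovec ("eaten") only when tp->ucopy.task == current and the socket is owned by the user, i.e. when tcp_rcv_established() is being run on behalf of the reader itself while it drains the prequeue or the backlog in process context. On the plain softirq path the payload goes onto sk_receive_queue instead. A toy model of that condition (my own simplification, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Toy model (my own names) of the fast-path decision above: is the payload
 * "eaten", i.e. copied straight into the waiting reader's iovec, or queued
 * on sk_receive_queue? */
static bool copied_straight_to_user(bool in_order,        /* copied_seq == rcv_nxt    */
                                    bool fits_in_iovec,   /* payload <= tp->ucopy.len */
                                    bool reader_is_owner) /* ucopy.task == current &&
                                                             sock_owned_by_user(sk)   */
{
    return in_order && fits_in_iovec && reader_is_owner;
}

int main(void)
{
    /* reader_is_owner is only true when tcp_rcv_established() runs on behalf
     * of the reader itself, i.e. during prequeue/backlog processing. */
    printf("%d\n", copied_straight_to_user(true, true, true));   /* 1: eaten */
    printf("%d\n", copied_straight_to_user(true, true, false));  /* 0: sk_receive_queue */
    return 0;
}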

2. In process context

The application-level receive call recvmsg() eventually ends up in the kernel's tcp_recvmsg().
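For orientation, the userspace side of this path is nothing special: an ordinary blocking read on a connected TCP socket. The standalone example below is mine (the 127.0.0.1:8000 endpoint is made up for illustration); while the thread sleeps inside recv(), tp->ucopy.task points at it, which is exactly what tcp_prequeue() tested above.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    char buf[4096];
    struct sockaddr_in addr;
    ssize_t n;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8000);                    /* made-up test port */
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("connect");
        return 1;
    }

    /* recv() enters the kernel and, for a TCP socket, ends up in tcp_recvmsg() */
    n = recv(fd, buf, sizeof(buf), 0);
    if (n > 0)
        printf("read %zd bytes\n", n);

    close(fd);
    return 0;
}

Now the kernel side: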

/*
 *  This routine copies from a sock struct into the user buffer.
 *
 *  Technical note: in 2.3 we work on _locked_ socket, so that
 *  tricks with *seq access order and skb->users are not required.
 *  Probably, code can be easily improved even more.
 */

int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t len, int nonblock, int flags, int *addr_len)
{
    struct tcp_sock *tp = tcp_sk(sk);
    int copied = 0;
    u32 peek_seq;
    u32 *seq;
    unsigned long used;
    int err;
    int target;     /* Read at least this many bytes */
    long timeo;
    struct task_struct *user_recv = NULL;
    int copied_early = 0;
    struct sk_buff *skb;
    u32 urg_hole = 0;

    lock_sock(sk);

    TCP_CHECK_TIMER(sk);

    err = -ENOTCONN;
    if (sk->sk_state == TCP_LISTEN)
        goto out;

    timeo = sock_rcvtimeo(sk, nonblock);

    /* Urgent data needs to be handled specially. */
    if (flags & MSG_OOB)
        goto recv_urg;

    seq = &tp->copied_seq;
    if (flags & MSG_PEEK) {
        peek_seq = tp->copied_seq;
        seq = &peek_seq;
    }

    target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

#ifdef CONFIG_NET_DMA
    tp->ucopy.dma_chan = NULL;
    preempt_disable();
    skb = skb_peek_tail(&sk->sk_receive_queue);
    {
        int available = 0;

        if (skb)
            available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
        if ((available < target) &&
            (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
            !sysctl_tcp_low_latency &&
            dma_find_channel(DMA_MEMCPY)) {
            preempt_enable_no_resched();
            tp->ucopy.pinned_list =
                    dma_pin_iovec_pages(msg->msg_iov, len);
        } else {
            preempt_enable_no_resched();
        }
    }
#endif

    do {
        u32 offset;

        /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
        if (tp->urg_data && tp->urg_seq == *seq) {
            if (copied)
                break;
            if (signal_pending(current)) {
                copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
                break;
            }
        }

        /* Next get a buffer. */

        skb_queue_walk(&sk->sk_receive_queue, skb) {    /* walk sk_receive_queue for the next segment to read */
            /* Now that we have two receive queues this
             * shouldn't happen.
             */
            if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
                     KERN_INFO "recvmsg bug: copied %X "
                               "seq %X rcvnxt %X fl %X\n", *seq,
                               TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
                               flags))
                break;

            offset = *seq - TCP_SKB_CB(skb)->seq;
            if (tcp_hdr(skb)->syn)
                offset--;
            if (offset < skb->len)
                goto found_ok_skb;
            if (tcp_hdr(skb)->fin)
                goto found_fin_ok;
            WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
                    "copied %X seq %X rcvnxt %X fl %X\n",
                    *seq, TCP_SKB_CB(skb)->seq,
                    tp->rcv_nxt, flags);
        }

        /* Well, if we have backlog, try to process it now yet. */

        if (copied >= target && !sk->sk_backlog.tail)   /* we copied enough AND the backlog is empty */
            break;      /* if the backlog is not empty we keep looping, so that
                         * release_sock() further down gets a chance to drain it */

        if (copied) {
            if (sk->sk_err ||
                sk->sk_state == TCP_CLOSE ||
                (sk->sk_shutdown & RCV_SHUTDOWN) ||
                !timeo ||
                signal_pending(current))
                break;
        } else {
            if (sock_flag(sk, SOCK_DONE))
                break;

            if (sk->sk_err) {
                copied = sock_error(sk);
                break;
            }

            if (sk->sk_shutdown & RCV_SHUTDOWN)
                break;

            if (sk->sk_state == TCP_CLOSE) {
                if (!sock_flag(sk, SOCK_DONE)) {
                    /* This occurs when user tries to read
                     * from never connected socket.
                     */
                    copied = -ENOTCONN;
                    break;
                }
                break;
            }

            if (!timeo) {
                copied = -EAGAIN;
                break;
            }

            if (signal_pending(current)) {
                copied = sock_intr_errno(timeo);
                break;
            }
        }

        tcp_cleanup_rbuf(sk, copied);

        if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
            /* Install new reader */
            if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
                user_recv = current;
                tp->ucopy.task = user_recv;
                tp->ucopy.iov = msg->msg_iov;
            }

            tp->ucopy.len = len;

            WARN_ON(tp->copied_seq != tp->rcv_nxt &&
                    !(flags & (MSG_PEEK | MSG_TRUNC)));

            /* Ugly... If prequeue is not empty, we have to
             * process it before releasing socket, otherwise
             * order will be broken at second iteration.
             * More elegant solution is required!!!
             *
             * Look: we have the following (pseudo)queues:
             *
             * 1. packets in flight
             * 2. backlog
             * 3. prequeue
             * 4. receive_queue
             *
             * Each queue can be processed only if the next ones
             * are empty. At this point we have empty receive_queue.
             * But prequeue _can_ be not empty after 2nd iteration,
             * when we jumped to start of loop because backlog
             * processing added something to receive_queue.
             * We cannot release_sock(), because backlog contains
             * packets arrived _after_ prequeued ones.
             *
             * Shortly, algorithm is clear --- to process all
             * the queues in order. We could make it more directly,
             * requeueing packets from backlog to prequeue, if
             * is not empty. It is more elegant, but eats cycles,
             * unfortunately.
             */
            if (!skb_queue_empty(&tp->ucopy.prequeue))
                goto do_prequeue;       /* drain the prequeue */

            /* __ Set realtime policy in scheduler __ */
        }

#ifdef CONFIG_NET_DMA
        if (tp->ucopy.dma_chan)
            dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
#endif
        if (copied >= target) {
            /* Do not sleep, just process backlog. */
            release_sock(sk);
            lock_sock(sk);
        } else
            sk_wait_data(sk, &timeo);

#ifdef CONFIG_NET_DMA
        tcp_service_net_dma(sk, false);  /* Don't block */
        tp->ucopy.wakeup = 0;
#endif

        if (user_recv) {
            int chunk;

            /* __ Restore normal policy in scheduler __ */

            if ((chunk = len - tp->ucopy.len) != 0) {
                NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
                len -= chunk;
                copied += chunk;
            }

            if (tp->rcv_nxt == tp->copied_seq &&
                !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
                tcp_prequeue_process(sk);       /* this is what empties the prequeue */

                if ((chunk = len - tp->ucopy.len) != 0) {
                    NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
                    len -= chunk;
                    copied += chunk;
                }
            }
        }
        if ((flags & MSG_PEEK) &&
            (peek_seq - copied - urg_hole != tp->copied_seq)) {
            if (net_ratelimit())
                printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
                       current->comm, task_pid_nr(current));
            peek_seq = tp->copied_seq;
        }
        continue;

    found_ok_skb:
        /* Ok so how much can we use? */
        used = skb->len - offset;
        if (len < used)
            used = len;

        /* Do we have urgent data here? */
        if (tp->urg_data) {
            u32 urg_offset = tp->urg_seq - *seq;
            if (urg_offset < used) {
                if (!urg_offset) {
                    if (!sock_flag(sk, SOCK_URGINLINE)) {
                        ++*seq;
                        urg_hole++;
                        offset++;
                        used--;
                        if (!used)
                            goto skip_copy;
                    }
                } else
                    used = urg_offset;
            }
        }

        if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
            if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);

            if (tp->ucopy.dma_chan) {
                tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
                        tp->ucopy.dma_chan, skb, offset,
                        msg->msg_iov, used,
                        tp->ucopy.pinned_list);

                if (tp->ucopy.dma_cookie < 0) {

                    printk(KERN_ALERT "dma_cookie < 0\n");

                    /* Exception. Bailout! */
                    if (!copied)
                        copied = -EFAULT;
                    break;
                }

                dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);

                if ((offset + used) == skb->len)
                    copied_early = 1;

            } else
#endif
            {
                err = skb_copy_datagram_iovec(skb, offset,
                                              msg->msg_iov, used);
                if (err) {
                    /* Exception. Bailout! */
                    if (!copied)
                        copied = -EFAULT;
                    break;
                }
            }
        }

        *seq += used;
        copied += used;
        len -= used;

        tcp_rcv_space_adjust(sk);

skip_copy:
        if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
            tp->urg_data = 0;
            tcp_fast_path_check(sk);
        }
        if (used + offset < skb->len)
            continue;

        if (tcp_hdr(skb)->fin)
            goto found_fin_ok;
        if (!(flags & MSG_PEEK)) {
            sk_eat_skb(sk, skb, copied_early);
            copied_early = 0;
        }
        continue;

    found_fin_ok:
        /* Process the FIN. */
        ++*seq;
        if (!(flags & MSG_PEEK)) {
            sk_eat_skb(sk, skb, copied_early);
            copied_early = 0;
        }
        break;
    } while (len > 0);

    if (user_recv) {
        if (!skb_queue_empty(&tp->ucopy.prequeue)) {
            int chunk;

            tp->ucopy.len = copied > 0 ? len : 0;

            tcp_prequeue_process(sk);

            if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
                NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
                len -= chunk;
                copied += chunk;
            }
        }

        tp->ucopy.task = NULL;
        tp->ucopy.len = 0;
    }

#ifdef CONFIG_NET_DMA
    tcp_service_net_dma(sk, true);  /* Wait for queue to drain */
    tp->ucopy.dma_chan = NULL;

    if (tp->ucopy.pinned_list) {
        dma_unpin_iovec_pages(tp->ucopy.pinned_list);
        tp->ucopy.pinned_list = NULL;
    }
#endif

    /* According to UNIX98, msg_name/msg_namelen are ignored
     * on connected socket. I was just happy when found this 8) --ANK
     */

    /* Clean up data we have read: This will do ACK frames. */
    tcp_cleanup_rbuf(sk, copied);

    TCP_CHECK_TIMER(sk);
    release_sock(sk);
    return copied;

out:
    TCP_CHECK_TIMER(sk);
    release_sock(sk);   /* release_sock() is also what drains the backlog */
    return err;

recv_urg:
    err = tcp_recv_urg(sk, msg, len, flags);
    goto out;
}
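One userspace knob that feeds straight into this function is SO_RCVLOWAT: sock_rcvlowat() turns it into the `target` above, the "read at least this many bytes" threshold that decides when the loop may stop. A small sketch using only the standard socket API (the 4 KB value is just an example):

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int lowat = 4096, got = 0;
    socklen_t optlen = sizeof(got);

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    /* SO_RCVLOWAT is what sock_rcvlowat() turns into `target`: a blocking
     * recv() will try to wait until at least this many bytes are available
     * (subject to EOF, signals and timeouts). */
    if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat)) < 0)
        perror("setsockopt(SO_RCVLOWAT)");
    if (getsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &got, &optlen) == 0)
        printf("SO_RCVLOWAT is now %d\n", got);

    close(fd);
    return 0;
}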

Now let's trace tcp_prequeue_process():

static void tcp_prequeue_process(struct sock *sk)
{
    struct sk_buff *skb;
    struct tcp_sock *tp = tcp_sk(sk);

    NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);

    /* RX process wants to run with disabled BHs, though it is not
     * necessary */
    local_bh_disable();
    while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)  /* walk the prequeue */
        sk_backlog_rcv(sk, skb);    /* process each packet; its data is merged into sk_receive_queue
                                     * (or copied straight to the reader's iovec) */
    local_bh_enable();

    /* Clear memory counter. */
    tp->ucopy.memory = 0;
}

Finally, let's look at release_sock():

void release_sock(struct sock *sk)
{
    /*
     * The sk_lock has mutex_unlock() semantics:
     */
    mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

    spin_lock_bh(&sk->sk_lock.slock);
    if (sk->sk_backlog.tail)    /* the backlog is not empty */
        __release_sock(sk);     /* this is where it actually gets drained */
    sk->sk_lock.owned = 0;
    if (waitqueue_active(&sk->sk_lock.wq))
        wake_up(&sk->sk_lock.wq);
    spin_unlock_bh(&sk->sk_lock.slock);
}
static void __release_sock(struct sock *sk)
{
    struct sk_buff *skb = sk->sk_backlog.head;  /* remember the current head of the backlog */

    do {
        sk->sk_backlog.head = sk->sk_backlog.tail = NULL;   /* detach the list: head and tail become empty */
        bh_unlock_sock(sk);     /* drop the socket spinlock (relevant on SMP) */

        do {
            struct sk_buff *next = skb->next;

            skb->next = NULL;
            sk_backlog_rcv(sk, skb);    /* calls tcp_v4_do_rcv() -> tcp_rcv_established(),
                                         * which merges the backlog packet's data into
                                         * sk_receive_queue */

            /*
             * We are in process context here with softirqs
             * disabled, use cond_resched_softirq() to preempt.
             * This is safe to do because we've taken the backlog
             * queue private:
             */
            cond_resched_softirq();

            skb = next;                 /* move on to the next packet */
        } while (skb != NULL);          /* inner loop: walk the detached list */

        bh_lock_sock(sk);
    } while ((skb = sk->sk_backlog.head) != NULL);  /* this outer loop took me a while to understand */

    /* How to read the outer loop: __release_sock() runs in process context,
     * while sk_add_backlog() runs in softirq context, and softirqs win.
     * So while __release_sock() is busy processing packets, new packets may
     * keep being appended to the backlog, leaving sk->sk_backlog.head
     * non-NULL again. Only when nothing new has arrived does the outer loop
     * finally stop.
     */

    /*
     * Doing the zeroing here guarantee we can not loop forever
     * while a wild producer attempts to flood us.
     */
    sk->sk_backlog.len = 0;
}
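To convince myself that the outer loop really terminates the way the comments describe, I wrote a tiny single-threaded model of the "detach the list, drain it, look again" pattern. It is purely my own illustration; in the kernel the producer is the softirq calling sk_add_backlog():

#include <stdio.h>
#include <stdlib.h>

/* Toy backlog: a singly linked list with head/tail, as in sk->sk_backlog. */
struct pkt { int id; struct pkt *next; };
static struct pkt *head, *tail;

static void backlog_add(int id)             /* stands in for sk_add_backlog() */
{
    struct pkt *p = malloc(sizeof(*p));
    p->id = id;
    p->next = NULL;
    if (tail)
        tail->next = p;
    else
        head = p;
    tail = p;
}

int main(void)
{
    int produced = 0, rounds = 0;

    backlog_add(produced++);
    backlog_add(produced++);

    /* The __release_sock() pattern: detach the whole list, process it,
     * then look again -- new packets may have arrived in the meantime. */
    do {
        struct pkt *p = head;

        head = tail = NULL;             /* take the queue private */
        rounds++;
        while (p) {
            struct pkt *next = p->next;
            printf("round %d: processing pkt %d\n", rounds, p->id);
            free(p);
            /* simulate the softirq sneaking in more packets
             * while we were busy processing */
            if (produced < 5)
                backlog_add(produced++);
            p = next;
        }
    } while (head != NULL);             /* stop only when nothing new arrived */

    return 0;
}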