概述

recvmsg系统调用在tcp层的实现是tcp_recvmsg函数,该函数负责把接收队列中的数据复制到用户空间。函数执行期间会通过lock_sock锁定传输控制块,避免软中断在tcp层对同一套接字的处理造成影响。函数会从三个队列中读取数据:接收队列sk_receive_queue、预处理队列prequeue和后备队列backlog。其中prequeue和backlog中的数据还需要经过sk_backlog_rcv回调(其实现为tcp_v4_do_rcv)处理:报文先被缓存到队列中,等到需要读取时才进入协议栈处理。此时协议栈处理是在进程上下文中执行的,并且由于已经设置了tp->ucopy.task=current,协议栈在处理过程中会直接将数据复制到用户空间。

代码分析
 /*
  * tcp_recvmsg - copy received TCP data to the caller's message buffer.
  *
  * @sk:       socket to read from
  * @msg:      destination message header; data is copied into it
  * @len:      maximum number of bytes to read
  * @nonblock: passed to sk_busy_loop() and sock_rcvtimeo()
  * @flags:    MSG_* flags; MSG_ERRQUEUE, MSG_OOB, MSG_PEEK, MSG_TRUNC and
  *            MSG_WAITALL are examined here
  * @addr_len: only forwarded to inet_recv_error() for MSG_ERRQUEUE
  *
  * Holds the socket lock (lock_sock) for the whole read. Data is taken from
  * sk->sk_receive_queue first; the prequeue and the backlog are drained
  * when the receive queue cannot satisfy the request.
  *
  * Returns the number of bytes consumed (copied, or skipped when MSG_TRUNC
  * is set), or a negative errno value.
  */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct task_struct *user_recv = NULL;
	struct sk_buff *skb, *last;
	u32 urg_hole = 0;

	if (unlikely(flags & MSG_ERRQUEUE))
		return inet_recv_error(sk, msg, len, addr_len);

	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
	    (sk->sk_state == TCP_ESTABLISHED))
		sk_busy_loop(sk, nonblock);

	/* Lock the socket at the transport layer to keep softirq processing out. */
	lock_sock(sk);

	err = -ENOTCONN;
	/* Reading data is not allowed in the LISTEN state. */
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	/* Timeout for a blocking read; 0 when non-blocking. */
	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	/* Out-of-band read. */
	if (flags & MSG_OOB)
		goto recv_urg;

	/* Repair mode. */
	if (unlikely(tp->repair)) {
		err = -EPERM;
		if (!(flags & MSG_PEEK))
			goto out;

		if (tp->repair_queue == TCP_SEND_QUEUE)
			goto recv_sndq;

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out;

		/* 'common' recv queue MSG_PEEK-ing */
	}

	/* Sequence number of the next byte to read. */
	seq = &tp->copied_seq;
	/* Peek only: advance a private copy so tp->copied_seq is untouched. */
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	/*
	 * Minimum amount to read: the caller's len when MSG_WAITALL is set,
	 * otherwise the receive low-water mark.
	 */
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
		if (tp->urg_data && tp->urg_seq == *seq) {
			/* Some data has already been read: stop here. */
			if (copied)
				break;
			/* The calling task has a signal pending: stop here. */
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		/* Remember the queue tail. */
		last = skb_peek_tail(&sk->sk_receive_queue);
		/* Walk the receive queue looking for an skb that holds *seq. */
		skb_queue_walk(&sk->sk_receive_queue, skb) {
			last = skb;
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			/* The queued skb starts beyond the sequence we want. */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			/* Offset of *seq within this skb. */
			offset = *seq - TCP_SKB_CB(skb)->seq;
			/* A SYN flag is present: subtract one more from the offset. */
			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				pr_err_once("%s: found a SYN, please report !\n", __func__);
				offset--;
			}
			/* Offset falls inside the skb data: found it. */
			if (offset < skb->len)
				goto found_ok_skb;
			/* FIN flag set: go handle the FIN. */
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		/* Target reached and the backlog is empty: done. */
		if (copied >= target && !sk->sk_backlog.tail)
			break;

		/*
		 * From here on: either the target has not been reached, or it
		 * has been reached but the backlog is not empty.
		 */
		if (copied) {
			/* Some data already read: stop on error, close,
			 * receive shutdown, no timeout left, or pending signal.
			 */
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			/* Session is finished. */
			if (sock_flag(sk, SOCK_DONE))
				break;

			/* Socket error pending. */
			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			/* Receive side has been shut down. */
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			/* Connection is closed. */
			if (sk->sk_state == TCP_CLOSE) {
				/* Not in the DONE state: possibly a read on a
				 * socket that was never connected.
				 */
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			/* Do not block waiting. */
			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			/* Signal pending. */
			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		/* Check whether an ACK needs to be sent. */
		tcp_cleanup_rbuf(sk, copied);

		/* Low latency is off, and tp->ucopy.task matches user_recv
		 * (NULL before a reader is installed, this task afterwards).
		 */
		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
			/* Install new reader */
			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
				user_recv = current;
				tp->ucopy.task = user_recv;
				tp->ucopy.msg = msg;
			}

			/* User buffer space currently available. */
			tp->ucopy.len = len;

			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
				!(flags & (MSG_PEEK | MSG_TRUNC)));

			/* Ugly... If prequeue is not empty, we have to
			 * process it before releasing socket, otherwise
			 * order will be broken at second iteration.
			 * More elegant solution is required!!!
			 *
			 * Look: we have the following (pseudo)queues:
			 *
			 * 1. packets in flight
			 * 2. backlog
			 * 3. prequeue
			 * 4. receive_queue
			 *
			 * Each queue can be processed only if the next ones
			 * are empty. At this point we have empty receive_queue.
			 * But prequeue _can_ be not empty after 2nd iteration,
			 * when we jumped to start of loop because backlog
			 * processing added something to receive_queue.
			 * We cannot release_sock(), because backlog contains
			 * packets arrived _after_ prequeued ones.
			 *
			 * Shortly, algorithm is clear --- to process all
			 * the queues in order. We could make it more directly,
			 * requeueing packets from backlog to prequeue, if
			 * is not empty. It is more elegant, but eats cycles,
			 * unfortunately.
			 */
			/* Prequeue is not empty: process it. */
			if (!skb_queue_empty(&tp->ucopy.prequeue))
				goto do_prequeue;

			/* __ Set realtime policy in scheduler __ */
		}

		if (copied >= target) {
			/* Target reached: handle the backlog. */
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else {
			/* Target not reached yet: wait for data. */
			sk_wait_data(sk, &timeo, last);
		}

		/* A user-space reader is installed. */
		if (user_recv) {
			int chunk;

			/* __ Restore normal policy in scheduler __ */

			/* Bytes consumed from the user buffer since
			 * tp->ucopy.len was last set to len.
			 */
			chunk = len - tp->ucopy.len;
			/* Account for them in the remaining and copied counts. */
			if (chunk != 0) {
				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				len -= chunk;
				copied += chunk;
			}

			/* Everything received so far has been copied to user
			 * space and the prequeue is not empty.
			 */
			if (tp->rcv_nxt == tp->copied_seq &&
			    !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
				/* Process the prequeue. */
				tcp_prequeue_process(sk);

				/* Recompute copied and remaining lengths. */
				chunk = len - tp->ucopy.len;
				if (chunk != 0) {
					NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
					len -= chunk;
					copied += chunk;
				}
			}
		}

		/* Peeking: resynchronise peek_seq if copied_seq moved under us. */
		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
					    current->comm,
					    task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

		/* Read from the segment that was found. */
found_ok_skb:
		/* Ok so how much can we use? */
		/* Readable bytes left in this skb. */
		used = skb->len - offset;
		/* Caller wants less than that: clamp to len. */
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			/* Distance from *seq to the urgent byte. */
			u32 urg_offset = tp->urg_seq - *seq;

			/* The urgent byte lies within the range being read. */
			if (urg_offset < used) {
				if (!urg_offset) {
					/* Positioned exactly on the urgent byte. */
					/* Urgent data is not delivered inline. */
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						/* Step over the urgent byte. */
						++*seq;
						urg_hole++;
						offset++;
						used--;
						/* Nothing left to read. */
						if (!used)
							goto skip_copy;
					}
				} else {
					/* Read only up to the urgent byte this time. */
					used = urg_offset;
				}
			}
		}

		/* Copy the data out unless MSG_TRUNC was requested. */
		if (!(flags & MSG_TRUNC)) {
			err = skb_copy_datagram_msg(skb, offset, msg, used);
			if (err) {
				/* Exception. Bailout! */
				if (!copied)
					copied = -EFAULT;
				break;
			}
		}

		/* Update the sequence, copied and remaining counters. */
		*seq += used;
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		/* Urgent data handling is complete. */
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			/* Clear the flag. */
			tp->urg_data = 0;
			/* Fast-path check. */
			tcp_fast_path_check(sk);
		}
		/* This skb still has unread data: keep reading. */
		if (used + offset < skb->len)
			continue;

		/* FIN handling. */
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			goto found_fin_ok;
		/* skb fully consumed and not peeking: release it. */
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		continue;

found_fin_ok:
		/* Process the FIN. */
		/* Advance the sequence number. */
		++*seq;
		/* Not peeking: release the skb. */
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		break;
	} while (len > 0);

	/* A user-space reader was installed. */
	if (user_recv) {
		/* Prequeue is not empty. */
		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
			int chunk;

			/* Adjust the remaining user buffer space. */
			tp->ucopy.len = copied > 0 ? len : 0;

			/* Process the prequeue. */
			tcp_prequeue_process(sk);

			/* Data was read: recompute the lengths. */
			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				len -= chunk;
				copied += chunk;
			}
		}

		/* The user-space reader is finished. */
		tp->ucopy.task = NULL;
		tp->ucopy.len = 0;
	}

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket. I was just happy when found this 8) --ANK
	 */

	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

	release_sock(sk);
	return copied;

out:
	release_sock(sk);
	return err;

recv_urg:
	/* Out-of-band data. */
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;

recv_sndq:
	err = tcp_peek_sndq(sk, msg, len);
	goto out;
}
05-28 15:11