1. udp_rcv是封装函数,直接调用__udp4_lib_rcv函数来处理,那么我们来看看这个函数:
点击(此处)折叠或打开
- /*
- * All we need to do is get the socket, and then do a checksum.
- *
- * Receive one UDP or UDP-Lite datagram: validate the header and length,
- * initialise the checksum state, look up the destination socket in
- * udptable and hand the skb to udp_queue_rcv_skb(). Packets arriving on a
- * broadcast/multicast route go to __udp4_lib_mcast_deliver() instead.
- *
- * skb:        the received packet.
- * udptable:   hash table searched for the destination socket.
- * is_udplite: non-zero selects the UDP-Lite checksum-coverage check and
- *             is passed to the statistics macros.
- *
- * Returns 0, or the negated value of a positive udp_queue_rcv_skb()
- * result, or whatever __udp4_lib_mcast_deliver() returns.
- */
- int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
- int is_udplite)
- {
- struct sock *sk; // destination socket, filled in by the lookup below
- struct udphdr *uh = skb->h.uh; // UDP header (carries the source and destination ports)
- unsigned short ulen;
- struct rtable *rt = (struct rtable*)skb->dst;
- __be32 saddr = skb->nh.iph->saddr; // source and destination IPv4 addresses from the IP header
- __be32 daddr = skb->nh.iph->daddr;
- /*
- * Validate the packet.
- */
- if (!pskb_may_pull(skb, sizeof(struct udphdr)))
- goto drop; /* No space for header. */
- /*
- * pskb_may_pull() may reallocate the skb head, which would leave the
- * pointer cached at declaration time stale. Reload it before the first
- * dereference, exactly as is already done after pskb_trim_rcsum() below.
- */
- uh = skb->h.uh;
- ulen = ntohs(uh->len);
- if (ulen > skb->len)
- goto short_packet;
- if(! is_udplite ) { /* UDP validates ulen. */
- if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
- goto short_packet;
- uh = skb->h.uh;
- udp4_csum_init(skb, uh);
- } else { /* UDP-Lite validates cscov. */
- if (udplite4_csum_init(skb, uh))
- goto csum_error;
- }
- if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) // broadcast or multicast route: use the multicast delivery path
- return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable);
- sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest,
- skb->dev->ifindex, udptable ); // core step: find the socket matching the addresses, ports and incoming interface
- if (sk != NULL) {
- int ret = udp_queue_rcv_skb(sk, skb); // socket found: hand the skb over for queuing
- sock_put(sk);
- /* a return value > 0 means to resubmit the input, but
- * it wants the return to be -protocol, or 0
- */
- if (ret > 0) // positive result is reported negated, as described above
- return -ret;
- return 0;
- }
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto drop;
- nf_reset(skb);
- /* No socket. Drop packet silently, if checksum is wrong */
- if (udp_lib_checksum_complete(skb))
- goto csum_error;
- UDP_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); // no socket matched: report "port unreachable" to the sender
- /*
- * Hmm. We got an UDP packet to a port to which we
- * don't wanna listen. Ignore it.
- */
- kfree_skb(skb); // done with this skb
- return(0);
- short_packet:
- LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
- is_udplite? "-Lite" : "",
- NIPQUAD(saddr),
- ntohs(uh->source),
- ulen,
- skb->len,
- NIPQUAD(daddr),
- ntohs(uh->dest));
- goto drop;
- csum_error:
- /*
- * RFC1122: OK. Discards the bad packet silently (as far as
- * the network is concerned, anyway) as per 4.1.3.4 (MUST).
- */
- LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
- is_udplite? "-Lite" : "",
- NIPQUAD(saddr),
- ntohs(uh->source),
- NIPQUAD(daddr),
- ntohs(uh->dest),
- ulen);
- drop:
- UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
- return(0);
- }
函数的注释正确地归纳了这个函数的处理过程:取得相应的socket,做一些检查。其实还应该加一句:把skb挂入socket的接收队列中。
2. 从上面的代码可以看出,此过程比较简单。分析一下根据端口号找socket的过程和将skb挂入socket接收队列的过程:
点击(此处)折叠或打开
- /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
- * harder than this. -DaveM
- *
- * Pick the best-matching socket for a datagram from udptable.
- *
- * The bucket is udptable[hnum & (UDP_HTABLE_SIZE - 1)], where hnum is the
- * destination port in host byte order. A candidate must have
- * sk_hash == hnum and must not be IPv6-only. Its score starts at 1 for a
- * PF_INET socket (0 otherwise). Each of rcv_saddr, daddr, dport and
- * sk_bound_dev_if that is set on the socket must match the packet, or the
- * socket is skipped; each match adds 2. A score of 9 (everything matched)
- * ends the search at once. Otherwise the highest score wins, and because
- * the comparison is strictly "greater than", the first socket on the
- * chain wins a tie.
- *
- * Returns the chosen socket with a reference taken via sock_hold(), or
- * NULL when nothing matched.
- */
- static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
- __be32 daddr, __be16 dport,
- int dif, struct hlist_head udptable[])
- {
- struct sock *sk, *result = NULL;
- struct hlist_node *node;
- unsigned short hnum = ntohs(dport);
- int badness = -1;
- read_lock(&udp_hash_lock); // the chain walk runs under the read lock
- sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { // presumably the buckets are filled when a socket binds a port (not shown here)
- // walk this port's hash chain, visiting each struct sock on it
- struct inet_sock *inet = inet_sk(sk);
- if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { // same local port, and not an IPv6-only socket
- int score = (sk->sk_family == PF_INET ? 1 : 0);
- if (inet->rcv_saddr) { // socket has a local address set: it must equal the packet's destination address
- if (inet->rcv_saddr != daddr)
- continue;
- score+=2;
- }
- if (inet->daddr) {
- if (inet->daddr != saddr) // socket's peer address vs. the packet's source address
- continue;
- score+=2;
- }
- if (inet->dport) {
- if (inet->dport != sport) // socket's peer port vs. the packet's source port
- continue;
- score+=2;
- }
- if (sk->sk_bound_dev_if) { // socket is tied to an interface: it must be the index passed in as dif
- if (sk->sk_bound_dev_if != dif)
- continue;
- score+=2;
- }
- if(score == 9) {
- result = sk;
- break;
- } else if(score > badness) {
- result = sk;
- badness = score;
- }
- }
- }
- if (result)
- sock_hold(result);
- read_unlock(&udp_hash_lock);
- return result;
- }
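从__udp4_lib_lookup的代码可以看出,多个socket可以bind相同的端口号(但是得用setsockopt设置socket属性为SO_REUSEADDR,否则会bind失败)。如果这些socket属性一样(score分值一样),由于只有score严格大于badness时才会更新result,遍历哈希链表时最先遇到的那个socket会被选中(新bind的socket应当是插在链表头部,这部分代码此处没有列出),所以只有最后bind的socket有效,也就是说接收到的数据包会传给这个socket,其他socket接收不到skb。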
点击(此处)折叠或打开
- int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
- { /*
- * Run the per-socket receive checks on skb and queue it on sk.
- *
- * Steps, in order: xfrm policy check, optional UDP encapsulation
- * handling, UDP-Lite checksum-coverage checks, checksum verification
- * when the socket has a filter attached, then sock_queue_rcv_skb().
- *
- * Returns 0 when the skb was queued or eaten by the encapsulation
- * handler, the negated result of xfrm4_rcv_encap() for an encapsulated
- * packet, or -1 when the skb was dropped (it is freed and
- * UDP_MIB_INERRORS is incremented).
- */
- struct udp_sock *up = udp_sk(sk);
- int rc;
- /*
- * Charge it to the socket, dropping if the queue is full.
- */
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
- goto drop;
- nf_reset(skb);
- if (up->encap_type) {
- /*
- * This is an encapsulation socket, so let's see if this is
- * an encapsulated packet.
- * If it's a keepalive packet, then just eat it.
- * If it's an encapsulated packet, then pass it to the
- * IPsec xfrm input and return the response
- * appropriately. Otherwise, just fall through and
- * pass this up the UDP socket.
- */
- int ret;
- ret = udp_encap_rcv(sk, skb);
- if (ret == 0) {
- /* Eat the packet .. */
- kfree_skb(skb);
- return 0;
- }
- if (ret < 0) {
- /* process the ESP packet */
- ret = xfrm4_rcv_encap(skb, up->encap_type);
- UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
- return -ret;
- }
- /* FALLTHROUGH -- it's a UDP Packet */
- }
- /*
- * UDP-Lite specific tests, ignored on UDP sockets
- */
- if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
- /*
- * MIB statistics other than incrementing the error count are
- * disabled for the following two types of errors: these depend
- * on the application settings, not on the functioning of the
- * protocol stack as such.
- *
- * RFC 3828 here recommends (sec 3.3): "There should also be a
- * way ... to ... at least let the receiving application block
- * delivery of packets with coverage values less than a value
- * provided by the application."
- */
- if (up->pcrlen == 0) { /* full coverage was set */
- LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
- "%d while full coverage %d requested\n",
- UDP_SKB_CB(skb)->cscov, skb->len);
- goto drop;
- }
- /* The next case involves violating the min. coverage requested
- * by the receiver. This is subtle: if receiver wants x and x is
- * greater than the buffersize/MTU then receiver will complain
- * that it wants x while sender emits packets of smaller size y.
- * Therefore the above ...()->partial_cov statement is essential.
- */
- if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
- LIMIT_NETDEBUG(KERN_WARNING
- "UDPLITE: coverage %d too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
- goto drop;
- }
- }
- if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if (__udp_lib_checksum_complete(skb))
- goto drop;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- }
- if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { // the actual enqueue happens here; see sock_queue_rcv_skb() for the receive-buffer limit check
- /* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
- UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
- goto drop;
- }
- UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
- return 0;
- drop:
- UDP_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag);
- kfree_skb(skb);
- return -1;
- }
点击(此处)折叠或打开
- int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
- { /*
- * Append skb to sk's receive queue, subject to the receive-buffer limit
- * and the socket filter.
- *
- * Returns 0 on success, -ENOMEM when the buffer limit would be reached,
- * or the non-zero result of sk_filter(). On failure the skb is neither
- * queued nor freed here.
- */
- int err = 0;
- int skb_len;
- /* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
- number of warnings when compiling with -W --ANK
- */
- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= // sk_rmem_alloc: running total of truesize charged to this socket; grows when an skb is queued, shrinks when it is released
- (unsigned)sk->sk_rcvbuf) { // sk_rcvbuf: receive-buffer limit, presumably the value set via setsockopt (SO_RCVBUF). When packets arrive faster than the reader drains the queue,
- // raising this limit reduces drops to some extent.
-
- err = -ENOMEM;
- goto out;
- }
- err = sk_filter(sk, skb);
- if (err)
- goto out;
- skb->dev = NULL;
- skb_set_owner_r(skb, sk); // charges skb->truesize to sk->sk_rmem_alloc
- /* Cache the SKB length before we tack it onto the receive
- * queue. Once it is added it no longer belongs to us and
- * may be freed by other threads of control pulling packets
- * from the queue.
- */
- skb_len = skb->len;
- skb_queue_tail(&sk->sk_receive_queue, skb); // append the skb to sk_receive_queue
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
- out:
- return err;
- }
点击(此处)折叠或打开
- static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
- { /*
- * Mark sk as the owner of skb on the receive side: record the socket,
- * install sock_rfree as the destructor and add skb->truesize to the
- * socket's sk_rmem_alloc counter.
- */
- skb->sk = sk;
- skb->destructor = sock_rfree;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc); // atomic add to the socket's receive-memory counter
- }
- void sock_rfree(struct sk_buff *skb) // installed as skb->destructor by skb_set_owner_r(); presumably invoked when the skb is freed
- {
- struct sock *sk = skb->sk;
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc); // undo the charge made in skb_set_owner_r()
- }