转载请注明出处:http://blog.chinaunix.net/uid-20788636-id-4420258.html 2.3  inet_csk_get_port函数 在inet_connection_sock.c文件中的inet_csk_get_port函数分析。int inet_csk_get_port(struct sock *sk, unsigned short snum){         struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;/*TCP散列表管理结构实例tcp_hashinfo,在tcp.c文件中tcp_init函数中进行了初始化工作,在tcp_ipv4.c文件中,struct proto tcp_prot结构体对其进行赋值 .h.hashinfo                 =&tcp_hashinfo,*/         struct inet_bind_hashbucket *head;         struct inet_bind_bucket *tb;         int ret, attempts = 5;         struct net *net = sock_net(sk);         int smallest_size = -1, smallest_rover;         kuid_t uid = sock_i_uid(sk);//运行进程的用户ID          local_bh_disable();         if(!snum) {//如果用户绑定的端口为0,就选择一个可用的本地端口                   int remaining, rover, low, high; again:                   inet_get_local_port_range(net,&low, &high); //获取到本地可以使用的端口范围--(1)                   remaining= (high - low) + 1; //最大尝试分配的次数                   smallest_rover= rover = prandom_u32() % remaining + low; //随机生成的端口号,赋值给rover.                    smallest_size= -1;//下面的do-while循环代码根据随机选取的端口号和bhash_size从bhash上取得HASH值对应的链表,然后遍历链表,对比链表中是否有该端口号,如果有该端口号,说明该端口号已经被占用,如果已经被占用且不能复用,则将端口号加一,如果大于最大值,则从最小值开始重新遍历端口列表,直到尝试的次数达到remaining.                   
do{                            if(inet_is_reserved_local_port(rover))//如果是保留端口直接寻找下一个端口                                     goto next_nolock;                            head= &hashinfo->bhash[inet_bhashfn(net, rover,                                               hashinfo->bhash_size)];                            spin_lock(&head->lock);                            inet_bind_bucket_for_each(tb,&head->chain)                                     if(net_eq(ib_net(tb), net) && tb->port == rover) {/*下面这段代码判断端口是否可以被复用:如果可以被复用,即使该端口已在绑定表中,也优先使用可以复用的端口*/                                              if(((tb->fastreuse > 0 &&                                                     sk->sk_reuse &&                                                     sk->sk_state != TCP_LISTEN) ||                                                    (tb->fastreuseport > 0 &&                                                     sk->sk_reuseport &&                                                     uid_eq(tb->fastuid, uid))) &&                                                   (tb->num_owners < smallest_size || smallest_size == -1)) {                                                        smallest_size= tb->num_owners;//记下端口使用者的个数                                                        smallest_rover= rover;/*如果绑定端口的个数大于端口的可用个数,就会判断是否有绑定冲突*/                                                        if(atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&                                                           !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {                                                                 snum= smallest_rover;//如果没有绑定冲突使用该端口 调用inet_csk_bind_conflict                                                                 goto tb_found; //跳转到找到该端口处理                                                        }                                               }//检查端口绑定是否有冲突,如果没有冲突就使用该端口inet_csk_bind_conflict                                               if(!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {           
                                             snum= rover;                                                        goto tb_found; //跳转到找到该端口处理                                               }                                               goto next;//此端口在绑定表中,但是不能复用,寻找下一个                                     }                            break;//如果不在绑定表中,则该端口可以使用,直接跳出循环                   next:                            spin_unlock(&head->lock);                   next_nolock:                            if(++rover > high)//如果找到的端口号,大于端口的上限值,则把最小端口赋值给rover                                     rover= low;                   }while (--remaining > 0);                    /*Exhausted local port range during search? It is not                    * possible for us to be holding one of the bind hash                    * locks if this test triggers, because if 'remaining'                    * drops to zero, we broke out of the do/while loop at                    * the top level, not from the 'break;' statement.                    */                   ret= 1;                   if(remaining <= 0) {//这里主要是在没有查到的情况下,再给最后一次机会                            if(smallest_size != -1) {                                     snum= smallest_rover;                                     goto have_snum;                            }                            goto fail;                   }                   /*OK, here is the one we will use.  HEAD is                    * non-NULL and we hold it's mutex.                    
*/                   snum= rover;//找到绑定的端口号         }else {//如果指定端口号,则在相应的绑定链表中进行查询。have_snum:                   head= &hashinfo->bhash[inet_bhashfn(net, snum,                                     hashinfo->bhash_size)];                   spin_lock(&head->lock);                   inet_bind_bucket_for_each(tb,&head->chain)                            if(net_eq(ib_net(tb), net) && tb->port == snum)                                     gototb_found;//在绑定表中查找,表示该端口已经绑定         }         tb= NULL;//如果指定的端口在绑定表中没有发现,直接创建         gototb_not_found;tb_found:         if(!hlist_empty(&tb->owners)) {//该端口绑定Socket                   if(sk->sk_reuse == SK_FORCE_REUSE)                            gotosuccess;//如果该Socket设置了SK_FORCE_REUSE,表示可以强制复用                    if(((tb->fastreuse > 0 &&                         sk->sk_reuse &&sk->sk_state != TCP_LISTEN) ||                        (tb->fastreuseport > 0 &&                        sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&                       smallest_size == -1) {/*判断端口是否可以复用,其中fastreuseport是Google添加的一个SOCKET bind选项信息,tb 配置启用了 reuseport,并且当前 socket 也设置 了reuseport,且 tb 和当前 socket 的 UID 一样,可以认为当前 socket 也可以放到 bind hash 中,随后会调用 inet_bind_hash 将当前 sock 也加入到 tb->owners 链表中*/                            gotosuccess;                   }else {//如果是指定端口的话,else应该不会执行                            ret= 1;                            if(inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {//如果绑定冲突,进行5次尝试查找端口号 attempts = 5;由于在查找时进行了类似的判断,该判断条件基本不会成立,直接执行tb_not_found点,这时tb不为空                                     if(((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||                                          (tb->fastreuseport > 0 &&                                           sk->sk_reuseport &&uid_eq(tb->fastuid, uid))) &&                                         smallest_size != -1 && --attempts>= 0) {                                               spin_unlock(&head->lock);                                               gotoagain;                    
                 }                                      goto fail_unlock;                            }                   }         }tb_not_found:         ret= 1;//如果在绑定表中没有发现,则创建         if(!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,                                               net,head, snum)) == NULL)                   goto fail_unlock;         if(hlist_empty(&tb->owners)) {//如果没有绑定Socket,                   if(sk->sk_reuse && sk->sk_state != TCP_LISTEN)                            tb->fastreuse= 1;                   else                            tb->fastreuse= 0;                   if(sk->sk_reuseport) {//如果Socket设定了SO_REUSEPORT选项,就对fastreuseport进行赋值1,                            tb->fastreuseport= 1;                            tb->fastuid= uid;//记录创建当前fd的用户UID                   }else                            tb->fastreuseport= 0;         }else {//如果绑定了Socket                   if(tb->fastreuse &&                       (!sk->sk_reuse || sk->sk_state ==TCP_LISTEN))                            tb->fastreuse= 0;                   if(tb->fastreuseport &&                       (!sk->sk_reuseport ||!uid_eq(tb->fastuid, uid)))                            tb->fastreuseport= 0;         }success://如果成功找到一个可用的端口。添加到绑定表中         if(!inet_csk(sk)->icsk_bind_hash)                   inet_bind_hash(sk,tb, snum);//把当前的sock插入到owners链表中,         WARN_ON(inet_csk(sk)->icsk_bind_hash!= tb);         ret= 0; fail_unlock:         spin_unlock(&head->lock);fail:         local_bh_enable();         return ret;}2.3.1  inet_get_local_port_range() inet_get_local_port_range()获取本地可用端口的范围,从下面的定义可以知道端口的默认范围为32768—61000。如果用户空间绑定的本地端口为0的话,会自动为套接口分配一个可用的端口。/* *This struct holds the first and last local port number. 
*/struct local_ports sysctl_local_ports __read_mostly = {         .lock= SEQLOCK_UNLOCKED,         .range= { 32768, 61000 },}; void inet_get_local_port_range(int *low,int *high){         unsigned seq;         do{                   seq= read_seqbegin(&sysctl_local_ports.lock);                    *low= sysctl_local_ports.range[0];                   *high= sysctl_local_ports.range[1];         }while (read_seqretry(&sysctl_local_ports.lock, seq));}2.3.2 本地端口可以被复用的条件 本地端口可以被复用的几个条件如下:          a) 如果Socket绑定在不同的接口上,可以共享同一个本地端口。          b) 如果Sockets设置了sk->sk_reuse,并且这些Sockets的状态都不是TCP_LISTEN,端口可以被复用。          c) 如果Sockets绑定了特定的inet_sk(sk)->rcv_saddr 本地地址,并且这些地址不相同,端口可以被复用。如果上面三个条件都不满足,则端口不能够被复用。2.3.3  inet_bind_bucket 结构体struct inet_bind_bucket {#ifdef CONFIG_NET_NS         struct net                   *ib_net;#endif         unsigned short                   port;//端口号         signed char                fastreuse;//地址复用SO_REUSEADDR         signed char                fastreuseport;//端口号复用         kuid_t                          fastuid;//进程的用户ID         int                       num_owners;//端口使用者的个数         struct hlist_node     node;//指向下一个端口的inet_bind_bucket         struct hlist_head     owners;//使用这个端口的Socket链表};2.3.4  inet_csk_bind_conflict函数 inet_csk_bind_conflict检查端口是否冲突,返回0表示可以绑定,不冲突,返回1表示无法绑定该端口号int inet_csk_bind_conflict(const struct sock *sk,                               const struct inet_bind_bucket *tb, bool relax){         struct sock *sk2;         int reuse = sk->sk_reuse;// SO_REUSEADDR         int reuseport = sk->sk_reuseport;         kuid_t uid = sock_i_uid((struct sock *)sk);          /*          * Unlike other sk lookup places we do not check          * for sk_net here, since _all_ the socks listed          * in tb->owners list belong to the same net - the          * one this bucket belongs to.          
*//* 在tb->owners链表中循环检查绑定该端口的Socket,确定该端口是否冲突*/         sk_for_each_bound(sk2,&tb->owners) {/*这里的判断看是否冲突:第一个if判断:不是同一个socket,且!inet_v6_ipv6only(sk2)成立,并且任一方没有绑定设备,或者两者绑定的设备相同*/                   if(sk != sk2 &&                       !inet_v6_ipv6only(sk2) &&                       (!sk->sk_bound_dev_if ||                        !sk2->sk_bound_dev_if ||                        sk->sk_bound_dev_if ==sk2->sk_bound_dev_if)) {/*满足下面的条件之一a为真:1)要绑定的socket不能复用;2)查找到的socket不能复用;3)查找到的socket处于监听状态;满足下面的条件之一b为真:1)要绑定的Socket端口不能复用;2)查找到的Socket的端口不允许复用;3)在链表中查找到的socket的状态不是TCP_TIME_WAIT,并且两个socket的用户ID不相等。如果a和b都为真,再判断绑定的IP地址是否相同*/                            if ((!reuse || !sk2->sk_reuse ||                                sk2->sk_state == TCP_LISTEN) &&                                (!reuseport || !sk2->sk_reuseport ||                                (sk2->sk_state != TCP_TIME_WAIT &&                                 !uid_eq(uid, sock_i_uid(sk2))))) {      /*绑定在相同的IP上,或者任一方没有指定本地地址*/                                     if(!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||                                         sk2->sk_rcv_saddr ==sk->sk_rcv_saddr)                                               break;//检查到冲突                            }/*如果relax为false,就不需要判断端口号是否可以复用,只判断地址是否可以复用*/                            if(!relax && reuse && sk2->sk_reuse &&                                sk2->sk_state != TCP_LISTEN) {                                      if(!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||                                         sk2->sk_rcv_saddr ==sk->sk_rcv_saddr)                                               break;                            }                   }         }         return sk2 != NULL;//sk2不等于空,说明有冲突}2.3.5 inet_bind_bucket_create函数 inet_bind_bucket_create函数分配一个inet_bind_bucket结构体实例并进行初始化操作,然后绑定到已绑定端口的散列表中struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,                                                         struct net *net,                                                         struct 
inet_bind_hashbucket *head,                                                         const unsigned short snum){         struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);          if(tb != NULL) {                   write_pnet(&tb->ib_net,hold_net(net));                   tb->port      = snum;                   tb->fastreuse= 0;                   tb->fastreuseport= 0;                   tb->num_owners= 0;//这些初始化,会根据配置的socket参数进行修改                   INIT_HLIST_HEAD(&tb->owners);                   hlist_add_head(&tb->node,&head->chain);         }         return tb;}2.3.6  inet_bind_hash 函数 inet_bind_hash 函数把sock加入到绑定桶中并更新相关变量void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,                       const unsigned short snum){         struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;//TCP散列表管理结构实例tcp_hashinfo          atomic_inc(&hashinfo->bsockets);//绑定次数加1          inet_sk(sk)->inet_num= snum;//端口号赋值         sk_add_bind_node(sk,&tb->owners);//把Socket加入到tb->owners链表中         tb->num_owners++;//端口的绑定次数增加         inet_csk(sk)->icsk_bind_hash= tb;}2.3.7 数据结构之间的关系 各种数据结构之间的关系 2.3.8  inet_hashinfo 结构体struct inet_hashinfo {         /*This is for sockets with full identity only. Sockets here will          * always be without wildcards and will have the following invariant:          *          *         TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE          *          */         struct inet_ehash_bucket        *ehash;         spinlock_t                            *ehash_locks;         unsigned int                        ehash_mask;         unsigned int                        ehash_locks_mask;          /*Ok, let's try this, I give up, we do need a local binding          * TCP hash as well as the others for fast bind/connect.          
*/         struct inet_bind_hashbucket  *bhash;          unsigned int                        bhash_size;         /*4 bytes hole on 64 bit */          struct kmem_cache                   *bind_bucket_cachep;          /*All the above members are written once at bootup and          * never written again _or_ are predominantly read-access.          *          * Now align to a new cache line as all the following members          * might be often dirty.          */         /*All sockets in TCP_LISTEN state will be in here.  This is the only          * table where wildcard'd TCP sockets can exist.  Hash function here          * is just local port number.          */         struct inet_listen_hashbucket         listening_hash[INET_LHTABLE_SIZE]                                               ____cacheline_aligned_in_smp;          atomic_t                    bsockets;};tcp表分成了三张表ehash, bhash, listening_hash,其中ehash, listening_hash对应于socket处在TCP的ESTABLISHED, LISTEN状态,bhash对应于socket已绑定了本地端口。2.4 流程和总结 (1)bind的主要工作是选择一个可用的端口号,如果用户没有指定端口号,则会按照一定的规则选择一个可用的端口号。 附录:    Google 提交的 REUSEPORT 新特性,支持多个进程或者线程绑定到相同的 IP 和端口,以提高 server 的性能。    该特性实现了 IPv4/IPv6 下 TCP/UDP 协议的支持, 已经集成到 kernel 3.9 中。核心的实现主要有三点:(1)扩展 socket option,增加 SO_REUSEPORT 选项,用来设置 reuseport。(2)修改 bind 系统调用实现,以便支持可以绑定到相同的 IP 和端口。(3)修改处理新建连接的实现,查找 listener 的时候,能够支持在监听相同 IP 和端口的多个 sock 之间均衡选择。请参考:http://blog.chinaunix.net/uid-10167808-id-3807060.html 参考资料:http://blog.csdn.net/zhangskd/article/details/13631715 http://tsecer.blog.163.com/blog/static/1501817201281211321031
10-15 14:43
查看更多