前言:       对于Linux内核的Socket系列文章都是依据于:Linux-3.14.5的版本内核分析,对于文中的注释和问题的说明也参考了网络上经典分析文章,对他们的奉献表示感谢!     转载请标明:http://blog.chinaunix.net/uid-20788636-id-4408261.html 1. Socket内核调用函数SYSCALL_DEFINE3 Socket的创建是在用户空间调用socket系统函数完成的,创建一个Socket返回一个文件描述符fd,内核的系统调用接口为SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol),在net/socket.c文件中,下面我们看一下内核中的源码实现。 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol){         int retval;         struct socket *sock;         int flags;          /*Check the SOCK_* constants for consistency.  下面这些都是进行各种的检查操作*/         BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);         BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);         BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);         BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);          flags = type & ~SOCK_TYPE_MASK;         if(flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))                   return -EINVAL;         type &= SOCK_TYPE_MASK;           if(SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))                   flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;//调用创建socket的函数         retval = sock_create(family, type, protocol, &sock);//------参考下面的分析         if(retval < 0)                   goto out;          retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));         if(retval < 0)                   goto out_release; out:         /*It may be already another descriptor 8) Not kernel problem. 
*/         return retval; out_release:         sock_release(sock);         return retval;} 1.1  sock_create函数 对于sock_create(family, type, protocol, &sock)函数,调用的是封装函数__sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);对于__sock_create函数的定义如下: int __sock_create(struct net *net, int family, int type, int protocol,                             struct socket **res, int kern){         int err;         struct socket *sock;         const struct net_proto_family *pf;         /*          *     Check protocol is in range 检查协议的范围,现在内核定义的最大范围为41,这里的family指的是AF_INET6,AF_INET协议簇       #define NPROTO                  AF_MAX #define AF_MAX           41     /* For now.. */          */         if(family < 0 || family >= NPROTO)                   return -EAFNOSUPPORT;         if(type < 0 || type >= SOCK_MAX)//这里的type是socket的类型例如SOCK_STREAM                   return -EINVAL;         /*Compatibility.            This ugly moron is moved from INET layer to here to avoid            deadlock in module load.          */         if(family == PF_INET && type == SOCK_PACKET) {//如果是该类型的socket,对family进行重新的赋值                   static int warned;//这里自动初始化为0,                   if(!warned) {                            warned = 1;                            printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",                                   current->comm);                   }                   family = PF_PACKET;//赋值为PF_PACKET         }          err = security_socket_create(family, type, protocol, kern);         if(err)                   return err;          /*          *     Allocate the socket and allow the family to set things up. 
if          *     the protocol is 0, the family is instructed to select an appropriate          *     default.这里调用sock_alloc分配sock,见下面的分析          */         sock = sock_alloc();         if(!sock) {                   net_warn_ratelimited("socket: no more sockets\n");                   return -ENFILE;         /* Not exactly a match, but its the                                        closest posix thing */         }          sock->type = type; #ifdef CONFIG_MODULES         /*Attempt to load a protocol module if the find failed.          *          * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user          * requested real, full-featured networking support upon configuration.          * Otherwise module support will break!          */         if(rcu_access_pointer(net_families[family]) == NULL)                   request_module("net-pf-%d", family); #endif          rcu_read_lock();         pf = rcu_dereference(net_families[family]);         err = -EAFNOSUPPORT;         if(!pf)                   goto out_release;          /*          * We will call the ->create function, that possibly is in a loadable          * module, so we have to bump that loadable module refcnt first.          */         if(!try_module_get(pf->owner))                   goto out_release;          /*Now protected by module ref count */         rcu_read_unlock();/*static const struct net_proto_family inet_family_ops = {         .family = PF_INET,         .create = inet_create,         .owner     = THIS_MODULE,};这里根据注册的family类型,调用不同的create函数,这里就是调用inet_create*/         err = pf->create(net, sock, protocol, kern);         if(err < 0)                   goto out_module_put;          /*          * Now to bump the refcnt of the [loadable] module that owns this          * socket at sock_release time we decrement its refcnt.          
*/         if(!try_module_get(sock->ops->owner))                   gotoout_module_busy;          /*          * Now that we're done with the ->createfunction, the [loadable]          * module can have its refcnt decremented          */         module_put(pf->owner);         err= security_socket_post_create(sock, family, type, protocol, kern);         if(err)                   gotoout_sock_release;         *res= sock;          return0;out_module_busy:         err= -EAFNOSUPPORT;out_module_put:         sock->ops= NULL;         module_put(pf->owner);out_sock_release:         sock_release(sock);         returnerr; out_release:         rcu_read_unlock();         gotoout_sock_release;}1.1.1   sock_alloc函数sock_alloc函数用于分配一个socket结构体,这这里涉及了inode结构以及在分配完成后返回的地址指针。static struct socket *sock_alloc(void){         structinode *inode;         structsocket *sock;   /*下面的new_inode_pseudo函数是分配一个新的inode结构体,但在实际分配过程中,分配了一个socket_alloc结构体,返回d的是inode地址,struct socket_alloc {         struct socket socket;         struct inode vfs_inode;};*/         inode= new_inode_pseudo(sock_mnt->mnt_sb);//sock_mnt哪里进行初始的,请看下面的分析-----(1)         if(!inode)                   returnNULL;         sock= SOCKET_I(inode);//该宏根据返回的inode获取到分配的socket_alloc指针         kmemcheck_annotate_bitfield(sock,type);   /*下面是对inode变量进行初始化操作,*/         inode->i_ino= get_next_ino();         inode->i_mode= S_IFSOCK | S_IRWXUGO;         inode->i_uid= current_fsuid();//用户ID,在后面调用bind系统调用时会进行对比         inode->i_gid= current_fsgid();//组ID         inode->i_op= &sockfs_inode_ops;          this_cpu_add(sockets_in_use,1);         returnsock;}(1)对于sock_mnt->mnt_sb的赋值和分配过程如下:在sock_init函数中对socket类型的文件系统进行注册static struct file_system_type sock_fs_type = {         .name=             "sockfs",         .mount=  sockfs_mount,         .kill_sb=  kill_anon_super,};static int __init sock_init(void){         interr;         /*          *     Initialize the network sysctl infrastructure.          
*/         err= net_sysctl_init();         if(err)                   gotoout;          /*          *     Initialize skbuff SLAB cache          */         skb_init();          /*          *     Initialize the protocols module.          */          init_inodecache();  /*下面的函数进行文件系统的注册*/         err = register_filesystem(&sock_fs_type);         if(err)                   gotoout_fs;/*下面的函数挂载文件系统*/         sock_mnt = kern_mount(&sock_fs_type);         if(IS_ERR(sock_mnt)) {                   err= PTR_ERR(sock_mnt);                   gotoout_mount;         }          /*The real protocol initialization is performed in later initcalls.          */ #ifdef CONFIG_NETFILTER         err= netfilter_init();         if(err)                   gotoout;#endif #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING         skb_timestamping_init();#endif out:         returnerr; out_mount:         unregister_filesystem(&sock_fs_type);out_fs:         gotoout;} (2)new_inode_pseudo函数创建一个inode,并初始化inode的i_state变量和inode->i_sb_list链表,实际的分配函数为alloc_inode函数struct inode *new_inode_pseudo(structsuper_block *sb){         struct inode *inode = alloc_inode(sb);          if(inode) {                   spin_lock(&inode->i_lock);                   inode->i_state = 0;                   spin_unlock(&inode->i_lock);                   INIT_LIST_HEAD(&inode->i_sb_list);         }         returninode;}         alloc_inode分配一个inode节点,static struct inode *alloc_inode(structsuper_block *sb){         structinode *inode;          if(sb->s_op->alloc_inode)/*如果当前文件系统的超级块,有自己的分配inode的函数,则调用它自己的分配函数,否则从公用的高速缓存中分配一个inode.对于sokcet来说,在socket.c文件中,调用的函数为sock_alloc_inodestatic const structsuper_operations sockfs_ops = {         .alloc_inode     =sock_alloc_inode,         .destroy_inode         =sock_destroy_inode,         .statfs                =simple_statfs,};*/                   inode = sb->s_op->alloc_inode(sb);         else                   inode= kmem_cache_alloc(inode_cachep, GFP_KERNEL);          if(!inode)                   
returnNULL;/*对inode结构进行初始化*/         if(unlikely(inode_init_always(sb, inode))) {                   if(inode->i_sb->s_op->destroy_inode)                            inode->i_sb->s_op->destroy_inode(inode);                   else                            kmem_cache_free(inode_cachep,inode);                   returnNULL;         }          returninode;}         (3) 下面是sock_alloc_inode函数,在socket.c文件中static struct inode*sock_alloc_inode(struct super_block *sb){         struct socket_alloc *ei;         structsocket_wq *wq;   /*下面的函数分配struct socket_alloc结构体,这里是怎么分配的呢?参考下面的说明 */         ei= kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);         if(!ei)                   returnNULL;         wq= kmalloc(sizeof(*wq), GFP_KERNEL);         if(!wq) {                   kmem_cache_free(sock_inode_cachep,ei);                   returnNULL;         }         init_waitqueue_head(&wq->wait);         wq->fasync_list= NULL;         RCU_INIT_POINTER(ei->socket.wq,wq);          ei->socket.state= SS_UNCONNECTED;         ei->socket.flags= 0;         ei->socket.ops= NULL;         ei->socket.sk= NULL;         ei->socket.file= NULL;          return&ei->vfs_inode; //这里返回的是struct inode vfs_inode;}  备注说明:在分配函数sock_alloc_inode中调用了ei =kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);这里分配的大小为socket_alloc大小,下面分析一下是如果分配该大小的?init_inodecache函数中(net/socket.c),对其进行了高速缓存的分配操作,定义在socket.c文件中,这里分配的大小为socket_alloc,但返回是的socket_alloc结构体中的struct inode vfs_inode;变量。该函数在sock_init函数中被调用static int init_inodecache(void){         sock_inode_cachep =kmem_cache_create("sock_inode_cache",                                                     sizeof(struct socket_alloc),                                                     0,                                                     (SLAB_HWCACHE_ALIGN |                                                      SLAB_RECLAIM_ACCOUNT |                                                      SLAB_MEM_SPREAD),                                                     init_once);         
if(sock_inode_cachep == NULL)                   return -ENOMEM;         return 0;} 1.1.2   inet_create函数         在__sock_create函数中调用pf->create,对于PF_INET协议簇,这里的函数指针为inet_create,定义在文件net/ipv4/af_inet.c中: static int inet_create(struct net *net, struct socket *sock, int protocol,                          int kern){         struct sock *sk;         struct inet_protosw *answer;         struct inet_sock *inet;         struct proto *answer_prot;         unsigned char answer_flags;         char answer_no_check;         int try_loading_module = 0;         int err;          sock->state = SS_UNCONNECTED;          /*Look for the requested type/protocol pair. */lookup_protocol:         err = -ESOCKTNOSUPPORT;         rcu_read_lock();/*  从inetsw中根据类型、协议查找相应的socket interface也就是 inet_protosw */         list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {                    err = 0;                   /*Check the non-wild match. */                   if(protocol == answer->protocol) {                            if(protocol != IPPROTO_IP)                                     break;                   }else {                            /*Check for the two wild cases. 
*/                            if(IPPROTO_IP == protocol) {                                     protocol = answer->protocol;                                     break;                            }                            if(IPPROTO_IP == answer->protocol)                                     break;                   }                   err = -EPROTONOSUPPORT;         }/*如果没有找到,尝试加载模块*/         if(unlikely(err)) {                   if(try_loading_module < 2) {                            rcu_read_unlock();                            /*                             * Be more specific, e.g. net-pf-2-proto-132-type-1                             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)                             */                            if (++try_loading_module == 1)                                     request_module("net-pf-%d-proto-%d-type-%d",                                                      PF_INET, protocol, sock->type);                            /*                             * Fall back to generic, e.g. net-pf-2-proto-132                             * (net-pf-PF_INET-proto-IPPROTO_SCTP)                             */                            else                                     request_module("net-pf-%d-proto-%d",                                                      PF_INET, protocol);                            goto lookup_protocol;                   }else                            goto out_rcu_unlock;         }          err = -EPERM;         if(sock->type == SOCK_RAW && !kern &&             !ns_capable(net->user_ns, CAP_NET_RAW))                   goto out_rcu_unlock;          sock->ops = answer->ops;         answer_prot = answer->prot;         answer_no_check = answer->no_check;         answer_flags = answer->flags;         rcu_read_unlock();          WARN_ON(answer_prot->slab == NULL);/* sk_alloc表面上是生成一个sock的结构体,但是实际上对于tcp来说是一个tcp_sock的大小的结构体,这样就可以使用inet_sk(sk);进行强制的类型转换,具体是怎么分配出tcp_sock大小的,在后续进行分析*/         err = -ENOBUFS;         sk = sk_alloc(net, 
PF_INET, GFP_KERNEL, answer_prot);         if(sk == NULL)                   gotoout;          err= 0;         sk->sk_no_check= answer_no_check;         if(INET_PROTOSW_REUSE & answer_flags)                   sk->sk_reuse= SK_CAN_REUSE;          inet= inet_sk(sk);         inet->is_icsk= (INET_PROTOSW_ICSK & answer_flags) != 0;          inet->nodefrag= 0;          if(SOCK_RAW == sock->type) {                   inet->inet_num= protocol;                   if(IPPROTO_RAW == protocol)                            inet->hdrincl= 1;         }          if(net->ipv4.sysctl_ip_no_pmtu_disc)                   inet->pmtudisc= IP_PMTUDISC_DONT;         else                   inet->pmtudisc= IP_PMTUDISC_WANT;          inet->inet_id= 0;    /*对sk结构体中的变量进行初始化操作,*/         sock_init_data(sock, sk);------------------(1)          sk->sk_destruct         = inet_sock_destruct;         sk->sk_protocol          = protocol;         sk->sk_backlog_rcv= sk->sk_prot->backlog_rcv;          inet->uc_ttl      = -1;         inet->mc_loop = 1;         inet->mc_ttl     = 1;         inet->mc_all     = 1;         inet->mc_index        = 0;         inet->mc_list   = NULL;         inet->rcv_tos   = 0;          sk_refcnt_debug_inc(sk);          if(inet->inet_num) {                   /*It assumes that any protocol which allows                    * the user to assign a number at socket                    * creation time automatically                    * shares.                    */                   inet->inet_sport= htons(inet->inet_num);                   /*Add to protocol hash chains. 
*/                   sk->sk_prot->hash(sk);         }          if(sk->sk_prot->init) {                   err = sk->sk_prot->init(sk);//如果是tcp的话,这里就是tcp_v4_init_sock--------(2)                   if(err)                            sk_common_release(sk);         }out:         return err;out_rcu_unlock:         rcu_read_unlock();         goto out;} (1)sock_init_data函数分析 void sock_init_data(struct socket *sock, struct sock *sk){         skb_queue_head_init(&sk->sk_receive_queue);         skb_queue_head_init(&sk->sk_write_queue);         skb_queue_head_init(&sk->sk_error_queue); #ifdef CONFIG_NET_DMA         skb_queue_head_init(&sk->sk_async_wait_queue); #endif          sk->sk_send_head   =       NULL;   /*初始化sk定时器*/         init_timer(&sk->sk_timer);          sk->sk_allocation     =       GFP_KERNEL;         sk->sk_rcvbuf            =       sysctl_rmem_default;         sk->sk_sndbuf           =       sysctl_wmem_default;         sk->sk_state             =       TCP_CLOSE; //初始化sk_state=TCP_CLOSE状态,在后面的系统调用中会进行判断         sk_set_socket(sk, sock);// sk->sk_socket = sock; 设置sk中指向socket的指针          sock_set_flag(sk, SOCK_ZAPPED);//设置socket的SOCK_ZAPPED标志位。注意:从AX.25、LLC等协议的bind实现来看,该标志置位表示socket尚未绑定地址(bind成功后才会被清除),而不是"已经绑定了一个名字";该标志位的确切含义笔者尚未完全搞明白,有待进一步确认         
if(sock) {                   sk->sk_type      =       sock->type;                   sk->sk_wq        =       sock->wq;                   sock->sk   =       sk; // struct socket *sock 的sk指向sock         }else                   sk->sk_wq        =       NULL;          spin_lock_init(&sk->sk_dst_lock);         rwlock_init(&sk->sk_callback_lock);         lockdep_set_class_and_name(&sk->sk_callback_lock,                            af_callback_keys+ sk->sk_family,                            af_family_clock_key_strings[sk->sk_family]);          sk->sk_state_change       =       sock_def_wakeup;         sk->sk_data_ready  =       sock_def_readable;         sk->sk_write_space         =       sock_def_write_space;         sk->sk_error_report         =       sock_def_error_report;         sk->sk_destruct                 =       sock_def_destruct;          sk->sk_frag.page     =       NULL;         sk->sk_frag.offset   =       0;         sk->sk_peek_off                =       -1;          sk->sk_peer_pid    =       NULL;         sk->sk_peer_cred    =       NULL;         sk->sk_write_pending     =       0;         sk->sk_rcvlowat                =       1;         sk->sk_rcvtimeo                =       MAX_SCHEDULE_TIMEOUT;         sk->sk_sndtimeo               =       MAX_SCHEDULE_TIMEOUT;          sk->sk_stamp= ktime_set(-1L, 0); #ifdef CONFIG_NET_RX_BUSY_POLL         sk->sk_napi_id                   =       0;         sk->sk_ll_usec          =       sysctl_net_busy_read;#endif          sk->sk_max_pacing_rate= ~0U;         sk->sk_pacing_rate= ~0U;         /*          * Before updating sk_refcnt, we must commitprior changes to memory          * (Documentation/RCU/rculist_nulls.txt fordetails)          */         smp_wmb();         atomic_set(&sk->sk_refcnt,1);//sk的引用计数加1         atomic_set(&sk->sk_drops,0);}(2)static inttcp_v4_init_sock(struct sock *sk){         struct inet_connection_sock *icsk =inet_csk(sk);//基于上面的原因分析,其实这里可以进行强制的类型转换         tcp_init_sock(sk);//进行tcp相关变量的初始化工作 
        icsk->icsk_af_ops = &ipv4_specific; #ifdef CONFIG_TCP_MD5SIG         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; #endif          return 0;}
10-15 14:36
查看更多