前言: 对于Linux内核的Socket系列文章都是依据Linux-3.14.5版本的内核进行分析,文中的注释和问题的说明也参考了网络上的经典分析文章,对他们的奉献表示感谢!
转载请标明: http://blog.chinaunix.net/uid-20788636-id-4408261.html

1. Socket内核调用函数SYSCALL_DEFINE3

Socket的创建是在用户空间调用socket系统函数完成的,创建一个Socket返回一个文件描述符fd,内核的系统调用接口为SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol),在net/socket.c文件中,下面我们看一下内核中的源码实现。

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    int retval;
    struct socket *sock;
    int flags;

    /* Check the SOCK_* constants for consistency. 下面这些都是进行各种的检查操作 */
    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    flags = type & ~SOCK_TYPE_MASK;
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    //调用创建socket的函数
    retval = sock_create(family, type, protocol, &sock); //------参考下面的分析
    if (retval < 0)
        goto out;

    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    if (retval < 0)
        goto out_release;

out:
    /* It may be already another descriptor 8) Not kernel problem. */
    return retval;

out_release:
    sock_release(sock);
    return retval;
}

1.1 sock_create函数

sock_create(family, type, protocol, &sock)是一个包装函数,它实际调用的是__sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);对于__sock_create函数的定义如下:

int __sock_create(struct net *net, int family, int type, int protocol,
                  struct socket **res, int kern)
{
    int err;
    struct socket *sock;
    const struct net_proto_family *pf;

    /*
     * Check protocol is in range
     * 检查协议的范围,现在内核定义的最大范围为41,这里的family指的是AF_INET6、AF_INET等协议簇
     *   #define NPROTO AF_MAX
     *   #define AF_MAX 41    // For now..
     */
    if (family < 0 || family >= NPROTO)
        return -EAFNOSUPPORT;
    if (type < 0 || type >= SOCK_MAX) //这里的type是socket的类型,例如SOCK_STREAM
        return -EINVAL;

    /* Compatibility.

       This uglymoron is moved from INET layer to here to avoid
       deadlock in module load.
     */
    if (family == PF_INET && type == SOCK_PACKET) { //如果是该类型的socket,对family进行重新的赋值
        static int warned; //这里自动初始化为0
        if (!warned) {
            warned = 1;
            printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                   current->comm);
        }
        family = PF_PACKET; //赋值为PF_PACKET
    }

    err = security_socket_create(family, type, protocol, kern);
    if (err)
        return err;

    /*
     * Allocate the socket and allow the family to set things up. if
     * the protocol is 0, the family is instructed to select an appropriate
     * default.
     * 这里调用sock_alloc分配sock,见下面的分析
     */
    sock = sock_alloc();
    if (!sock) {
        net_warn_ratelimited("socket: no more sockets\n");
        return -ENFILE; /* Not exactly a match, but its the
                           closest posix thing */
    }

    sock->type = type;

#ifdef CONFIG_MODULES
    /* Attempt to load a protocol module if the find failed.
     *
     * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
     * requested real, full-featured networking support upon configuration.
     * Otherwise module support will break!
     */
    if (rcu_access_pointer(net_families[family]) == NULL)
        request_module("net-pf-%d", family);
#endif

    rcu_read_lock();
    pf = rcu_dereference(net_families[family]);
    err = -EAFNOSUPPORT;
    if (!pf)
        goto out_release;

    /*
     * We will call the ->create function, that possibly is in a loadable
     * module, so we have to bump that loadable module refcnt first.
     */
    if (!try_module_get(pf->owner))
        goto out_release;

    /* Now protected by module ref count */
    rcu_read_unlock();

    /*
     * static const struct net_proto_family inet_family_ops = {
     *     .family = PF_INET,
     *     .create = inet_create,
     *     .owner  = THIS_MODULE,
     * };
     * 这里根据注册的family类型,调用不同的create函数,这里就是调用inet_create
     */
    err = pf->create(net, sock, protocol, kern);
    if (err < 0)
        goto out_module_put;

    /*
     * Now to bump the refcnt of the [loadable] module that owns this
     * socket at sock_release time we decrement its refcnt.
     */
    if (!try_module_get(sock->ops->owner))
        goto out_module_busy;

    /*
     * Now that we're done with the ->create function, the [loadable]
     * module can have its refcnt decremented
     */
    module_put(pf->owner);
    err = security_socket_post_create(sock, family, type, protocol, kern);
    if (err)
        goto out_sock_release;
    *res = sock;

    return 0;

out_module_busy:
    err = -EAFNOSUPPORT;
out_module_put:
    sock->ops = NULL;
    module_put(pf->owner);
out_sock_release:
    sock_release(sock);
    return err;

out_release:
    rcu_read_unlock();
    goto out_sock_release;
}

1.1.1 sock_alloc函数

sock_alloc函数用于分配一个socket结构体,这里涉及了inode结构以及在分配完成后返回的地址指针。

static struct socket *sock_alloc(void)
{
    struct inode *inode;
    struct socket *sock;

    /*
     * 下面的new_inode_pseudo函数是分配一个新的inode结构体,但在实际分配过程中,
     * 分配了一个socket_alloc结构体,返回的是inode地址,
     * struct socket_alloc {
     *     struct socket socket;
     *     struct inode vfs_inode;
     * };
     */
    inode = new_inode_pseudo(sock_mnt->mnt_sb); //sock_mnt在哪里进行初始化的,请看下面的分析-----(1)
    if (!inode)
        return NULL;

    sock = SOCKET_I(inode); //该宏根据返回的inode获取到分配的socket_alloc指针

    kmemcheck_annotate_bitfield(sock, type);
    /* 下面是对inode变量进行初始化操作 */
    inode->i_ino = get_next_ino();
    inode->i_mode = S_IFSOCK | S_IRWXUGO;
    inode->i_uid = current_fsuid(); //用户ID,在后面调用bind系统调用时会进行对比
    inode->i_gid = current_fsgid(); //组ID
    inode->i_op = &sockfs_inode_ops;

    this_cpu_add(sockets_in_use, 1);
    return sock;
}

(1) 对于sock_mnt->mnt_sb的赋值和分配过程如下: 在sock_init函数中对socket类型的文件系统进行注册

static struct file_system_type sock_fs_type = {
    .name    = "sockfs",
    .mount   = sockfs_mount,
    .kill_sb = kill_anon_super,
};

static int __init sock_init(void)
{
    int err;

    /*
     * Initialize the network sysctl infrastructure.
     */
    err = net_sysctl_init();
    if (err)
        goto out;

    /*
     * Initialize skbuff SLAB cache
     */
    skb_init();

    /*
     * Initialize the protocols module.
     */
    init_inodecache();

    /* 下面的函数进行文件系统的注册 */
    err = register_filesystem(&sock_fs_type);
    if (err)
        goto out_fs;
    /* 下面的函数挂载文件系统 */
    sock_mnt = kern_mount(&sock_fs_type);
    if (IS_ERR(sock_mnt)) {
        err = PTR_ERR(sock_mnt);
        goto out_mount;
    }

    /* The real protocol initialization is performed in later initcalls.
     */

#ifdef CONFIG_NETFILTER
    err = netfilter_init();
    if (err)
        goto out;
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
    skb_timestamping_init();
#endif

out:
    return err;

out_mount:
    unregister_filesystem(&sock_fs_type);
out_fs:
    goto out;
}

(2) new_inode_pseudo函数创建一个inode,并初始化inode的i_state变量和inode->i_sb_list链表,实际的分配函数为alloc_inode函数

struct inode *new_inode_pseudo(struct super_block *sb)
{
    struct inode *inode = alloc_inode(sb);

    if (inode) {
        spin_lock(&inode->i_lock);
        inode->i_state = 0;
        spin_unlock(&inode->i_lock);
        INIT_LIST_HEAD(&inode->i_sb_list);
    }
    return inode;
}

alloc_inode分配一个inode节点:

static struct inode *alloc_inode(struct super_block *sb)
{
    struct inode *inode;

    if (sb->s_op->alloc_inode)
        /*
         * 如果当前文件系统的超级块有自己的分配inode的函数,则调用它自己的分配函数,
         * 否则从公用的高速缓存中分配一个inode。
         * 对于socket来说,在socket.c文件中,调用的函数为sock_alloc_inode
         * static const struct super_operations sockfs_ops = {
         *     .alloc_inode   = sock_alloc_inode,
         *     .destroy_inode = sock_destroy_inode,
         *     .statfs        = simple_statfs,
         * };
         */
        inode = sb->s_op->alloc_inode(sb);
    else
        inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

    if (!inode)
        return NULL;

    /* 对inode结构进行初始化 */
    if (unlikely(inode_init_always(sb, inode))) {
        if (inode->i_sb->s_op->destroy_inode)
            inode->i_sb->s_op->destroy_inode(inode);
        else
            kmem_cache_free(inode_cachep, inode);
        return NULL;
    }

    return inode;
}

(3) 下面是sock_alloc_inode函数,在socket.c文件中

static struct inode *sock_alloc_inode(struct super_block *sb)
{
    struct socket_alloc *ei;
    struct socket_wq *wq;

    /* 下面的函数分配struct socket_alloc结构体,这里是怎么分配的呢?参考下面的说明 */
    ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
    if (!ei)
        return NULL;
    wq = kmalloc(sizeof(*wq), GFP_KERNEL);
    if (!wq) {
        kmem_cache_free(sock_inode_cachep, ei);
        return NULL;
    }
    init_waitqueue_head(&wq->wait);
    wq->fasync_list = NULL;
    RCU_INIT_POINTER(ei->socket.wq, wq);

    ei->socket.state = SS_UNCONNECTED;
    ei->socket.flags = 0;
    ei->socket.ops = NULL;
    ei->socket.sk = NULL;
    ei->socket.file = NULL;

    return &ei->vfs_inode; //这里返回的是struct inode vfs_inode;
}

备注说明: 在分配函数sock_alloc_inode中调用了ei = kmem_cache_alloc(sock_inode_cachep,
GFP_KERNEL);这里分配的大小为socket_alloc大小,下面分析一下是如何分配该大小的。在init_inodecache函数中(net/socket.c),对其进行了高速缓存的分配操作,定义在socket.c文件中,这里分配的大小为socket_alloc,但返回的是socket_alloc结构体中的struct inode vfs_inode;变量。该函数在sock_init函数中被调用

static int init_inodecache(void)
{
    sock_inode_cachep = kmem_cache_create("sock_inode_cache",
                                          sizeof(struct socket_alloc),
                                          0,
                                          (SLAB_HWCACHE_ALIGN |
                                           SLAB_RECLAIM_ACCOUNT |
                                           SLAB_MEM_SPREAD),
                                          init_once);
    if (sock_inode_cachep == NULL)
        return -ENOMEM;
    return 0;
}

1.1.2 inet_create函数

在__sock_create函数中调用的pf->create,这里的函数指针为inet_create。在文件net/ipv4/af_inet.c中

static int inet_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
    struct sock *sk;
    struct inet_protosw *answer;
    struct inet_sock *inet;
    struct proto *answer_prot;
    unsigned char answer_flags;
    char answer_no_check;
    int try_loading_module = 0;
    int err;

    sock->state = SS_UNCONNECTED;

    /* Look for the requested type/protocol pair. */
lookup_protocol:
    err = -ESOCKTNOSUPPORT;
    rcu_read_lock();
    /* 从inetsw中根据类型、协议查找相应的socket interface,也就是inet_protosw */
    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

        err = 0;
        /* Check the non-wild match. */
        if (protocol == answer->protocol) {
            if (protocol != IPPROTO_IP)
                break;
        } else {
            /* Check for the two wild cases.
             */
            if (IPPROTO_IP == protocol) {
                protocol = answer->protocol;
                break;
            }
            if (IPPROTO_IP == answer->protocol)
                break;
        }
        err = -EPROTONOSUPPORT;
    }

    /* 如果没有找到,尝试加载模块 */
    if (unlikely(err)) {
        if (try_loading_module < 2) {
            rcu_read_unlock();
            /*
             * Be more specific, e.g. net-pf-2-proto-132-type-1
             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
             */
            if (++try_loading_module == 1)
                request_module("net-pf-%d-proto-%d-type-%d",
                               PF_INET, protocol, sock->type);
            /*
             * Fall back to generic, e.g. net-pf-2-proto-132
             * (net-pf-PF_INET-proto-IPPROTO_SCTP)
             */
            else
                request_module("net-pf-%d-proto-%d",
                               PF_INET, protocol);
            goto lookup_protocol;
        } else
            goto out_rcu_unlock;
    }

    err = -EPERM;
    if (sock->type == SOCK_RAW && !kern &&
        !ns_capable(net->user_ns, CAP_NET_RAW))
        goto out_rcu_unlock;

    sock->ops = answer->ops;
    answer_prot = answer->prot;
    answer_no_check = answer->no_check;
    answer_flags = answer->flags;
    rcu_read_unlock();

    WARN_ON(answer_prot->slab == NULL);

    /*
     * sk_alloc表面上是生成一个sock的结构体,但是实际上对于tcp来说是一个tcp_sock大小的结构体,
     * 这样就可以使用inet_sk(sk);进行强制的类型转换。具体是怎么分配成tcp_sock大小的,在后续进行分析
     */
    err = -ENOBUFS;
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
    if (sk == NULL)
        goto out;

    err = 0;
    sk->sk_no_check = answer_no_check;
    if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = SK_CAN_REUSE;

    inet = inet_sk(sk);
    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

    inet->nodefrag = 0;

    if (SOCK_RAW == sock->type) {
        inet->inet_num = protocol;
        if (IPPROTO_RAW == protocol)
            inet->hdrincl = 1;
    }

    if (net->ipv4.sysctl_ip_no_pmtu_disc)
        inet->pmtudisc = IP_PMTUDISC_DONT;
    else
        inet->pmtudisc = IP_PMTUDISC_WANT;

    inet->inet_id = 0;

    /* 对sk结构体中的变量进行初始化操作 */
    sock_init_data(sock, sk); //------------------(1)

    sk->sk_destruct    = inet_sock_destruct;
    sk->sk_protocol    = protocol;
    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

    inet->uc_ttl   = -1;
    inet->mc_loop  = 1;
    inet->mc_ttl   = 1;
    inet->mc_all   = 1;
    inet->mc_index = 0;
    inet->mc_list  = NULL;
    inet->rcv_tos  = 0;

    sk_refcnt_debug_inc(sk);

    if (inet->inet_num) {
        /* It assumes that any protocol which allows
         * the user to assign a number at
         * socket creation time automatically
         * shares.
         */
        inet->inet_sport = htons(inet->inet_num);
        /* Add to protocol hash chains. */
        sk->sk_prot->hash(sk);
    }

    if (sk->sk_prot->init) {
        err = sk->sk_prot->init(sk); //如果是tcp的话,这里就是tcp_v4_init_sock--------(2)
        if (err)
            sk_common_release(sk);
    }
out:
    return err;
out_rcu_unlock:
    rcu_read_unlock();
    goto out;
}

(1) sock_init_data函数分析

void sock_init_data(struct socket *sock, struct sock *sk)
{
    skb_queue_head_init(&sk->sk_receive_queue);
    skb_queue_head_init(&sk->sk_write_queue);
    skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
    skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

    sk->sk_send_head = NULL;

    /* 初始化sk定时器 */
    init_timer(&sk->sk_timer);

    sk->sk_allocation = GFP_KERNEL;
    sk->sk_rcvbuf     = sysctl_rmem_default;
    sk->sk_sndbuf     = sysctl_wmem_default;
    sk->sk_state      = TCP_CLOSE; //初始化sk_state=TCP_CLOSE状态,在后面的系统调用中会进行判断
    sk_set_socket(sk, sock); // sk->sk_socket = sock; 设置sk中指向socket的指针

    sock_set_flag(sk, SOCK_ZAPPED); //设置socket的flag位,表明该socket已经绑定了一个名字,该标志位没有搞明白什么意思?

    if (sock) {
        sk->sk_type = sock->type;
        sk->sk_wq   = sock->wq;
        sock->sk    = sk; // struct socket *sock 的sk指向sock
    } else
        sk->sk_wq   = NULL;

    spin_lock_init(&sk->sk_dst_lock);
    rwlock_init(&sk->sk_callback_lock);
    lockdep_set_class_and_name(&sk->sk_callback_lock,
                               af_callback_keys + sk->sk_family,
                               af_family_clock_key_strings[sk->sk_family]);

    sk->sk_state_change = sock_def_wakeup;
    sk->sk_data_ready   = sock_def_readable;
    sk->sk_write_space  = sock_def_write_space;
    sk->sk_error_report = sock_def_error_report;
    sk->sk_destruct     = sock_def_destruct;

    sk->sk_frag.page   = NULL;
    sk->sk_frag.offset = 0;
    sk->sk_peek_off    = -1;

    sk->sk_peer_pid      = NULL;
    sk->sk_peer_cred     = NULL;
    sk->sk_write_pending = 0;
    sk->sk_rcvlowat      = 1;
    sk->sk_rcvtimeo      = MAX_SCHEDULE_TIMEOUT;
    sk->sk_sndtimeo      = MAX_SCHEDULE_TIMEOUT;

    sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
    sk->sk_napi_id = 0;
    sk->sk_ll_usec = sysctl_net_busy_read;
#endif

    sk->sk_max_pacing_rate = ~0U;
    sk->sk_pacing_rate = ~0U;
    /*
     * Before updating sk_refcnt, we must commit prior changes to memory
     * (Documentation/RCU/rculist_nulls.txt for details)
     */
    smp_wmb();
    atomic_set(&sk->sk_refcnt, 1); //sk的引用计数初始化为1
    atomic_set(&sk->sk_drops, 0);
}

(2) tcp_v4_init_sock函数

static int tcp_v4_init_sock(struct sock *sk)
{
    struct inet_connection_sock *icsk = inet_csk(sk); //基于上面的原因分析,其实这里可以进行强制的类型转换

    tcp_init_sock(sk); //进行tcp相关变量的初始化工作

    icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
    tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

    return 0;
}