Author: Tony
今天一个可爱的同事问我同一个socket能否bind多个port?说实话,真没这么玩过。本着严谨的态度,决定研究一下这样是否可以。
说明:本文分析所针对的内核版本是 SLES 11 SP2(3.0.13-0.27)。
首先分析系统调用函数bind,内核层实现在net/socket.c中:
点击(此处)折叠或打开
- SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
- {
- struct socket *sock;
- struct sockaddr_storage address;
- int err, fput_needed;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (sock) {
- err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
- if (err >= 0) {
- err = security_socket_bind(sock,
- (struct sockaddr *)&address,
- addrlen);
- if (!err)
- /* Dispatch to the bind handler of this socket's proto_ops; TCP is
- used as the worked example in this walkthrough. proto_ops is the
- hand-off from the protocol-independent socket layer to the
- protocol-specific transport layer.
- */
- err = sock->ops->bind(sock,
- (struct sockaddr *)
- &address, addrlen);
- }
- fput_light(sock->file, fput_needed);
- }
- return err;
- }
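对于TCP套接字,sock->ops是在创建socket时根据inetsw_array(定义在net/ipv4/af_inet.c中)选定的;下面先看inetsw_array,再看其中TCP对应的inet_stream_ops,最后看inet_bind的实现:
点击(此处)折叠或打开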
- static struct inet_protosw inetsw_array[] =
- {
- {
- .type = SOCK_STREAM,
- .protocol = IPPROTO_TCP,
- .prot = &tcp_prot,
- /* This entry selects the proto_ops used for the TCP transport protocol. */
- .ops = &inet_stream_ops,
- .no_check = 0,
- .flags = INET_PROTOSW_PERMANENT |
- INET_PROTOSW_ICSK,
- },
- {
- .type = SOCK_DGRAM,
- .protocol = IPPROTO_UDP,
- .prot = &udp_prot,
- .ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
- .flags = INET_PROTOSW_PERMANENT,
- },
- {
- .type = SOCK_DGRAM,
- .protocol = IPPROTO_ICMP,
- .prot = &ping_prot,
- .ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
- .flags = INET_PROTOSW_REUSE,
- },
- {
- .type = SOCK_RAW,
- .protocol = IPPROTO_IP, /* wild card */
- .prot = &raw_prot,
- .ops = &inet_sockraw_ops,
- .no_check = UDP_CSUM_DEFAULT,
- .flags = INET_PROTOSW_REUSE,
- }
- };
点击(此处)折叠或打开
- const struct proto_ops inet_stream_ops = {
- .family = PF_INET,
- .owner = THIS_MODULE,
- .release = inet_release,
- /* The bind handler in TCP's proto_ops is set to inet_bind. */
- .bind = inet_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet_getname,
- .poll = tcp_poll,
- .ioctl = inet_ioctl,
- .listen = inet_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = inet_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
- .splice_read = tcp_splice_read,
- #ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
- .compat_ioctl = inet_compat_ioctl,
- #endif
- };
点击(此处)折叠或打开
- int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
- {
- struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
- struct sock *sk = sock->sk;
- struct inet_sock *inet = inet_sk(sk);
- unsigned short snum;
- int chk_addr_ret;
- int err;
- /* If the socket has its own bind function then use it. (RAW) */
- if (sk->sk_prot->bind) {
- err = sk->sk_prot->bind(sk, uaddr, addr_len);
- goto out;
- }
- err = -EINVAL;
- if (addr_len < sizeof(struct sockaddr_in))
- goto out;
- if (addr->sin_family != AF_INET) {
- /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
- * only if s_addr is INADDR_ANY.
- */
- err = -EAFNOSUPPORT;
- if (addr->sin_family != AF_UNSPEC ||
- addr->sin_addr.s_addr != htonl(INADDR_ANY))
- goto out;
- }
- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
- /* Not specified by any standard per-se, however it breaks too
- * many applications when removed. It is unfortunate since
- * allowing applications to make a non-local bind solves
- * several problems with systems using dynamic addressing.
- * (ie. your servers still start up even if your ISDN link
- * is temporarily down)
- */
- err = -EADDRNOTAVAIL;
- if (!sysctl_ip_nonlocal_bind &&
- !(inet->freebind || inet->transparent) &&
- addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
- chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST &&
- chk_addr_ret != RTN_BROADCAST)
- goto out;
- snum = ntohs(addr->sin_port);
- err = -EACCES;
- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
- goto out;
- /* We keep a pair of addresses. rcv_saddr is the one
- * used by hash lookups, and saddr is used for transmit.
- *
- * In the BSD API these are the same except where it
- * would be illegal to use them (multicast/broadcast) in
- * which case the sending device address is used.
- */
- lock_sock(sk);
- /* Check these errors (active socket, double bind). */
- err = -EINVAL;
- /* This checks that the sock being bound is in the TCP_CLOSE state
- (i.e. unused) and that it has not already been bound to a port.
- On the first bind of a sock inet->inet_num is 0; once a bind has
- succeeded inet_num is non-zero. Hence a socket can be bound to at
- most one port, and a further bind attempt fails with error 22
- (EINVAL).
- Next, let us see where inet_num gets assigned.
- */
- if (sk->sk_state != TCP_CLOSE || inet->inet_num)
- goto out_release_sock;
- inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
- if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
- inet->inet_saddr = 0; /* Use device */
- /* Make sure we are allowed to bind here. */
- /* get_port picks a usable port to bind based on the arguments passed
- in. sk->sk_prot holds the operations of the sock (the network
- control block); struct proto implements the transition from the
- transport layer to the network layer.
- For a TCP sock, get_port is inet_csk_get_port.
- tcp_prot (a struct proto) is defined in net/ipv4/tcp_ipv4.c.
- */
- if (sk->sk_prot->get_port(sk, snum)) {
- inet->inet_saddr = inet->inet_rcv_saddr = 0;
- err = -EADDRINUSE;
- goto out_release_sock;
- }
- if (inet->inet_rcv_saddr)
- sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
- if (snum)
- sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
- inet->inet_sport = htons(inet->inet_num);
- inet->inet_daddr = 0;
- inet->inet_dport = 0;
- sk_dst_reset(sk);
- err = 0;
- out_release_sock:
- release_sock(sk);
- out:
- return err;
- }
点击(此处)折叠或打开
- /* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- */
- int inet_csk_get_port(struct sock *sk, unsigned short snum)
- {
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct inet_bind_hashbucket *head;
- struct hlist_node *node;
- struct inet_bind_bucket *tb;
- int ret, attempts = 5;
- struct net *net = sock_net(sk);
- int smallest_size = -1, smallest_rover;
- local_bh_disable();
- /* As can be seen here, if the port passed in is 0 the kernel picks an available port automatically. */
- if (!snum) {
- int remaining, rover, low, high;
- again:
- inet_get_local_port_range(&low, &high);
- remaining = (high - low) + 1;
- smallest_rover = rover = net_random() % remaining + low;
- smallest_size = -1;
- do {
- if (inet_is_reserved_local_port(rover))
- goto next_nolock;
- head = &hashinfo->bhash[inet_bhashfn(net, rover,
- hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == rover) {
- if (tb->fastreuse > 0 &&
- sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN &&
- (tb->num_owners < smallest_size || smallest_size == -1)) {
- smallest_size = tb->num_owners;
- smallest_rover = rover;
- if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
- spin_unlock(&head->lock);
- snum = smallest_rover;
- goto have_snum;
- }
- }
- goto next;
- }
- break;
- next:
- spin_unlock(&head->lock);
- next_nolock:
- if (++rover > high)
- rover = low;
- } while (--remaining > 0);
- /* Exhausted local port range during search? It is not
- * possible for us to be holding one of the bind hash
- * locks if this test triggers, because if 'remaining'
- * drops to zero, we broke out of the do/while loop at
- * the top level, not from the 'break;' statement.
- */
- ret = 1;
- if (remaining <= 0) {
- if (smallest_size != -1) {
- snum = smallest_rover;
- goto have_snum;
- }
- goto fail;
- }
- /* OK, here is the one we will use. HEAD is
- * non-NULL and we hold its lock (head->lock, taken above).
- */
- snum = rover;
- } else {
- have_snum:
- head = &hashinfo->bhash[inet_bhashfn(net, snum,
- hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == snum)
- goto tb_found;
- }
- tb = NULL;
- goto tb_not_found;
- tb_found:
- if (!hlist_empty(&tb->owners)) {
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
- smallest_size == -1) {
- goto success;
- } else {
- ret = 1;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
- smallest_size != -1 && --attempts >= 0) {
- spin_unlock(&head->lock);
- goto again;
- }
- goto fail_unlock;
- }
- }
- }
- tb_not_found:
- ret = 1;
- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
- net, head, snum)) == NULL)
- goto fail_unlock;
- if (hlist_empty(&tb->owners)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
- tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
- success:
- if (!inet_csk(sk)->icsk_bind_hash)
- /* Reaching this point means the port is usable; the sk now has to be
- linked onto that port's owners list. Let us look at how
- inet_bind_hash is implemented.
- */
- inet_bind_hash(sk, tb, snum);
- WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
- ret = 0;
- fail_unlock:
- spin_unlock(&head->lock);
- fail:
- local_bh_enable();
- return ret;
- }
点击(此处)折叠或打开
- void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
- const unsigned short snum)
- {
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- atomic_inc(&hashinfo->bsockets);
- /*
- This is where the sock's inet_num is set to the chosen port; it is
- the field that inet_bind tests to reject a second bind.
- */
- inet_sk(sk)->inet_num = snum;
- sk_add_bind_node(sk, &tb->owners);
- tb->num_owners++;
- inet_csk(sk)->icsk_bind_hash = tb;
- }
综上分析,可以看出同一个socket最多只能bind一个端口;如果对已经bind过的socket再次调用bind,会返回错误22(EINVAL)。