1. Introduction

    The previous post covered the socket creation flow. This post records what happens next on a Linux server: after the socket has been created successfully, bind() must be called to bind a local address and port to it.

Encyclopedia note:

     bind() associates a local address with a socket. It applies to unconnected datagram or stream sockets and is called before connect() or listen(). After socket() creates a socket, the socket exists in a name space (an address family) but has no name assigned to it. bind() establishes the local association (host address/port number) by assigning a local name to the unnamed socket.

2. Using bind() at the application layer

Header files:
#include <sys/types.h>
#include <sys/socket.h>

Prototype: int bind(int sockfd, const struct sockaddr *my_addr, socklen_t addrlen);
Description: bind() assigns a name to the socket referred to by sockfd. The name is given by my_addr, which points to a sockaddr structure. A generic structure is defined for all socket domains:
struct sockaddr
{
unsigned short int sa_family;
char sa_data[14];
};
sa_family is the domain argument passed to socket(), i.e. an AF_xxxx value.
sa_data holds at most 14 bytes of address data.
The concrete layout of this structure differs per socket domain; for the AF_INET domain, for example, it is struct sockaddr_in:
struct sockaddr_in
{
unsigned short int sin_family;
uint16_t sin_port;
struct in_addr sin_addr;
unsigned char sin_zero[8];
};
struct in_addr
{
uint32_t s_addr;
};
sin_family corresponds to sa_family.
sin_port is the port number to use.
sin_addr.s_addr is the IP address.
sin_zero is unused padding.

The addrlen parameter is the size of the sockaddr structure.

Return value: 0 on success; -1 on failure, with the cause stored in errno.

Error codes:
EBADF     sockfd is not a valid socket descriptor.
EACCES    insufficient permission.
ENOTSOCK  sockfd is a file descriptor that does not refer to a socket.
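
To make the calling convention concrete, here is a minimal userspace sketch (the port number 8080 and the minimal error handling are illustrative choices, not from the original post). It creates a TCP socket and binds it to all local interfaces, which is exactly the path that enters the kernel code analyzed in the next sections:

/* Minimal userspace sketch: create a TCP socket and bind it.
 * Port 8080 is an arbitrary example value. */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    memset(&addr, 0, sizeof(addr));           /* also clears sin_zero */
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8080);              /* port in network byte order */
    addr.sin_addr.s_addr = htonl(INADDR_ANY); /* any local address */

    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");                       /* cause is in errno */
        return 1;
    }
    printf("bound fd %d to port 8080\n", fd);
    return 0;
}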

3. Kernel source analysis of bind()

SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
struct socket *sock;
struct sockaddr_storage address;
int err, fput_needed;

sock = sockfd_lookup_light(fd, &err, &fput_needed); //look up the socket from the file descriptor
if (sock) {
err = move_addr_to_kernel(umyaddr, addrlen, &address); //copy the address from user space into kernel space
if (err >= 0) {
err = security_socket_bind(sock, //LSM security hook
(struct sockaddr *)&address,
addrlen);
if (!err)
//for TCP this resolves to inet_bind, the bind member of inet_stream_ops
err = sock->ops->bind(sock,
(struct sockaddr *)
&address, addrlen);
}
fput_light(sock->file, fput_needed);
}
return err;
}

       The bind syscall mainly copies the user-space arguments (address and port) into kernel space and then calls inet_bind() through the function pointer sock->ops->bind(...) to perform the actual address/port binding. The copy step, move_addr_to_kernel(), is sketched below; the source of inet_bind() is then analyzed in the next section.
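
For reference, move_addr_to_kernel() is a thin wrapper around copy_from_user(); in kernels of this vintage it looks roughly as follows (reproduced from memory as a sketch, so double-check against your tree):

/* Sketch of move_addr_to_kernel(): validate the user-supplied length,
 * then copy the raw sockaddr bytes into a kernel-side sockaddr_storage. */
int move_addr_to_kernel(void __user *uaddr, int ulen,
                        struct sockaddr_storage *kaddr)
{
    if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
        return -EINVAL;
    if (ulen == 0)
        return 0;
    if (copy_from_user(kaddr, uaddr, ulen))
        return -EFAULT;
    return audit_sockaddr(ulen, kaddr);
}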

4. inet_bind()

int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk; //sk was allocated dynamically during socket creation; see inet_create(...)
struct inet_sock *inet = inet_sk(sk); //inet_sock is the protocol-specific view of sk; see inet_create(...)
struct net *net = sock_net(sk); //sk->sk_net points to the network namespace, set when the socket was created; see sk_alloc()
unsigned short snum;
int chk_addr_ret;
int err;

/* If the socket has its own bind function then use it. (RAW) */
if (sk->sk_prot->bind) { //sk->sk_prot points to tcp_prot, whose bind member is NULL (raw sockets install their own)
err = sk->sk_prot->bind(sk, uaddr, addr_len);
goto out;
}
err = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
goto out;

//check the address family; anything other than AF_INET gets a closer look
if (addr->sin_family != AF_INET) {
/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
* only if s_addr is INADDR_ANY.
*/
err = -EAFNOSUPPORT;
//not AF_INET: bail out unless the family is AF_UNSPEC and the address is INADDR_ANY
if (addr->sin_family != AF_UNSPEC ||
addr->sin_addr.s_addr != htonl(INADDR_ANY))
goto out;
}

//important: this returns the type of the address [one of the RTN_* values]; see the flowchart
chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);

/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
* allowing applications to make a non-local bind solves
* several problems with systems using dynamic addressing.
* (ie. your servers still start up even if your ISDN link
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
if (!sysctl_ip_nonlocal_bind &&
!(inet->freebind || inet->transparent) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) && //a specific address was given
chk_addr_ret != RTN_LOCAL && //and it is not a local address
chk_addr_ret != RTN_MULTICAST && //not multicast
chk_addr_ret != RTN_BROADCAST) //and not broadcast
goto out;

snum = ntohs(addr->sin_port); //convert the port from network to host byte order (big- vs little-endian)
err = -EACCES;
if (snum && snum < PROT_SOCK && //PROT_SOCK = 1024; ports below 1024 are privileged and require CAP_NET_BIND_SERVICE
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;

/* We keep a pair of addresses. rcv_saddr is the one
* used by hash lookups, and saddr is used for transmit.
* In the BSD API these are the same except where it
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
lock_sock(sk);

/* Check these errors (active socket, double bind). */
err = -EINVAL;
if (sk->sk_state != TCP_CLOSE || inet->inet_num) //the socket must still be in TCP_CLOSE and must not already have a local port (inet_num != 0 would be a double bind)
goto out_release_sock;

//use the address being bound as both inet_rcv_saddr and inet_saddr
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) //multicast or broadcast address?
inet->inet_saddr = 0; /* Use device */

/* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) { //sk->sk_prot points to tcp_prot; get_port = inet_csk_get_port()
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}

if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum) //a specific port was bound
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->inet_sport = htons(inet->inet_num); //inet->inet_num holds the port, bound inside get_port() above
//clear the destination address and port
inet->inet_daddr = 0;
inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
release_sock(sk);
out:
return err;
}
EXPORT_SYMBOL(inet_bind);

inet_bind() mainly does two things:

a. Validate the address type

//important: this returns the type of the address [one of the RTN_* values]; see the flowchart
chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);

b. Bind the port

sk->sk_prot->get_port(sk, snum) //inet_csk_get_port()

These two functions are analyzed one by one below.
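
For reference, the address types that inet_addr_type() can return are the RTN_* route types defined in include/uapi/linux/rtnetlink.h:

enum {
    RTN_UNSPEC,
    RTN_UNICAST,     /* Gateway or direct route */
    RTN_LOCAL,       /* Accept locally */
    RTN_BROADCAST,   /* Accept locally as broadcast, send as broadcast */
    RTN_ANYCAST,     /* Accept locally as broadcast, but send as unicast */
    RTN_MULTICAST,   /* Multicast route */
    RTN_BLACKHOLE,   /* Drop */
    RTN_UNREACHABLE, /* Destination is unreachable */
    RTN_PROHIBIT,    /* Administratively prohibited */
    RTN_THROW,       /* Not in this table */
    RTN_NAT,         /* Translate this address */
    RTN_XRESOLVE,    /* Use external resolver */
    __RTN_MAX
};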

5. inet_addr_type()

(Figure: inet_addr_type() flowchart)

unsigned int inet_addr_type(struct net *net, __be32 addr)
{
return __inet_dev_addr_type(net, NULL, addr);
}
EXPORT_SYMBOL(inet_addr_type);
static inline unsigned int __inet_dev_addr_type(struct net *net,
const struct net_device *dev,
__be32 addr)
{
struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
unsigned int ret = RTN_BROADCAST;
struct fib_table *local_table;

//the zero network or the limited broadcast address (255.255.255.255)
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
return RTN_BROADCAST;

/* IP multicast is implemented through a special set of addresses called
class D addresses (224.0.0.0 - 239.255.255.255). These IP addresses are
special because they do not correspond to individual hosts; they correspond
to groups (also called channels). Some of these addresses have fixed
purposes, much like the well-known TCP and UDP ports; the others are
user-defined. Common examples:
all IP hosts: 224.0.0.1
all IP routers: 224.0.0.2
Network Time Protocol: 224.0.1.1
SGI Dogfight game: 224.0.1.2
Session Directory: 224.2.127.254
*/
if (ipv4_is_multicast(addr))
return RTN_MULTICAST;

/*
Query the local routing table first.
The local table holds local addresses, multicast, etc. -- destinations
that must be delivered to this host.
*/
local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST; //default: gateway or direct route
rcu_read_lock();
if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
}
rcu_read_unlock();
}
return ret;
}

This function validates the address type. Two important functions are involved:

a. Look up the hash list for RT_TABLE_LOCAL and return the corresponding fib table, i.e. local_table:

local_table = fib_get_table(net, RT_TABLE_LOCAL);
//match the hash list against the id argument; return tb on success, NULL otherwise
struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
struct hlist_head *head;
unsigned int h;

if (id == 0)
id = RT_TABLE_MAIN;
h = id & (FIB_TABLE_HASHSZ - 1); //hash the table id into a bucket index

rcu_read_lock();
head = &net->ipv4.fib_table_hash[h]; //see below for where this array is populated
hlist_for_each_entry_rcu(tb, head, tb_hlist) { //walk the net->ipv4.fib_table_hash list looking for a matching table id
if (tb->tb_id == id) {
rcu_read_unlock();
return tb;
}
}
rcu_read_unlock();
return NULL;
}

head = &net->ipv4.fib_table_hash[h] is populated along the following path:

//allocation path of fib_table_hash inside the network namespace
inet_init --> ip_init --> ip_rt_init --> ip_fib_init --> register_pernet_subsys(&fib_net_ops) --> fib_net_ops.fib_net_init --> ip_fib_net_init --> [net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL), allocating FIB_TABLE_HASHSZ list heads] --> fib4_rules_init

static int __net_init fib4_rules_init(struct net *net)
{
struct fib_table *local_table, *main_table;

local_table = fib_trie_table(RT_TABLE_LOCAL); //create the RT_TABLE_LOCAL table
if (local_table == NULL)
return -ENOMEM;

main_table = fib_trie_table(RT_TABLE_MAIN); //create the RT_TABLE_MAIN table
if (main_table == NULL)
goto fail;

//add local_table->tb_hlist to the bucket net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]
hlist_add_head_rcu(&local_table->tb_hlist,
&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);

//add main_table->tb_hlist to net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]
hlist_add_head_rcu(&main_table->tb_hlist,
&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
return 0;

fail:
kfree(local_table);
return -ENOMEM;
}

b. Use the local_table returned in a. to look up the type of the address in the routing table:

if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
...
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data; //tb->tb_data holds the trie; the FIB is stored as a trie
int ret;
struct rt_trie_node *n;
struct tnode *pn;
unsigned int pos, bits;
t_key key = ntohl(flp->daddr); //the destination address is the key (network byte order converted to host order, e.g. 127.0.0.1 becomes 0x7f000001)
unsigned int chopped_off;
t_key cindex = 0;
unsigned int current_prefix_length = KEYLENGTH;
struct tnode *cn;
t_key pref_mismatch;

rcu_read_lock();

n = rcu_dereference(t->trie); //the trie root node, dereferenced under RCU
if (!n)
goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.gets++;
#endif

/* Just a leaf? */
//a trie node is either a leaf or an internal node (tnode)
if (IS_LEAF(n)) { //is it a leaf?
ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); //cast to a leaf and check it
goto found;
}

pn = (struct tnode *) n; //cast to an internal node
chopped_off = 0;

while (pn) {
pos = pn->pos; //starting bit position this tnode indexes on
bits = pn->bits; //number of index bits

if (!chopped_off) //no index bits have been chopped off by backtracking
cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
pos, bits); //extract the child index from the masked key

n = tnode_get_child_rcu(pn, cindex); //fetch child cindex of tnode pn

if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.null_node_hit++;
#endif
goto backtrace;
}

//leaf?
if (IS_LEAF(n)) {
ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found; //a matching route was found; return it
}

//otherwise it is an internal node
cn = (struct tnode *)n;

/*
* It's a tnode, and we can do some extra checks here if we
* like, to avoid descending into a dead-end branch.
* This tnode is in the parent's child array at index
* key[p_pos..p_pos+p_bits] but potentially with some bits
* chopped off, so in reality the index may be just a
* subprefix, padded with zero at the end.
* We can also take a look at any skipped bits in this
* tnode - everything up to p_pos is supposed to be ok,
* and the non-chopped bits of the index (see previous
* paragraph) are also guaranteed ok, but the rest is
* considered unknown.
*
* The skipped bits are key[pos+bits..cn->pos].
*/

/* If current_prefix_length < pos+bits, we are already doing
* actual prefix matching, which means everything from
* pos+(bits-chopped_off) onward must be zero along some
* branch of this subtree - otherwise there is *no* valid
* prefix present. Here we can only check the skipped
* bits. Remember, since we have already indexed into the
* parent's child array, we know that the bits we chopped of
* *are* zero.
*/

/* NOTA BENE: Checking only skipped bits
for the new node here */

//the remaining valid prefix is shorter than the bits this tnode indexes on
if (current_prefix_length < pos+bits) {
if (tkey_extract_bits(cn->key, current_prefix_length,
cn->pos - current_prefix_length)
|| !(cn->child[0]))
goto backtrace;
}

/*
* If chopped_off=0, the index is fully validated and we
* only need to look at the skipped bits for this, the new,
* tnode. What we actually want to do is to find out if
* these skipped bits match our key perfectly, or if we will
* have to count on finding a matching prefix further down,
* because if we do, we would like to have some way of
* verifying the existence of such a prefix at this point.
*/

/* The only thing we can do at this point is to verify that
* any such matching prefix can indeed be a prefix to our
* key, and if the bits in the node we are inspecting that
* do not match our key are not ZERO, this cannot be true.
* Thus, find out where there is a mismatch (before cn->pos)
* and verify that all the mismatching bits are zero in the
* new tnode's key.
*/

/*
* Note: We aren't very concerned about the piece of
* the key that precede pn->pos+pn->bits, since these
* have already been checked. The bits after cn->pos
* aren't checked since these are by definition
* "unknown" at this point. Thus, what we want to see
* is if we are about to enter the "prefix matching"
* state, and in that case verify that the skipped
* bits that will prevail throughout this subtree are
* zero, as they have to be if we are to find a
* matching prefix.
*/

pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);

/*
* In short: If skipped bits in this node do not match
* the search key, enter the "prefix matching"
* state directly.
*/
if (pref_mismatch) {
/* fls(x) = __fls(x) + 1 */
int mp = KEYLENGTH - __fls(pref_mismatch) - 1;

if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
goto backtrace;

if (current_prefix_length >= cn->pos)
current_prefix_length = mp;
}

pn = (struct tnode *)n; /* Descend */
chopped_off = 0;
continue;

backtrace:
chopped_off++;

/* As zero don't change the child key (cindex) */
while ((chopped_off <= pn->bits)
&& !(cindex & (1<<(chopped_off-1))))
chopped_off++;

/* Decrease current_... with bits chopped off */
if (current_prefix_length > pn->pos + pn->bits - chopped_off)
current_prefix_length = pn->pos + pn->bits
- chopped_off;

/*
* Either we do the actual chop off according or if we have
* chopped off all bits in this tnode walk up to our parent.
*/

if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
} else {
struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
if (!parent)
goto failed;

/* Get Child's index */
cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
pn = parent;
chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.backtrack++;
#endif
goto backtrace;
}
}
failed:
ret = 1;
found:
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);

fib_table_lookup() walks the trie-shaped FIB, matching internal nodes (tnodes) and validating leaves. With my current understanding I cannot fully untangle the algorithm inside it yet, so I am leaving a note to revisit it later (roughly: an address consists of four bytes, i.e. 32 bits, and each node indexes the key by bit positions, so on the order of 2^log(KEYLENGTH) index values can appear per node). One important function is involved, shown below; a standalone demo of the bit-extraction step follows the check_leaf() discussion.

static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
t_key key, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
struct leaf_info *li;
struct hlist_head *hhead = &l->list;

hlist_for_each_entry_rcu(li, hhead, hlist) { //walk the leaf's list of struct leaf_info entries
struct fib_alias *fa;

//key is the address, so this is an address comparison
if (l->key != (key & li->mask_plen)) //does the leaf's key match the lookup key masked by this leaf_info's prefix mask?
continue;

list_for_each_entry_rcu(fa, &li->falh, fa_list) { //walk the fib_alias entries hanging off this leaf_info
struct fib_info *fi = fa->fa_info;
int nhsel, err;

if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
continue;
if (fi->fib_dead)
continue;
if (fa->fa_info->fib_scope < flp->flowi4_scope)
continue;
fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error; //map the route type to its scope and error status
if (err) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.semantic_match_passed++;
#endif
return err;
}
if (fi->fib_flags & RTNH_F_DEAD)
continue;

//walk the next hops of this route
for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
const struct fib_nh *nh = &fi->fib_nh[nhsel];

if (nh->nh_flags & RTNH_F_DEAD) //skip dead next hops
continue;
if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.semantic_match_passed++;
#endif
//fill in the lookup result
res->prefixlen = li->plen;
res->nh_sel = nhsel;
res->type = fa->fa_type;
res->scope = fa->fa_info->fib_scope;
res->fi = fi;
res->table = tb;
res->fa_head = &li->falh;
if (!(fib_flags & FIB_LOOKUP_NOREF))
atomic_inc(&fi->fib_clntref);
return 0;
}
}

#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.semantic_match_miss++;
#endif
}

return 1;
}

The leaf information found by check_leaf() is filled into res and returned to the caller.
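
The bit-extraction step at the heart of the trie walk is compact enough to demonstrate on its own. The sketch below is my own standalone illustration, not kernel code; the mask is computed slightly differently from the kernel's tkey_extract_bits(), but the result is the same for 0 < bits < 32:

/* Standalone demo of the trie's child-index computation: pull `bits`
 * bits starting at bit `offset` (counted from the MSB) out of a 32-bit
 * key, as tkey_extract_bits() does during the lookup above. */
#include <stdio.h>
#include <stdint.h>

#define KEYLENGTH 32
typedef uint32_t t_key;

static t_key extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
    if (offset < KEYLENGTH)
        return (a >> (KEYLENGTH - (offset + bits))) & ((1u << bits) - 1);
    return 0;
}

int main(void)
{
    t_key key = 0x7f000001; /* 127.0.0.1 in host byte order */

    /* a root tnode with pos = 0 and bits = 8 indexes its child
     * array with the top 8 bits of the key: */
    printf("child index = %u\n", extract_bits(key, 0, 8)); /* prints 127 */
    return 0;
}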

6. inet_csk_get_port()

A separate post analyzes this function in more detail.

if (sk->sk_prot->get_port(sk, snum)) { //sk->sk_prot points to tcp_prot; get_port = inet_csk_get_port()
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
//hashinfo is the global per-protocol-family container: here hashinfo = tcp_hashinfo,
//installed in struct proto tcp_prot. Parts of tcp_hashinfo are initialized
//in tcp_init(); to understand the relationships here, it is worth reading
//the body of tcp_init().
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
int ret, attempts = 5;
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
kuid_t uid = sock_i_uid(sk); //uid of the user owning this socket

local_bh_disable();
//snum == 0 means the application did not specify a port,
//so an unused port is picked at random
if (!snum) { //no port specified
int remaining, rover, low, high;

again:
inet_get_local_port_range(&low, &high); //get the local port range (net.ipv4.ip_local_port_range; configurable)
remaining = (high - low) + 1; //number of candidate ports
smallest_rover = rover = net_random() % remaining + low; //pick a random starting port

smallest_size = -1;
do {
//skip administratively reserved ports
if (inet_is_reserved_local_port(rover))
goto next_nolock; //if reserved, move on to the next port, i.e. ++rover

//hash the port number to locate the bucket it lives in
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];

/* lock the hash bucket */
spin_lock(&head->lock);

/* walk the bucket from its head; inet_bind_bucket_for_each uses
container_of to recover the struct inet_bind_bucket tb from each list node */
inet_bind_bucket_for_each(tb, &head->chain)
/* if the port is in use, run the conflict checks */
if (net_eq(ib_net(tb), net) && tb->port == rover) {
if (((tb->fastreuse > 0 && //tb allows "fast reuse"
sk->sk_reuse && //the socket asks for reuse
sk->sk_state != TCP_LISTEN) || //and is not listening
(tb->fastreuseport > 0 &&
sk->sk_reuseport &&
uid_eq(tb->fastuid, uid))) && //same owning uid
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners; /* remember how many owners this port has */
smallest_rover = rover; /* remember the port */

/* if many ports are already bound, check right away whether this one conflicts */
if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = smallest_rover; /* no conflict: use this port */
goto tb_found;
}
}

/* check for a bind conflict; can this port be reused? */
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = rover;
goto tb_found;
}
goto next; /* this port cannot be reused, try the next one */
}

/* found an unused port: leave the loop (an inet_bind_bucket for it is created below) */
break;
next:
spin_unlock(&head->lock);
next_nolock:
if (++rover > high)
rover = low;
} while (--remaining > 0);

/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
* locks if this test triggers, because if 'remaining'
* drops to zero, we broke out of the do/while loop at
* the top level, not from the 'break;' statement.
*/
ret = 1;
if (remaining <= 0) {
if (smallest_size != -1) {
snum = smallest_rover;
goto have_snum;
}
goto fail;
}
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
snum = rover; /* the automatically chosen port */
} else { /* the application specified a port to bind */
have_snum: //a port is given
/* Getting here means the application picked the port itself.
1. inet_bhashfn(net, snum, hashinfo->bhash_size): compute the index into the
   struct inet_bind_hashbucket array
2. head = &hashinfo->bhash[*]: the hash bucket this port belongs to
3. inet_bind_bucket_for_each(tb, &head->chain): walk the bucket's chain (hlist) of
   already-bound ports; container_of recovers the owning struct inet_bind_bucket tb
   from each list node. See inet_bind_bucket_create() for how a bound port is added
   to the chain.
4. net_eq(ib_net(tb), net) && tb->port == snum: same network namespace and same port?
*/
head = &hashinfo->bhash[inet_bhashfn(net, snum,
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == snum) //is the requested port already present in the chain?
goto tb_found; /* the port is in use */
}
tb = NULL;
goto tb_not_found;
tb_found:
/* the port already has sockets bound to it */
if (!hlist_empty(&tb->owners)) { //an empty owners list would mean tb is unused
/* forced binding, regardless of any conflict! */
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;

//decide from the socket flags whether this port can be fast-reused
if (((tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1) { /* the specified-port case */
goto success;
} else {
ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { /* bind conflict */
/* an automatically chosen port conflicted; try again, at most 5 times.
* (Arguably this if is unnecessary, since the automatic path already
* checked before each goto tb_found.)
*/
if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
}

goto fail_unlock;
}
}
}
tb_not_found: //no bucket for this port exists in the hash table yet
ret = 1;
/* allocate and initialize an inet_bind_bucket; returns the new tb */
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
net, head, snum)) == NULL)
goto fail_unlock;
if (hlist_empty(&tb->owners)) { //inet_bind_bucket_create() leaves tb->owners empty
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) //sk->sk_reuse was initialized in inet_create()
tb->fastreuse = 1;
else
tb->fastreuse = 0;
if (sk->sk_reuseport) { //port reuse allowed
tb->fastreuseport = 1;
tb->fastuid = uid; //owning uid
} else
tb->fastreuseport = 0;
} else {
if (tb->fastreuse && //reuse was allowed so far
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) //but this socket forbids reuse or is listening
tb->fastreuse = 0; //so disable it
if (tb->fastreuseport &&
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) //reuseport not requested or a different uid
tb->fastreuseport = 0;
}
success:
/* record the inet_bind_bucket in the icsk */
if (!inet_csk(sk)->icsk_bind_hash) //not yet linked to a bucket; inet_bind_hash() below does it
inet_bind_hash(sk, tb, snum); //important: link the bucket to the sock and record the port
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;

fail_unlock:
spin_unlock(&head->lock);
fail:
local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);

inet_csk_get_port() carries out the following:

a. If the port is invalid (i.e. 0), pick a port at random, locate the hash bucket for it, and run the conflict checks on the candidate (i.e. is it already taken); if nothing else is using it, take it as the current port, otherwise keep picking at random;

b. If the port is valid, i.e. the application specified a port to bind, locate its hash bucket, walk the bucket's chain matching against the requested port to determine whether it is already in use (i.e. whether a socket is bound to it); if no bucket exists yet, allocate one via inet_bind_bucket_create():

struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
const unsigned short snum)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

if (tb != NULL) {
write_pnet(&tb->ib_net, hold_net(net));
tb->port = snum; //record the port in the bucket
tb->fastreuse = 0;
tb->fastreuseport = 0;
tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
}
return tb;
}
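
For completeness, the bucket index used above comes from inet_bhashfn(); in kernels of this era it is, roughly (quoted from memory, so treat it as a sketch), the local port mixed with a per-namespace value and masked to the table size:

/* Sketch of inet_bhashfn(): map a local port to a bhash bucket index. */
static inline int inet_bhashfn(struct net *net, const __u16 lport,
                               const int bhash_size)
{
    return (lport + net_hash_mix(net)) & (bhash_size - 1);
}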

c. Finally, via

success:
/* record the inet_bind_bucket in the icsk */
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum);
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; //the sock's hash container, hashinfo = tcp_hashinfo

atomic_inc(&hashinfo->bsockets); //one more bound socket, counted atomically

inet_sk(sk)->inet_num = snum; //record the bound port
sk_add_bind_node(sk, &tb->owners); //add the sock's sk_bind_node to the tb->owners list
tb->num_owners++; //one more owner of this port
inet_csk(sk)->icsk_bind_hash = tb; //point the socket's icsk_bind_hash at this bind bucket
}

the port binding is completed, i.e. tb is attached to the sock.
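
To tie this back to user space: the sk->sk_reuse and sk->sk_reuseport flags consulted throughout inet_csk_get_port() correspond to the SO_REUSEADDR and SO_REUSEPORT socket options. The following sketch (the helper name bind_reusable is my own) shows how an application opts in to the fast-reuse path before calling bind():

#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Bind a TCP socket to `port` with SO_REUSEADDR set; returns the fd,
 * or -1 on error. Setting the option before bind() is what lets the
 * kernel's fastreuse checks above accept an already-used port. */
int bind_reusable(uint16_t port)
{
    struct sockaddr_in addr;
    int one = 1;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    if (fd < 0)
        return -1;
    /* sets sk->sk_reuse, consulted by inet_csk_get_port() */
    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) {
        close(fd);
        return -1;
    }
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(port);
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        close(fd);
        return -1;
    }
    return fd;
}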

7. Summary

Looking back at bind() as a whole, its main job is to bind the address (validated against the FIB trie's tnodes and leaves) and the port (tracked in the bind hash buckets).