select 源码分析

## select(2)，同步的 I/O 复用

直接看 epoll 的源码把自己绕晕了，先整个简单点的下手。

- [使用](#usage)

- [源码分析](#src_analysis)

### select(2) 提供的用户接口

```c

#include <sys/select.h>

int select(int nfds, fd_set *readfds, fd_set *writefds,

fd_set *exceptfds, struct timeval *timeout);

void FD_CLR(int fd, fd_set *set);

int FD_ISSET(int fd, fd_set *set);

void FD_SET(int fd, fd_set *set);

void FD_ZERO(fd_set *set);

```

1. 第 1 个参数为最大的文件描述符加 1

2. 第 2 3 4 个参数依次为读写异常需要检查的结构体

3. 第 5 个参数为超时时间，struct timeval 是一个精确到微秒的时间结构

man(2) 给出的用例

```c

#include <stdio.h>

#include <stdlib.h>

#include <sys/time.h>

#include <sys/types.h>

#include <unistd.h>

int main(void) {

fd_set rfds;

struct timeval tv;

int retval;

/* Watch stdin (fd 0) to see when it has input. */

FD_ZERO(&rfds);

FD_SET(0, &rfds);

/* Wait up to five seconds. */

tv.tv_sec = 5;

tv.tv_usec = 0;

retval = select(1, &rfds, NULL, NULL, &tv);

/* Don't rely on the value of tv now! */

if (retval == -1)

perror("select()");

else if (retval)

printf("Data is available now.\n");

/* FD_ISSET(0, &rfds) will be true. */

else

printf("No data within five seconds.\n");

exit(EXIT_SUCCESS);

}

```

### 源码分析

select(2) 依赖于文件结构 f_op->poll 方法，

1. 在select中循环调用 f_op->poll() 回调 [__pollwait()](#__pollwait)，在 __pollwait() 中设置好唤醒回调函数 pollwake() 再将其投入到文件的等待队列中

2. 获得就绪事件掩码，根据掩码设置就绪的 fd，当文件状态发生会主动调用唤醒函数再遍历等待队列执行队列中设置的唤醒回调函数 [pollwake()](#pollwake)

```c

// fs/kernfs/file.c

// 普通文件中的 poll 操作

const struct file_operations kernfs_file_fops = {

.poll = kernfs_fop_poll,

};

static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)

{

// 最终调用 poll_wait 函数，转而调用 _qproc

poll_wait(filp, &on->poll, wait);

}

```

```c

static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)

{

if (p && p->_qproc && wait_address)

p->_qproc(filp, wait_address, p);

}

```

#### 关键的数据结构

```c

// 文件的 poll 操作的数据结构

typedef struct poll_table_struct {

poll_queue_proc _qproc; // 文件 poll 的回调函数

__poll_t _key; // 关注的事件掩码貌似 select 用不到

} poll_table;

// 所有可以投入的等待队列项集合，（可以观察初始化位置）

struct poll_wqueues {

poll_table pt;

struct poll_table_page *table; // 如果 inline_entries 空间不足，则从 poll_table_page 中获取

struct task_struct *polling_task; // 当前运行的任务

int triggered; // 触发标志

int error; // 错误代码

int inline_index; // inline_entries 中第一个未分配的下标

struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES]; // 固定空间

};

// poll_table_entry 的动态内存结构, poll_get_entry()

struct poll_table_page {

struct poll_table_page * next; // 指向旧的 poll_table_entry 内存

struct poll_table_entry * entry; // 指向空闲的 poll_table_entry 节点

struct poll_table_entry entries[0]; // 作指示，初始 entry 指向

};

// 添加到文件的poll等待队列中的结构项，一个文件对应一个poll_table_entry

struct poll_table_entry {

struct file *filp; // 当前文件

__poll_t key; // 对应 poll_table 中的 key

wait_queue_entry_t wait; // 等待队列中的节点

wait_queue_head_t *wait_address; // 等待队列头

};

```

```c

* A single wait-queue entry structure:

struct wait_queue_entry {

unsigned int flags;

void *private;

wait_queue_func_t func; // 等待队列上节点的回调函数

struct list_head entry;

};

struct wait_queue_head {

spinlock_t lock;

struct list_head head;

};

typedef struct wait_queue_entry wait_queue_entry_t;

typedef struct wait_queue_head wait_queue_head_t;

typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);

```

先看对这几个数据结构进行操作的函数

```c

```

#### 1. 先对超时时间做处理

```c

static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,

fd_set __user *exp, struct timeval __user *tvp)

{

struct timespec64 end_time, *to = NULL;

struct timeval tv;

int ret;

if (tvp) {

if (copy_from_user(&tv, tvp, sizeof(tv)))

return -EFAULT;

to = &end_time;

if (poll_select_set_timeout(to,

tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),

(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))

return -EINVAL;

}

ret = core_sys_select(n, inp, outp, exp, to);

ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

return ret;

}

```

`struct timespec64` 是一个包含秒和纳秒的时间结构，`struct timeval` 的精度为微秒。

如果超时时间存在，则对时间做处理：将超时时间的精度`安全`的从微秒扩展到纳秒。

#### 2. 轮询之前的准备工作

```c

int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,

fd_set __user *exp, struct timespec64 *end_time)

{

fd_set_bits fds;

void *bits;

int ret, max_fds;

size_t size, alloc_size;

struct fdtable *fdt;

/* Allocate small arguments on the stack to save memory and be faster */

long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

ret = -EINVAL;

if (n < 0)

goto out_nofds;

/* max_fds can increase, so grab it once to avoid race */

rcu_read_lock();

fdt = files_fdtable(current->files);

max_fds = fdt->max_fds; // 当前进程上的最大 fd

rcu_read_unlock();

if (n > max_fds)

n = max_fds;

* We need 6 bitmaps (in/out/ex for both incoming and outgoing),

* since we used fdset we need to allocate memory in units of

* long-words.

size = FDS_BYTES(n);

bits = stack_fds; // 优先使用栈上内存，若是内存不够再申请空间

if (size > sizeof(stack_fds) / 6) {

/* Not enough space in on-stack array; must use kmalloc */

ret = -ENOMEM;

if (size > (SIZE_MAX / 6))

goto out_nofds;

alloc_size = 6 * size;

bits = kvmalloc(alloc_size, GFP_KERNEL);

if (!bits)

goto out_nofds;

}

fds.in = bits;

fds.out = bits + size;

fds.ex = bits + 2*size;

fds.res_in = bits + 3*size;

fds.res_out = bits + 4*size;

fds.res_ex = bits + 5*size;

// 将用户态的 rfds wfds efds 复制到内核空间的 fds 上

if ((ret = get_fd_set(n, inp, fds.in)) ||

(ret = get_fd_set(n, outp, fds.out)) ||

(ret = get_fd_set(n, exp, fds.ex)))

goto out;

zero_fd_set(n, fds.res_in); // 在轮询前对 fds 的结果置空

zero_fd_set(n, fds.res_out);

zero_fd_set(n, fds.res_ex);

ret = do_select(n, &fds, end_time); // 核心逻辑

if (ret < 0)

goto out;

if (!ret) {

ret = -ERESTARTNOHAND;

if (signal_pending(current)) // 若当前进程有信号未处理

goto out;

ret = 0;

}

// 将轮询后的结果复制到用户空间

if (set_fd_set(n, inp, fds.res_in) ||

set_fd_set(n, outp, fds.res_out) ||

set_fd_set(n, exp, fds.res_ex))

ret = -EFAULT;

out:

if (bits != stack_fds)

kvfree(bits);

out_nofds:

return ret;

}

```

#### select 核心实现

```c

static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)

{

ktime_t expire, *to = NULL;

struct poll_wqueues table;

poll_table *wait;

int retval, i, timed_out = 0;

u64 slack = 0;

__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;

unsigned long busy_start = 0;

rcu_read_lock();

retval = max_select_fd(n, fds); // 检查 fds 中fd的有效性，并获得最大的 fd

rcu_read_unlock();

if (retval < 0)

return retval;

n = retval;

poll_initwait(&table); // 初始化 poll_wqueue，见下，实现

wait = &table.pt;

if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { // 超时的判断

wait->_qproc = NULL;

timed_out = 1;

}

if (end_time && !timed_out) // 未超时，计算剩下的时间

slack = select_estimate_accuracy(end_time);

retval = 0;

for (;;) {

unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

bool can_busy_loop = false;

inp = fds->in; outp = fds->out; exp = fds->ex;

rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

for (i = 0; i < n; ++rinp, ++routp, ++rexp) {

unsigned long in, out, ex, all_bits, bit = 1, j;

unsigned long res_in = 0, res_out = 0, res_ex = 0;

__poll_t mask;

in = *inp++; out = *outp++; ex = *exp++;

all_bits = in | out | ex;

if (all_bits == 0) { // 没有关注的文件

i += BITS_PER_LONG; // 一个 long 可以描述 32 个fd

continue; // 直接下一轮

}

// 这里就开始对每一个 fd 进行判断了

for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {

struct fd f;

if (i >= n)

break;

if (!(bit & all_bits)) // 该fd未被关注

continue;

f = fdget(i); // 获取文件

if (f.file) {

const struct file_operations *f_op;

f_op = f.file->f_op;

mask = DEFAULT_POLLMASK;

if (f_op->poll) {

// 设置回调函数的关注的文件事件掩码

wait_key_set(wait, in, out,

bit, busy_flag);

// 执行文件的 poll 方法，调用 __pollwait(见下实现)，设置等待队列回调函数，返回就绪文件事件掩码

mask = (*f_op->poll)(f.file, wait);

}

fdput(f); // 释放文件

// 接下来就是根据就绪事件掩码将就绪的fd写入对应的 fds 中

if ((mask & POLLIN_SET) && (in & bit)) {

res_in |= bit;

retval++;

wait->_qproc = NULL; // 已经添加过唤醒函数了，所以这里直接置空

}

if ((mask & POLLOUT_SET) && (out & bit)) {

res_out |= bit;

retval++;

wait->_qproc = NULL;

}

if ((mask & POLLEX_SET) && (ex & bit)) {

res_ex |= bit;

retval++;

wait->_qproc = NULL;

}

/* got something, stop busy polling */

if (retval) {

can_busy_loop = false;

busy_flag = 0;

* only remember a returned

* POLL_BUSY_LOOP if we asked for it

} else if (busy_flag & mask)

can_busy_loop = true;

}

if (res_in)

*rinp = res_in;

if (res_out)

*routp = res_out;

if (res_ex)

*rexp = res_ex;

cond_resched();

}

wait->_qproc = NULL;

// 如果有就绪事件或超时或有信号，退出循环

if (retval || timed_out || signal_pending(current))

break;

if (table.error) {

retval = table.error;

break;

}

/* only if found POLL_BUSY_LOOP sockets && not out of time */

if (can_busy_loop && !need_resched()) {

if (!busy_start) {

busy_start = busy_loop_current_time();

continue;

}

if (!busy_loop_timeout(busy_start))

continue;

}

busy_flag = 0;

* If this is the first loop and we have a timeout

* given, then we convert to ktime_t and set the to

* pointer to the expiry value.

if (end_time && !to) {

expire = timespec64_to_ktime(*end_time);

to = &expire;

}

// 等待超时，并且将超时标志置 1，将再次经过一次循环

if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,

to, slack))

timed_out = 1;

}

poll_freewait(&table);

return retval;

}

```

先看 poll_initwait() 实现

```c

void poll_initwait(struct poll_wqueues *pwq)

{

// init_poll_funcptr(&pwq->pt, __pollwait);

pwq->pt->_qproc = __pollwait; // 设置 poll_table 的回调函数为 __poll_wait()

pwq->pt->_key = ~(__poll_t)0; /* all events enabled */

pwq->polling_task = current; // 指向当前的任务

pwq->triggered = 0; // 未触发

pwq->error = 0; // 无错误

pwq->table = NULL; // 无动态内存分配

pwq->inline_index = 0; // table_entry 未使用

}

```

再看 __pollwait() 实现

```c

/* Add a new entry */

static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,

poll_table *p)

{

struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);

struct poll_table_entry *entry = poll_get_entry(pwq);

if (!entry)

return;

entry->filp = get_file(filp); // 设置文件

entry->wait_address = wait_address; // 设置等待队列头

entry->key = p->_key; // 设置关注的事件

init_waitqueue_func_entry(&entry->wait, pollwake); // 设置对待队列节点的回调函数为 pollwake()

entry->wait.private = pwq; // 私有数据 poll_wqueues

add_wait_queue(wait_address, &entry->wait); // 将 poll_table_entry 添加到文件的等待队列上

}

```

poll_get_entry() 的实现就不贴了，就是数组空间够的话取数组空间，数组空间不够申请内存页，新的内存页采用头插法插入到 poll_wqueues->table 中

add_wait_queue() 就是一个将等待节点条目 wait_queue_entry_t 添加到 wait_queue_head_t 中，这里的 head 是 file->f_op->poll() 传来的poll头节点

再看 pollwake(), 最主要的工作是唤醒线程

```c

static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)

{

struct poll_table_entry *entry;

entry = container_of(wait, struct poll_table_entry, wait);

if (key && !(key_to_poll(key) & entry->key))

return 0;

return __pollwake(wait, mode, sync, key);

}

static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)

{

struct poll_wqueues *pwq = wait->private;

DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

* Although this function is called under waitqueue lock, LOCK

* doesn't imply write barrier and the users expect write

* barrier semantics on wakeup functions. The following

* smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()

* and is paired with smp_store_mb() in poll_schedule_timeout.

smp_wmb();

pwq->triggered = 1; // 设置触发的标志

* Perform the default wake up operation using a dummy

* waitqueue.

* TODO: This is hacky but there currently is no interface to

* pass in @sync. @sync is scheduled to be removed and once

* that happens, wake_up_process() can be used directly.

return default_wake_function(&dummy_wait, mode, sync, key); // 唤醒，key 未使用

}

```

现在只有这些等待节点回调的实现，文件状态发生变化时会调用等待队列节点的回调函数也就是 pollwake (wait_queue_func_t), 看实现

```c

// 直接看唤醒的核心逻辑

// kernel/sched/wait.c

static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,

int nr_exclusive, int wake_flags, void *key,

wait_queue_entry_t *bookmark)

{

wait_queue_entry_t *curr;

// 遍历文件 wait_queue_head 执行回调函数

list_for_each_entry_safe_from(curr, next, &wq_head->head, entry)

ret = curr->func(curr, mode, wake_flags, key);

}

```

### 杂项

1. 为何 select 最大只能关注 1024 个文件描述符

```c

/* fd_set for select and pselect. */

#define __FD_SETSIZE 1024

/* The fd_set member is required to be an array of longs. */

typedef long int __fd_mask;

/* It's easier to assume 8-bit bytes than to get CHAR_BIT. */

#define __NFDBITS (8 * (int)sizeof(__fd_mask)) // 每 long int 的位数 32

#define __FD_ELT(d) ((d) / __NFDBITS)

#define __FD_MASK(d) ((__fd_mask)1 << ((d) % __NFDBITS)) // 取模进行左移

typedef struct {

__fd_mask __fds_bits[__FD_SETSIZE / __NFDBITS];

#define __FDS_BITS(set) ((set)->__fds_bits)

} fd_set;

#define __FD_SET(d, set) ((void)(__FDS_BITS(set)[__FD_ELT(d)] |= __FD_MASK(d)))

#define __FD_CLR(d, set) ((void)(__FDS_BITS(set)[__FD_ELT(d)] &= ~__FD_MASK(d)))

#define __FD_ISSET(d, set) ((__FDS_BITS(set)[__FD_ELT(d)] & __FD_MASK(d)) != 0)

```

如上所示，fd_set 是一个整形数组，大小为32，观察以上的宏实现，一个整形可以存 32 个 fd，每个fd对应一个 bit，故宏__FD_MASK() 是找到该fd对应哪一位，而__FD_ELT()对应fd处于哪一个 long 单位中，相当于一个二维数组了。

__FDS_BITS(set)[__FD_ELT(d)] 就是对该fd的访问了。

2. 为何 select 的第一个参数要在最大的 fd 中加 1.

这是select设计的问题，将第一个参数即当作fd又作为长度来分配空间，且轮询的fd是从 0(stdin) 开始的，所以要加 1

select 源码分析的更多相关文章

【MyBatis源码分析】select源码分析及小结
示例代码之前的文章说过,对于MyBatis来说insert.update.delete是一组的,因为对于MyBatis来说它们都是update:select是一组的,因为对于MyBatis来说它就是 ...
select源码分析（linux2.6.11）
本文以tcp poll为例子来分析select的源码,下面是函数调用顺序.select--->sys_select->do_select--->sock_poll--->tcp ...
eventfd(2) 结合 select(2) 源码分析
eventfd(2) 结合 select(2) 源码分析本文代码选自内核 4.17 eventfd(2) - 创建一个文件描述符用于事件通知. 使用源码分析参考 #include <sys ...
poll(2) 源码分析
poll(2) poll(2) 系统调用的功能和 select(2) 类似:等待一个文件集合中的文件描述符就绪进行I/O操作. 使用实现 select(2) 的局限性: 关注的文件描述符集合大小最大 ...
Netty源码分析第2章(NioEventLoop)---->第6节: 执行select操作
Netty源码分析第二章: NioEventLoop 第六节: 执行select操作分析完了selector的创建和优化的过程, 这一小节分析select相关操作跟到跟到select操作的入口 ...
Linq基础操作之Select,Where,OrderBy,ThenBy源码分析
Linq基础操作之Select,Where,OrderBy,ThenBy源码分析二:Select 它是延迟执行.yield有得一拼,因为他们都是生成了一个枚举类. if (source is TSo ...
【Java】NIO中Selector的select方法源码分析
该篇博客的有些内容和在之前介绍过了,在这里再次涉及到的就不详细说了,如果有不理解请看[Java]NIO中Channel的注册源码分析, [Java]NIO中Selector的创建源码分析 Select ...
python基础-11 socket，IO多路复用，select伪造多线程，select读写分离。socketserver源码分析
Socket socket通常也称作"套接字",用于描述IP地址和端口,是一个通信链的句柄,应用程序通常通过"套接字"向网络发出请求或者应答网络请求. sock ...
LinqToDB 源码分析——生成表达式树
当我们知道了Linq查询要用到的数据库信息之后.接下就是生成对应的表达式树.在前面的章节里面笔者就已经介绍过.生成表达式树是事实离不开IQueryable<T>接口.而处理表达式树离不开I ...

随机推荐

SoapUI使用教程---简介、下载、破解
最近项目中要使用到SoapUI这款测试工具,之前有接触过,但好久没用了,借此机会记录并和大家分享一下. 一.soapui简介 soapui是一款开源测试工具,通过soap/http来检查.调用.实现W ...
hdu6703_array
题意给定一个1到\(n\)的全排列,两种操作,将\(a_{pos}\)修改为\(a_{pos}+1000000\),询问第一个大于等于\(k\)的且不在\(a_1...a_r\)的数. 分析由于\ ...
常见rpm包和yum包命令
1.rpm包在安装.升级.卸载服务程序时要考虑到其他程序.库的依赖关系,在进行校验.安装. 卸载.查询.升级等管理软件操作时难度都非常大. RPM 机制则为解决这些问题而设计的.RPM 有点像 W ...
Nacos（九）：Nacos集群部署和遇到的问题
前言前面的系列文章已经介绍了Nacos的如何接入SpringCloud,以及Nacos的基本使用方式之前的文章中都是基于单机模式部署进行讲解的,本文对Nacos的集群部署方式进行说明环境准备 J ...
Kafka集群环境配置
Kafka集群环境配置 1 环境准备 1.1 集群规划 Node02 Node03 Node04 zk zk zk kafka kafka kafka 1.2 jar包下载安装包:kafka_2.1 ...
unity之初级工程师
一.值类型与引用类型值类型的变量直接存储数据,而引用类型的变量持有的是数据的引用,数据存储在数据堆中.值类型变量声明后,不管是否已经赋值,编译器为其分配内存.值类型的实例通常是在线程栈上分配的(静态 ...
gin-jwt对API进行权限控制
前言之前文章简单介绍了如何运行gin+vue的前后端分离开源项目,该项目是学习了Gin实践教程后结合vue-element-admin写的,该教程讲得很详细,适合入门Gin.本篇文章将介绍gin+v ...
HDU-3507Print Article 斜率优化DP
学习:https://blog.csdn.net/bill_yang_2016/article/details/54667902 HDU-3507 题意:有若干个单词,每个单词有一个费用,连续的单词组 ...
codeforces 389 D. Fox and Minimal path（构造+思维）
题目链接:https://vjudge.net/contest/175446#problem/J 题解:显然要用最多n个点构成的图要使的得到的最短路条数有1e9次个,显然要有几个数相乘容易想到2的几进 ...
Good Inflation SPOJ - GOODG 李超树
题目传送门题意:刚开始有一个气球体积为空,现在有n个充气点,从1->n遍历这n充气点,每个充气点有vi,di,vi为走到这个充气点之后可以为气球充气vi的体积,di为选择了在这个点充气的时候, ...

select 源码分析

select 源码分析的更多相关文章

随机推荐

热门专题