poll(2) 源码分析

poll(2)

poll(2) 系统调用的功能和 select(2) 类似：等待一个文件集合中的文件描述符就绪进行I/O操作。

使用
实现

select(2) 的局限性：

关注的文件描述符集合大小最大只有 1024
文件描述符集合是按 fd 数值索引的位图，内核要从 0 一直扫描到最大的 fd，不能只传入任意几个 fd；关注的 fd 稀疏时，中间不关注的 fd 也要占位和被扫描，造成浪费

poll(2) 对 select(2) 的改进：关注的文件描述符集合大小是动态的（由 nfds 指定），文件描述符可以任意指定。

/* One entry of the descriptor set handed to poll(2). */
struct pollfd {

       int   fd;         /* file descriptor to watch */

       short events;     /* requested events: input bitmask set by the caller */

       short revents;    /* returned events: output bitmask of ready events */

};

- fd 为关注的文件描述符

- events 为关注的事件（输入），使用位掩码来表示事件

- revents 为就绪的事件（输出），同样使用位掩码表示

#include <poll.h>

int poll(struct pollfd *fds, nfds_t nfds, int timeout);

- fds 为文件描述符集合的地址

- nfds 为文件描述符集合的长度

- timeout 为超时的时间，单位为毫秒

返回值为 revents 不为 0 的元素个数；超时且没有任何 fd 就绪时返回 0；出错返回 -1 并设置 errno

一个简单的例子：等待标准输入就绪，超时时间为3s。

#include <poll.h>

#include <unistd.h>

#include <stdio.h>

/*
 * Wait up to 3 seconds for standard input to become readable and report
 * the outcome on stdout.
 *
 * Returns 0 when poll() itself succeeded (data ready, timeout, or a
 * non-readable condition reported in revents) and 1 when poll() failed.
 */
int main(void)
{
        int timeout = 3000;     /* milliseconds */
        struct pollfd fds = {
                .fd = STDIN_FILENO,     /* be explicit instead of relying on fd == 0 */
                .events = POLLIN,
        };

        int ret = poll(&fds, 1, timeout);

        if (ret == -1) {
                /* perror() appends the errno reason (EINTR, EFAULT, ...). */
                perror("poll");
                return 1;
        }

        if (ret == 0) {
                printf("no data within 3000 ms.\n");
                return 0;
        }

        /*
         * ret > 0 only says that revents is non-zero.  POLLERR, POLLHUP and
         * POLLNVAL may be reported even though they were not requested, so
         * POLLIN has to be checked before claiming that data is readable.
         */
        if (fds.revents & POLLIN)
                printf("data is available now.\n");
        else
                printf("stdin is not readable (revents = 0x%x).\n",
                       (unsigned)fds.revents);

        return 0;
}

实现

代码位于 fs/select.c 中（本文基于 4.17 内核），参考中的链接有一些关于文件回调和 poll 结构的说明

poll()

/*
 * poll(2) entry point.
 *
 * Converts the millisecond timeout into a timespec64 (a negative value
 * leaves the pointer NULL, i.e. no end time is passed down), delegates the
 * real work to do_sys_poll(), and, when that call reports -EINTR, saves the
 * arguments in the task's restart block with do_restart_poll as handler.
 *
 * Returns whatever do_sys_poll() returned, except that -EINTR is replaced
 * by -ERESTART_RESTARTBLOCK.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        struct timespec64 end_time;
        struct timespec64 *to = NULL;
        struct restart_block *rb;
        int ret;

        if (timeout_msecs >= 0) {
                /* Split the timeout into whole seconds and leftover nanoseconds. */
                to = &end_time;
                poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
        }

        ret = do_sys_poll(ufds, nfds, to);
        if (ret != -EINTR)
                return ret;

        /* Interrupted: remember everything needed to run the call again. */
        rb = &current->restart_block;
        rb->fn = do_restart_poll;
        rb->poll.ufds = ufds;
        rb->poll.nfds = nfds;
        rb->poll.has_timeout = (timeout_msecs >= 0);
        if (rb->poll.has_timeout) {
                rb->poll.tv_sec = end_time.tv_sec;
                rb->poll.tv_nsec = end_time.tv_nsec;
        }

        return -ERESTART_RESTARTBLOCK;
}

poll() 代码很简单:

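处理超时时间：timeout_msecs >= 0 时换算成 timespec64，否则不设置超时
调用 do_sys_poll() 完成 poll(2) 的实际工作
处理后事：如果返回 -EINTR（被信号中断），把参数和超时时间保存到 restart_block 中，并返回 -ERESTART_RESTARTBLOCK，以便之后通过 do_restart_poll 重新调用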

do_sys_poll()



/*
 * Copy the user's pollfd array into kernel memory, run do_poll() on it and
 * copy the resulting revents fields back to user space.
 *
 * The entries are kept in a chain of struct poll_list chunks: the first
 * chunk lives in the on-stack buffer stack_pps, further chunks are
 * kmalloc()ed as needed and freed again before returning.
 *
 * Returns the value of do_poll() on success, -EINVAL when nfds exceeds
 * RLIMIT_NOFILE, -ENOMEM when a chunk cannot be allocated, and -EFAULT
 * (the initial value of err) when copying from or to user space fails.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,

                struct timespec64 *end_time)

{

        struct poll_wqueues table;

         int err = -EFAULT, fdcount, len, size;

        /* Allocate small arguments on the stack to save memory and be

           faster - use long to make sure the buffer is aligned properly

           on 64 bit archs to avoid unaligned access */

        long stack_pps[POLL_STACK_ALLOC/sizeof(long)];  // 256 bytes

        struct poll_list *const head = (struct poll_list *)stack_pps;

         struct poll_list *walk = head;

         unsigned long todo = nfds;

        if (nfds > rlimit(RLIMIT_NOFILE))  // limit on the number of open files

                return -EINVAL;

        // N_STACK_PPS = (256 - 16) / 8 = 30: the stack buffer holds 30 pollfd entries

        // Copy the first part of the user-space struct pollfd array into the on-stack chunk

        len = min_t(unsigned int, nfds, N_STACK_PPS);

        for (;;) {

                walk->next = NULL;

                walk->len = len;

                if (!len)

                        break;

                // ufds + nfds - todo is the first entry that has not been copied yet
                if (copy_from_user(walk->entries, ufds + nfds-todo,

                                        sizeof(struct pollfd) * walk->len))

                        goto out_fds;

                todo -= walk->len;

                if (!todo)

                        break;

                // POLLFD_PER_PAGE = (4096 - 16) / 8 = 510

                // Allocate another chunk; one page holds up to 510 pollfd entries

                len = min(todo, POLLFD_PER_PAGE);

                size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;

                walk = walk->next = kmalloc(size, GFP_KERNEL);

                if (!walk) {

                        err = -ENOMEM;

                        goto out_fds;

                }

        }

        // All pollfd entries are now in kernel memory, chained from head

        poll_initwait(&table);  // initialise table

        fdcount = do_poll(head, &table, end_time);

        poll_freewait(&table);  // release table

        // Copy revents back to user space

        for (walk = head; walk; walk = walk->next) {

                struct pollfd *fds = walk->entries;

                int j;

                for (j = 0; j < walk->len; j++, ufds++)

                        if (__put_user(fds[j].revents, &ufds->revents))

                                goto out_fds;

          }

        err = fdcount;

out_fds:

        // Start at head->next: head itself is the stack buffer and must not be freed
        walk = head->next;

        while (walk) {

                struct poll_list *pos = walk;

                walk = walk->next;

                kfree(pos);

        }

        return err;

}

do_sys_poll() 函数也是分为三步实现

将用户空间的数据复制到内核空间
调用核心实现 do_poll()
将就绪的事件数据从内核空间复制到用户空间

do_poll()

/*
 * Core loop of poll(2): repeatedly call do_pollfd() on every entry of the
 * poll_list chain until at least one entry is ready, an error or pending
 * signal is seen, or the timeout expires.
 *
 * Returns the number of entries for which do_pollfd() returned a non-zero
 * mask; when none is ready it returns wait->error, or -EINTR if a signal is
 * pending.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,

                   struct timespec64 *end_time)

{

        poll_table* pt = &wait->pt;

        ktime_t expire, *to = NULL;

        int timed_out = 0, count = 0;

        u64 slack = 0;

        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;

        unsigned long busy_start = 0;

        /* Optimise the no-wait case */

        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {

                pt->_qproc = NULL;

                timed_out = 1;

        }

        if (end_time && !timed_out)

                slack = select_estimate_accuracy(end_time);  // passed as slack to poll_schedule_timeout() below; presumably the tolerated timer inaccuracy in nanoseconds - TODO confirm

        for (;;) {

                struct poll_list *walk;

                bool can_busy_loop = false;

                for (walk = list; walk != NULL; walk = walk->next) {

                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;

                        pfd_end = pfd + walk->len;

                        for (; pfd != pfd_end; pfd++) {  // visit every struct pollfd; do_pollfd() handles a single fd

                                /*

                                 * Fish for events. If we found one, record it

                                 * and kill poll_table->_qproc, so we don't

                                 * needlessly register any other waiters after

                                 * this. They'll get immediately deregistered

                                 * when we break out and return.

                                 */

                                if (do_pollfd(pfd, pt, &can_busy_loop,

                                              busy_flag)) {

                                        count++;

                                        pt->_qproc = NULL;

                                        /* found something, stop busy polling */

                                        busy_flag = 0;

                                        can_busy_loop = false;

                                }

                        }

                }

                /*

                 * All waiters have already been registered, so don't provide

                 * a poll_table->_qproc to them on the next loop iteration.

                 */

                pt->_qproc = NULL;

                /* Nothing ready: report a queueing error or a pending signal instead. */
                if (!count) {

                        count = wait->error;

                        if (signal_pending(current))

                                count = -EINTR;

                }

                if (count || timed_out)

                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */

                if (can_busy_loop && !need_resched()) {

                        if (!busy_start) {

                                busy_start = busy_loop_current_time();

                                continue;

                        }

                        if (!busy_loop_timeout(busy_start))

                                continue;

                }

                busy_flag = 0;

                /*

                 * If this is the first loop and we have a timeout

                 * given, then we convert to ktime_t and set the to

                 * pointer to the expiry value.

                 */

                if (end_time && !to) {

                        expire = timespec64_to_ktime(*end_time);

                        to = &expire;

                }

                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))  // schedule away; a zero return is treated as "timed out"

                        timed_out = 1;

        }

        return count;

}

这个函数写的很清楚了，也有很多注释

can_busy_loop 是和 CONFIG_NET_RX_BUSY_POLL 配置相关的，不算通用处理情况，先忽略不考虑
count 为函数的返回值：do_pollfd 每返回一个非 0 的掩码就递增一次，即就绪的文件描述符数量；没有就绪文件时取等待队列中的错误码 wait->error，如果有信号待处理则为 -EINTR；超时且没有错误时为 0
pt->_qproc 为文件poll操作调用的函数，= NULL 的操作在注释中已经说明，函数已经注册到队列中，不必再次注册. 这个函数相关的内容可以在另外一篇 select(2) 找到具体的说明

/*

 * Fish for events. If we found one, record it and kill poll_table->_qproc, so we don't

 * needlessly register any other waiters after this. They'll get immediately deregistered

 * when we break out and return.

 */

/*

 * All waiters have already been registered, so don't provide a poll_table->_qproc to them on the next loop iteration.

 */

do_pollfd()

/*
 * Poll a single pollfd entry.
 *
 * Only events that match pollfd->events (plus EPOLLERR and EPOLLHUP, which
 * are always added to the filter) are reported.  The resulting mask is
 * stored, converted with mangle_poll(), in pollfd->revents and is also
 * returned.  The file's ->poll handler receives pwait; per the original
 * note it uses it for waiting when pwait->_qproc is non-NULL.
 *
 * A negative fd yields 0; an fd for which fdget() finds no file yields
 * EPOLLNVAL.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     __poll_t busy_flag)
{
        __poll_t mask = 0;
        __poll_t filter;
        struct fd f;
        int fd = pollfd->fd;

        /* Negative descriptors are skipped and report no events. */
        if (fd < 0)
                goto out;

        f = fdget(fd);
        if (!f.file) {
                mask = EPOLLNVAL;
                goto out;
        }

        /* userland u16 ->events contains POLL... bitmap */
        filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;

        if (f.file->f_op->poll) {
                /* Publish the events of interest, plus the busy-poll flag, in _key. */
                pwait->_key = filter | busy_flag;
                mask = f.file->f_op->poll(f.file, pwait);
                if (mask & busy_flag)
                        *can_busy_poll = true;
        } else {
                /* No ->poll handler: fall back to the default mask. */
                mask = DEFAULT_POLLMASK;
        }

        /* Mask out unneeded events. */
        mask &= filter;
        fdput(f);

out:
        /* ... and so does ->revents */
        pollfd->revents = mangle_poll(mask);
        return mask;
}

讨论在不考虑错误的情况下，

poll(2) 返回的是revents 非 0 的个数，在 do_pollfd() 中返回一个非 0 的 mask，poll(2) 返回的 count 就 +1。

mask = 0 有两种可能：

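fd 有效且能取到对应的文件，但文件返回的事件掩码和 filter 做与运算后为 0，即关注的事件（以及 EPOLLERR、EPOLLHUP）都没有就绪；如果取不到对应的文件，mask 为 EPOLLNVAL，不为 0
fd < 0，do_pollfd() 直接跳过该项，revents 置 0；这不是错误，而是 poll(2) 手册约定的“忽略该项”的用法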

在已了解的fd中： eventfd 和普通的文件poll函数返回情况

EPOLLIN 或者 EPOLLOUT 或两个都存在
(EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

当关注的事件不在以上事件中，是可能返回 0，而count不增加的

struct pollfd fds[n];

rn = poll(fds, n, 0);

for (int i = 0; i < rn; ++i)

        if (fds[i].revents ...)

像上面这种操作是有风险的：rn 只是 revents 非 0 的元素个数，这些元素不一定位于数组的前 rn 项，只遍历前 rn 项可能漏掉后面已经就绪的 fd。正确的做法是遍历全部 n 个元素，逐个检查 revents 是否非 0。

mangle_poll() 设置就绪掩码

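下面展开的是 demangle_poll()：它把用户态的 POLL* 掩码转换成内核内部使用的 EPOLL* 掩码（__poll_t）；mangle_poll() 是它的逆操作，把内核的就绪掩码转换回用户态的 revents。__MAP(v, from, to) 的作用是把 v 中 from 对应的那一位搬到 to 对应的位上：from 和 to 都是单个比特（2 的幂），from < to 时乘以 to/from 相当于左移，反之除以 from/to 相当于右移，from == to 时原样保留。之所以需要转换，是因为 EPOLL* 的数值与体系结构无关，而 POLL* 的数值可以由各体系结构自行定义。在 4.17 内核的大多数体系结构（如 x86）上 POLLIN 和 EPOLLIN 这类宏的数值相同，转换等价于原样返回；但在少数体系结构上（例如 mips、sparc 的 POLLWRBAND）两者并不相同。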

/*
 * Move one flag bit of v from bit position `from` to bit position `to`.
 * Intended for single-bit (power-of-two) constants: the selected bit is
 * scaled up by to/from when from < to, and scaled down by from/to otherwise
 * (from == to leaves it unchanged).
 *
 * Every parameter is parenthesised so that expression arguments such as
 * `a | b` expand with the intended precedence.
 */
#define __MAP(v, from, to) \
        ((from) < (to) ? ((v) & (from)) * ((to) / (from)) \
                       : ((v) & (from)) / ((from) / (to)))

/*
 * Convert a userland POLL* event bitmap (u16) into the kernel-internal
 * EPOLL* representation (__poll_t).  Each flag is translated on its own
 * with __MAP() and the results are OR-ed together.
 */
static inline __poll_t demangle_poll(u16 val)
{
        __poll_t events = 0;

        events |= (__force __poll_t)__MAP(val, POLLIN, (__force __u16)EPOLLIN);
        events |= (__force __poll_t)__MAP(val, POLLOUT, (__force __u16)EPOLLOUT);
        events |= (__force __poll_t)__MAP(val, POLLPRI, (__force __u16)EPOLLPRI);
        events |= (__force __poll_t)__MAP(val, POLLERR, (__force __u16)EPOLLERR);
        events |= (__force __poll_t)__MAP(val, POLLNVAL, (__force __u16)EPOLLNVAL);
        events |= (__force __poll_t)__MAP(val, POLLRDNORM,
                                          (__force __u16)EPOLLRDNORM);
        events |= (__force __poll_t)__MAP(val, POLLRDBAND,
                                          (__force __u16)EPOLLRDBAND);
        events |= (__force __poll_t)__MAP(val, POLLWRNORM,
                                          (__force __u16)EPOLLWRNORM);
        events |= (__force __poll_t)__MAP(val, POLLWRBAND,
                                          (__force __u16)EPOLLWRBAND);
        events |= (__force __poll_t)__MAP(val, POLLHUP, (__force __u16)EPOLLHUP);
        events |= (__force __poll_t)__MAP(val, POLLRDHUP, (__force __u16)EPOLLRDHUP);
        events |= (__force __poll_t)__MAP(val, POLLMSG, (__force __u16)EPOLLMSG);

        return events;
}

参考

select 源码分析，上一篇写的关于 select 的分析，有一些关于 poll 结构和文件回调的分析。