Linux System Programming 学习笔记(四) 高级I/O

1. Scatter/Gather I/O

a single system call to read or write data between single data stream and multiple buffers

This type of I/O is so named because the data is scattered into or gathered from the given vector of buffers

Scatter/Gather I/O 相比于 C标准I/O有如下优势：

(1)更自然的编码模式

如果数据是自然分隔的(预先定义的结构体字段)，那么 Scatter/Gather I/O 更直观

(2)效率高

A single vectored I/O operation can replace multiple linear I/O operations

(3)性能好

减少了系统调用数

(4)原子性

Scatter/Gather I/O 是原子的，而 multiple linear I/O operations 则有多个进程交替I/O的风险

#include <sys/uio.h>

struct iovec {

       void *iov_base;    /* pointer to start of buffer */

       size_t iov_len;    /* size of buffer in bytes */

};

Both functions always operate on the segments in order, starting with iov[0], then iov[1], and so on, through iov[count-1].

/* The readv() function reads count segments from the file descriptor fd into the buffers described by iov */

ssize_t readv (int fd, const struct iovec *iov, int count);

/* The writev() function writes at most count segments from the buffers described by iov into the file descriptor fd */

ssize_t writev (int fd, const struct iovec *iov, int count);

注意：在Scatter/Gather I/O操作过程中，内核必须分配内部数据结构来表示每个buffer分段，正常情况下，是根据分段数count进行动态内存分配的，

但是当分段数count较小时(一般<=8)，内核直接在内核栈上分配，这显然比在堆中动态分配要快

#include <stdio.h>

#include <unistd.h>

#include <sys/types.h>

#include <sys/stat.h>

#include <fcntl.h>

#include <string.h>

#include <sys/uio.h>

int main(int argc, char* argv[])

{

    struct iovec iov[];

    char* buf[] = {

        "The term buccaneer comes from the word boucan.\n",

        "A boucan is a wooden frame used for cooking meat.\n",

        "Buccaneer is the West Indies name for a pirate.\n" };

    int fd = open("wel.txt", O_WRONLY | O_CREAT | O_TRUNC);

    if (fd == -) {

        fprintf(stderr, "open error\n");

        return ;

    }

    /* fill out three iovec structures */

    for (int i = ; i < ; ++i) {

        iov[i].iov_base = buf[i];

        iov[i].iov_len  = strlen(buf[i]) + ;

    }

    /* with a single call, write them out all */

    ssize_t nwrite = writev(fd, iov, );

    if (nwrite == -) {

        fprintf(stderr, "writev error\n");

        return ;

    }

    fprintf(stdout, "wrote %d bytes\n", nwrite);

    if (close(fd)) {

        fprintf(stdout, "close error\n");

        return ;

    }

    return ;

}

#include <stdio.h>

#include <unistd.h>

#include <string.h>

#include <fcntl.h>

#include <sys/uio.h>

#include <sys/stat.h>

int main(int argc, char* argv[])

{

    char foo[], bar[], baz[];

    struct iovec iov[];

    int fd = open("wel.txt", O_RDONLY);

    if (fd == -) {

        fprintf(stderr, "open error\n");

        return ;

    }

    /* set up our iovec structures */

    iov[].iov_base = foo;

    iov[].iov_len =  sizeof(foo);

    iov[].iov_base = bar;

    iov[].iov_len =  sizeof(bar);

    iov[].iov_base = baz;

    iov[].iov_len =  sizeof(baz);

    /* read into the structures with a single call */

    ssize_t nread = readv(fd, iov, );

    if (nread == -) {

        fprintf(stderr, "readv error\n");

        return ;

    }

    for (int i = ; i < ; ++i) {

        fprintf(stdout, "%d: %s", i, (char*)iov[i].iov_base);

    }

    if (close(fd)) {

        fprintf(stderr, "close error\n");

        return ;

    }

    return ;

}

writev的简单实现：

#include <unistd.h>

#include <sys/uio.h>

ssize_t my_writev(int fd, const struct iovec* iov, int count)

{

    ssize_t ret = ;

    for (int i = ; i < count; ++i) {

        ssize_t nr = write(fd, iov[i].iov_base, iov[i].iov_len);

        if (nr == -) {

            if (errno == EINTR)

                continue;

            ret -= ;

            break;

        }

        ret += nr;

    }

    return nr;

}

In fact, all I/O inside the Linux kernel is vectored; read() and write() are implemented as vectored I/O with a vector of only one segment

2. epoll

poll 和 select 每次调用都需要监测整个文件描述符集，当描述符集增大时，每次调用对描述符集全局扫描就成了性能瓶颈

(1) creating a new epoll instance

/* A successful call to epoll_create1() instantiates a new epoll instance and returns a file descriptor associated with the instance */

#include <sys/epoll.h>

int epoll_create(int size);

parameter size used to provide a hint about the number of file descriptors to be watched;

nowadays the kernel dynamically sizes the required data structures and this parameter just needs to be greater than zero

(2) controling epoll

/* The epoll_ctl() system call can be used to add file descriptors to and remove file descriptors from a given epoll context */

#include <sys/epoll.h>

int epoll_ctl(int epfd, int op, int fd, struct epoll_event* event);

struct epoll_event {

    __u32 events;    /* events */

    union {

              void* ptr;

              int     fd;

              __u32 u32;

              __u64 u64;

    } data;

};

a. op parameter

EPOLL_CTL_ADD   // Add a monitor on the file associated with the file descriptor fd to the epoll instance associated with epfd

EPOLL_CTL_DEL  // Remove a monitor on the file associated with the file descriptor fd from the epoll instance associated with epfd

EPOLL_CTL_MOD  //  Modify an existing monitor of fd with the updated events specified by event

b. event parameter

EPOLLET     // Enables edge-triggered behavior for the monitor of the file ,The default behavior is level-triggered

EPOLLIN     // The file is available to be read from without blocking

EPOLLOUT  // The file is available to be written to without blocking

对于结构体struct epoll_event 里的data成员，通常做法是将data联合体里的fd设置为第二个参数fd，即 event.data.fd = fd

To add a new watch on the file associated with fd to the epoll instance epfd :

#include <sys/epoll.h>

struct epoll_event event;

event.data.fd = fd;

event.events = EPOLLIN | EPOLLOUT

int ret = epll_ctl(epfd, EPOLL_CTL_ADD, fd, &event);

if (ret) {

    fprintf(stderr, "epll_ctl error\n");

}

To modify an existing event on the file associated with fd on the epoll instance epfd :

#include <sys/epoll.h>

struct epoll_event event;

event.data.fd = fd;

event.events = EPOLLIN;

int ret = epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &event);

if (ret) {

    fprintf(stderr, "epoll_ctl error\n");

}

To remove an existing event on the file associated with fd from the epoll instance epfd :

#include <sys/epoll.h>

struct epoll_event event;

int ret = epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &event);

if (ret) {

    fprintf(stderr, "epoll_ctl error\n");

}

(3) waiting for events with epoll

#include <sys/epoll.h>

int epoll_wait(int epfd, struct epoll_event* events, int maxevents, int timeout);

The return value is the number of events, or −1 on error

#include <sys/epoll.h>

#define MAX_EVENTS 64

struct epoll_event* events = malloc(sizeof(struct epoll_event) * MAX_EVENTS);

if (events == NULL) {

    fprintf(stdout, "malloc error\n");

    return ;

}

int nready = epoll_wait(epfd, events, MAX_EVENTS, -);

if (nready < ) {

    fprintf(stderr, "epoll_wait error\n");

    free(events);

    return ;

}

for (int i = ; i < nready; ++i) {

    fprintf(stdout, "event=%ld on fd=%d\n", events[i].events, events[i].data.fd);

    /* we now can operate on events[i].data.fd without blocking */

}

free(events);

(4) Level-Triggered vs. Edge-Triggered

考虑一个生产者和一个消费者通过Unix 管道通信：

a. 生产者向管道写 1KB数据

b. 消费者在管道上执行 epoll_wait, 等待管道中数据可读

如果是水平触发，epoll_wait 调用会立即返回，表明管道已经读操作就绪

如果是边缘触发，epoll_wait 调用不会立即返回，直到步骤a 发生，也就是说，即使管道在eoll_wait调用的时候是可读的，该调用也不会立即返回直至数据已经写到管道。

select和epoll的默认工作方式是水平触发，边缘触发通常是应用于 non-blocking I/O, 并且必须小心检查EAGAIN

3. Mapping Files into Memory

/* A call to mmap() asks the kernel to map len bytes of the object represented by the file  descriptor fd, 
   starting at offset bytes into the file, into memory
*/

#include <sys/mman.h>

void* mmap(void* addr, size_t len, int prot, int flags, int fd, off_t offset);

void* ptr = mmap(, len, PROT_READ, MAP_SHARED, fd, );

参数addr和offset必须是内存页大小的整数倍

如果文件大小不是内存页大小的整数倍，那么就存在填补为0的空洞，读这段区域将返回0，写这段区域将不会影响到被映射的文件内容

Because addr and offset are usually 0, this requirement is not overly difficult to meet

int munmap (void *addr, size_t len);

munmap() removes any mappings that contain pages located anywhere in the process address space starting at addr,

which must be page-aligned, and continuing for len bytes

#include <stdio.h>

#include <unistd.h>

#include <sys/types.h>

#include <sys/stat.h>

#include <sys/mman.h>

#include <fcntl.h>

int main(int argc, char* argv[])

{

    if (argc < ) {

        fprintf(stderr, "usage:%s <file>\n", argv[]);

        return ;

    }

    int fd = open(argv[], O_RDONLY);

    if (fd == -) {

        fprintf(stderr, "open error\n");

        return ;

    }

    struct stat sbuf;

    if (fsat(fd, &sbuf) == -) {

        fprintf(stderr, "fstat error\n");

        return ;

    }

    if (!S_ISREG(sbuf.st_mode)) {

        fprintf(stderr, "%s is not a file\n", argv[]);

        return ;

    }

    void* ptr = mmap(, sbuf.st_size, PROT_READ, MAP_SHARED, fd, );

    if (ptr == MAP_FAILED) {

        fprintf(stderr, "mmap error\n");

        return ;

    }

    if (close(fd)) {

        fprintf(stderr, "close error\n");

        return ;

    }

    for (int i = ; i < sbuf.st_size; ++i) {

        fputc(ptr[i], stdout);

    }

    if (munmap(ptr, sbuf.st_size) == -) {

        fprintf(stderr, "munmap error\n");

        return ;

    }

    return ;

}

mmap的优点：

(1) 读写内存映射文件可以避免 read 和 write 系统调用发生的拷贝

(2) 除了潜在的内存页错误，读写内存映射文件不会有 read 和 write 系统调用时的上下文切换开销

(3) 当多个进程映射相同内存时，内存数据可以在多个进程之间共享

(4) 内存映射定位时只需要平常的指针操作，而不需要 lseek 系统调用

mmap的缺点：

(1) 内存映射必须是页大小的整数倍，通常在调用返回的内存页和被映射的实际文件之间存在剩余空间，例如内存页大小是4KB，那么映射7Byte文件将浪费4089Byte内存

(2) 内存映射必须适合进程的地址空间，32位地址空间，如果有大量的内存映射，将产生碎片，将很难寻找到连续的大区域

(3) 创建和维护内存映射和内核相关数据结构有一定开销

文件和映射内存之间的同步：

#include <sys/mman.h>

int msync (void *addr, size_t len, int flags);

msync() flushes back to disk any changes made to a file mapped via mmap(), synchronizing the mapped file with the mapping

Without invocation of msync(), there is no guarantee that a dirty mapping will be written back to disk until the file is unmapped

4. 同步异步

5. I/O调度和I/O性能

a single disk seek can average over 8 milliseconds，25 million times longer than a single processor cycle!

I/O schedulers perform two basic operations: merging and sorting：

Merging is the process of taking two or more adjacent I/O requests and combining them into a single request

假设一个请求读磁盘块5，另一个请求读磁盘块6和7，调度器可以合并成单一请求读磁盘块5到7，这将减少一半的I/O操作数

Sorting is the process of arranging pending I/O requests in ascending block order (尽可能保证顺序读写，而不是随机读写)

If an I/O scheduler always sorted new requests by the order of insertion，it would be possible to starve requests to

far-off blocks indefinitely (I/O调度算法，避免饥饿现象)

使用电梯调度算法避免某个I/O请求长时间得不到响应

Linus Elevator I/O scheduler

The Deadline I/O Scheduler

The Anticipatory I/O Scheduler

The CFQ I/O Scheduler

The Noop I/O Scheduler

Sorting by inode number is the most commonly used method for scheduling I/O requests in user space