【转】Linux Writeback机制分析
1. bdi是什么?
bdi,即是backing device info的缩写,顾名思义它描述备用存储设备相关描述信息,这在内核代码里用一个结构体backing_dev_info来表示。
bdi,备用存储设备,简单点说就是能够用来存储数据的设备,而这些设备存储的数据能够保证在计算机电源关闭时也不丢失。这样说来,软盘存储设备、光驱存储设备、USB存储设备、硬盘存储设备都是所谓的备用存储设备(后面都用bdi来指示),而内存显然不是
2. bdi工作模型
相对于内存来说,bdi设备(比如最常见的硬盘存储设备)的读写速度是非常慢的,因此为了提高系统整体性能,Linux系统对bdi设备的读写内容进行了缓冲,那些读写的数据会临时保存在内存里,以避免每次都直接操作bdi设备,但这就需要在一定的时机(比如每隔5秒、脏数据达到的一定的比率等)把它们同步到bdi设备,否则长久的呆在内存里容易丢失(比如机器突然宕机、重启),而进行间隔性同步工作的进程之前名叫pdflush,但后来在Kernel 2.6.2x/3x对此进行了优化改进,产生有多个内核进程,bdi-default、flush-x:y等。
关于以前的pdflush不再多说,我们这里只讨论bdi-default和flush-x:y,这两个进程(事实上,flush-x:y为多个)的关系为父与子的关系,即bdi-default根据当前的状态Create或Destroy flush-x:y,x为块设备类型,y为此类设备的序号。如有两个TF卡,则分别为:flush-179:0、flush-179:1。
一般而言,一个Linux系统会挂载很多bdi设备,在bdi设备注册(函数:bdi_register(…))时,这些bdi设备会以链表的形式组织在全局变量bdi_list下,除了一个比较特别的bdi设备以外,它就是default bdi设备(default_backing_dev_info),它除了被加进到bdi_list,还会新建一个bdi-default内核进程,即本文的主角。具体代码如下,我相信你一眼就能注意到kthread_run和list_add_tail_rcu这样的关键代码。
- struct backing_dev_info default_backing_dev_info = {
- .name = "default",
- .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
- .state = 0,
- .capabilities = BDI_CAP_MAP_COPY,
- };
- EXPORT_SYMBOL_GPL(default_backing_dev_info);
- static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
- {
- return bdi == &default_backing_dev_info;
- }
- int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...)
- {
- va_list args;
- struct device *dev;
- if (bdi->dev) /* The driver needs to use separate queues per device */
- return 0;
- va_start(args, fmt);
- dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
- va_end(args);
- if (IS_ERR(dev))
- return PTR_ERR(dev);
- bdi->dev = dev;
- /*
- * Just start the forker thread for our default backing_dev_info,
- * and add other bdi's to the list. They will get a thread created
- * on-demand when they need it.
- */
- if (bdi_cap_flush_forker(bdi)) {
- struct bdi_writeback *wb = &bdi->wb;
- wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
- dev_name(dev));
- if (IS_ERR(wb->task))
- return PTR_ERR(wb->task);
- }
- bdi_debug_register(bdi, dev_name(dev));
- set_bit(BDI_registered, &bdi->state);
- spin_lock_bh(&bdi_lock);
- list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
- trace_writeback_bdi_register(bdi);
- return 0;
- }
- EXPORT_SYMBOL(bdi_register);
接着跟进函数bdi_forker_thread,它是bdi-default内核进程的主体:
- static int bdi_forker_thread(void *ptr)
- {
- struct bdi_writeback *me = ptr;
- current->flags |= PF_SWAPWRITE;
- set_freezable();
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(current, 0);
- for (;;) {
- struct task_struct *task = NULL;
- struct backing_dev_info *bdi;
- enum {
- NO_ACTION, /* Nothing to do */
- FORK_THREAD, /* Fork bdi thread */
- KILL_THREAD, /* Kill inactive bdi thread */
- } action = NO_ACTION;
- /*
- * Temporary measure, we want to make sure we don't see
- * dirty data on the default backing_dev_info
- */
- if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
- del_timer(&me->wakeup_timer);
- wb_do_writeback(me, 0);
- }
- spin_lock_bh(&bdi_lock);
- /*
- * In the following loop we are going to check whether we have
- * some work to do without any synchronization with tasks
- * waking us up to do work for them. Set the task state here
- * so that we don't miss wakeups after verifying conditions.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- /* 遍历所有的bdi对象,检查这些bdi是否存在脏数据,如果有脏数据,那么需要为其fork线程,然后做writeback操作 */
- list_for_each_entry(bdi, &bdi_list, bdi_list) {
- bool have_dirty_io;
- if (!bdi_cap_writeback_dirty(bdi) ||
- bdi_cap_flush_forker(bdi))
- continue;
- WARN(!test_bit(BDI_registered, &bdi->state),
- "bdi %p/%s is not registered!\n", bdi, bdi->name);
- /* 检查是否存在脏数据 */
- have_dirty_io = !list_empty(&bdi->work_list) ||
- wb_has_dirty_io(&bdi->wb);
- /*
- * If the bdi has work to do, but the thread does not
- * exist - create it.
- */
- if (!bdi->wb.task && have_dirty_io) {
- /*
- * Set the pending bit - if someone will try to
- * unregister this bdi - it'll wait on this bit.
- */
- /* 如果有脏数据,并且不存在线程,那么接下来做线程的FORK操作 */
- set_bit(BDI_pending, &bdi->state);
- action = FORK_THREAD;
- break;
- }
- spin_lock(&bdi->wb_lock);
- /*
- * If there is no work to do and the bdi thread was
- * inactive long enough - kill it. The wb_lock is taken
- * to make sure no-one adds more work to this bdi and
- * wakes the bdi thread up.
- */
- /* 如果一个bdi长时间没有脏数据,那么执行线程的KILL操作,结束掉该bdi对应的writeback线程 */
- if (bdi->wb.task && !have_dirty_io &&
- time_after(jiffies, bdi->wb.last_active +
- bdi_longest_inactive())) {
- task = bdi->wb.task;
- bdi->wb.task = NULL;
- spin_unlock(&bdi->wb_lock);
- set_bit(BDI_pending, &bdi->state);
- action = KILL_THREAD;
- break;
- }
- spin_unlock(&bdi->wb_lock);
- }
- spin_unlock_bh(&bdi_lock);
- /* Keep working if default bdi still has things to do */
- if (!list_empty(&me->bdi->work_list))
- __set_current_state(TASK_RUNNING);
- /* 执行线程的FORK和KILL操作 */
- switch (action) {
- case FORK_THREAD:
- /* FORK一个bdi_writeback_thread线程,该线程的名字为flush-major:minor */
- __set_current_state(TASK_RUNNING);
- task = kthread_create(bdi_writeback_thread, &bdi->wb,
- "flush-%s", dev_name(bdi->dev));
- if (IS_ERR(task)) {
- /*
- * If thread creation fails, force writeout of
- * the bdi from the thread. Hopefully 1024 is
- * large enough for efficient IO.
- */
- writeback_inodes_wb(&bdi->wb, 1024,
- WB_REASON_FORKER_THREAD);
- } else {
- /*
- * The spinlock makes sure we do not lose
- * wake-ups when racing with 'bdi_queue_work()'.
- * And as soon as the bdi thread is visible, we
- * can start it.
- */
- spin_lock_bh(&bdi->wb_lock);
- bdi->wb.task = task;
- spin_unlock_bh(&bdi->wb_lock);
- wake_up_process(task);
- }
- bdi_clear_pending(bdi);
- break;
- case KILL_THREAD:
- /* KILL一个线程 */
- __set_current_state(TASK_RUNNING);
- kthread_stop(task);
- bdi_clear_pending(bdi);
- break;
- case NO_ACTION:
- /* 如果没有可执行的动作,那么调度本线程睡眠一段时间 */
- if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
- /*
- * There are no dirty data. The only thing we
- * should now care about is checking for
- * inactive bdi threads and killing them. Thus,
- * let's sleep for longer time, save energy and
- * be friendly for battery-driven devices.
- */
- schedule_timeout(bdi_longest_inactive());
- else
- schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
- try_to_freeze();
- break;
- }
- }
- return 0;
- }
3. bdi相关数据结构
在bdi数据结构中定义了一个writeback对象,该对象是对writeback内核线程的描述,并且封装了需要处理的inode队列。在bdi数据结构中有一条work_list,该work队列维护了writeback内核线程需要处理的任务。如果该队列上没有work可以处理,那么writeback内核线程将会睡眠等待。
writeback
writeback对象封装了内核线程task以及需要处理的inode队列。当page cache/buffer cache需要刷新radix tree上的inode时,可以将该inode挂载到writeback对象的b_dirty队列上,然后唤醒writeback线程。在处理过程中,inode会被移到b_io队列上进行处理。多条链表的方式可以降低多线程之间的资源共享。writeback数据结构具体定义如下:
- struct bdi_writeback {
- struct backing_dev_info *bdi; /* our parent bdi */
- unsigned int nr;
- unsigned long last_old_flush; /* last old data flush */
- unsigned long last_active; /* last time bdi thread was active */
- struct task_struct *task; /* writeback thread */
- struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
- struct list_head b_dirty; /* dirty inodes */
- struct list_head b_io; /* parked for writeback */
- struct list_head b_more_io; /* parked for more writeback */
- spinlock_t list_lock; /* protects the b_* lists */
- };
writeback work
wb_writeback_work数据结构是对writeback任务的封装,不同的任务可以采用不同的刷新策略。writeback线程的处理对象就是writeback_work。如果writeback_work队列为空,那么内核线程就可以睡眠了。
Writeback_work的数据结构定义如下:
- struct wb_writeback_work {
- long nr_pages;
- struct super_block *sb; /* superblock对象 */
- unsigned long *older_than_this;
- enum writeback_sync_modes sync_mode;
- unsigned int tagged_writepages:1;
- unsigned int for_kupdate:1;
- unsigned int range_cyclic:1;
- unsigned int for_background:1;
- enum wb_reason reason; /* why was writeback initiated? */
- struct list_head list; /* pending work list,链入bdi-> work_list队列 */
- struct completion *done; /* set if the caller waits,work完成时通知调用者 */
- };
4. writeback主要函数分析
writeback机制的主要函数包括如下两个方面:
1. 管理bdi对象并且fork相应的writeback内核线程处理cache数据的刷新工作。
2. writeback内核线程处理函数,实现dirty page的刷新操作
writeback线程管理
Linux中有一个内核守护线程,该线程用来管理系统bdi队列,并且负责为block device创建writeback thread。当bdi中有dirty page并且还没有为bdi分配内核线程的时候,bdi_forker_thread程序会为其分配线程资源;当一个writeback线程长时间处于空闲状态时,bdi_forker_thread程序会释放该线程资源。
转自:http://blog.csdn.net/myarrow/article/details/8918944
【转】Linux Writeback机制分析的更多相关文章
- 【原创】Linux信号量机制分析
背景 Read the fucking source code! --By 鲁迅 A picture is worth a thousand words. --By 高尔基 说明: Kernel版本: ...
- 【原创】Linux Mutex机制分析
背景 Read the fucking source code! --By 鲁迅 A picture is worth a thousand words. --By 高尔基 说明: Kernel版本: ...
- linux dpm机制分析(上)【转】
转自:http://blog.csdn.net/lixiaojie1012/article/details/23707681 1 DPM介绍 1.1 Dpm: 设备电源管理, ...
- linux dpm机制分析(下)【转】
转自:http://blog.csdn.net/lixiaojie1012/article/details/23707901 1 设备注册到dpm_list路径 (Platform_devi ...
- Linux mips64r2 PCI中断路由机制分析
Linux mips64r2 PCI中断路由机制分析 本文主要分析mips64r2 PCI设备中断路由原理和irq号分配实现方法,并尝试回答如下问题: PCI设备驱动中断注册(request_irq) ...
- Linux信号(signal) 机制分析
Linux信号(signal) 机制分析 [摘要]本文分析了Linux内核对于信号的实现机制和应用层的相关处理.首先介绍了软中断信号的本质及信号的两种不同分类方法尤其是不可靠信号的原理.接着分析了内核 ...
- Linux内核态抢占机制分析(转)
Linux内核态抢占机制分析 http://blog.sina.com.cn/s/blog_502c8cc401012pxj.html 摘 要]本文首先介绍非抢占式内核(Non-Preemptive ...
- Linux进程组调度机制分析【转】
转自:http://oenhan.com/task-group-sched 又碰到一个神奇的进程调度问题,在系统重启过程中,发现系统挂住了,过了30s后才重新复位,真正系统复位的原因是硬件看门狗重启的 ...
- Linux x86_64 APIC中断路由机制分析
不同CPU体系间的中断控制器工作原理有较大差异,本文是<Linux mips64r2 PCI中断路由机制分析>的姊妹篇,主要分析Broadwell-DE X86_64 APIC中断路由原理 ...
随机推荐
- ruby4种比较符号
The == comparison checks whether two values are equal eql? checks if two values are equal and of the ...
- CentOS7 MongoDB安裝
查看MongoDB的最新版官方下载地址: https://www.mongodb.com/download-center#community 使用wget命令下载安装包 ? 1 wget https: ...
- java从控制台读取数据的方式
1. Scanner sc = new Scanner(System.in); String s = sc.readLine(); 2. BufferReader br = new BufferRea ...
- gc 日志格式
对于 Parallel Scavenge + Parallel Old - 新生代:PSYoungGen - 老年代:ParOldGen - 永久代:PSPermGen 后面方括号内部的 " ...
- 算法库:jpeglib和pnglib安装配置
类似于OpenCV的安装配置.只不过OpenCV有编译好的,而jpeglib和pnglib需要自己编译.其实,若要跟踪OpenCV的源码或要使用OpenCV的扩展包,OpenCV也得自己编译. Ope ...
- Error Dropping Database (Can't rmdir '.test\', errno: 17)
MySql 删除数据库出错:Can't rmdir '.\test\', errno: 17 到test数据下的所在的目前data\test目录,删除掉所有的文件后,就可以删除数据了
- Eclipse自定义Ant版本
changed the ANT_HOME in the Windows>Preferences>Ant>Runtime>Classpath>Ant Home>浏览文 ...
- (转)读取XML数据到treeView中
原文地址,只为收藏:http://www.cnblogs.com/ylwn817/archive/2011/12/15/2288512.html /// <summary> ...
- relatedTarget、fromElement、toElement之间的关系
在发生mouseover和mouseout事件时会把鼠标指针从一个元素的边界之内移到另一个元素边界之内.对 mouseover事件而言,事件的target是获得光标的元素,而relatedTarget ...
- namespace用法
1.在WCF.Controller中定义了一个UserModel,标记为① 2.在WCF.Controller.Model中定义了一个UserModel(同上,namespace不同),标记为② 3. ...