Linux内核之页面换出详解
kswapd 线程主要用于页面的定期换出,接下来说说 kswapd 线程的实现。
int inactive_shortage(void){int shortage = 0;//系统应该维持的物理内存由xxxhigh跟target维持//实际的由下面3个函数统计,如果没法满足那就返回正数shortage += freepages.high;shortage += inactive_target;shortage -= nr_free_pages();shortage -= nr_inactive_clean_pages();shortage -= nr_inactive_dirty_pages;if (shortage > 0)return shortage;return 0;}
/** Check if there are zones with a severe shortage of free pages,* or if all zones have a minor shortage.*/int free_shortage(void){pg_data_t *pgdat = pgdat_list;//节点int sum = 0;int freeable = nr_free_pages() + nr_inactive_clean_pages();//实际空闲int freetarget = freepages.high + inactive_target / 3;//理论空闲//实际小于理论,直接返回差值,表示需要扩充/* Are we low on free pages globally? */if (freeable < freetarget)return freetarget - freeable;/* If not, are we very low on any particular zone? */do {int i;for(i = 0; i < MAX_NR_ZONES; i++) {zone_t *zone = pgdat->node_zones+ i;//获取管理区if (zone->size && (zone->inactive_clean_pages +zone->free_pages < zone->pages_min+1)) {//空闲页面+干净不活跃页面是否小于最低水准/* + 1 to have overlap with alloc_pages() !! */sum += zone->pages_min + 1;sum -= zone->free_pages;sum -= zone->inactive_clean_pages;}}pgdat = pgdat->node_next;} while (pgdat);return sum;}
/*** refill_inactive_scan - scan the active list and find pages to deactivate* @priority: the priority at which to scan* @oneshot: exit after deactivating one page** This function will scan a portion of the active list to find* unused pages, those pages will then be moved to the inactive list.*///据priority的值扫描队列一部分页面,priority为0时才全部扫描int refill_inactive_scan(unsigned int priority, int oneshot){struct list_head * page_lru;struct page * page;int maxscan, page_active = 0;//maxscan控制扫描页面数目int ret = 0;/* Take the lock while messing with the list... */spin_lock(&pagemap_lru_lock);maxscan = nr_active_pages >> priority;while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {page = list_entry(page_lru, struct page, lru);/* Wrong page on list?! (list corruption, should not happen) */if (!PageActive(page)) {//扫描的页面必须是在活跃队列中printk("VM: refill_inactive, wrong page on list.\n");list_del(page_lru);nr_active_pages--;continue;}/* 判断页面是否受到访问,,决定增加或减少寿命,如果减少寿命到0,那说明此页面很久都没访问了Do aging on the pages. */if (PageTestandClearReferenced(page)) {age_page_up_nolock(page);page_active = 1;} else {age_page_down_ageonly(page);/** Since we don't hold a reference on the page* ourselves, we have to do our test a bit more* strict then deactivate_page(). This is needed* since otherwise the system could hang shuffling* unfreeable pages from the active list to the* inactive_dirty list and back again...** SUBTLE: we can have buffer pages with count 1.*///缓冲页面如果引用计数大于1,说明还要用户空间映射,不能转为不活跃页面if (page->age == 0 && page_count(page) <=(page->buffers ? 2 : 1)) {deactivate_page_nolock(page);page_active = 0;} else {page_active = 1;}}/** If the page is still on the active list, move it* to the other end of the list. Otherwise it was* deactivated by age_page_down and we exit successfully.*/if (page_active || PageActive(page)) {list_del(page_lru);//如果页面还是活跃的,就放入活跃尾部list_add(page_lru, &active_list);} else {ret = 1;if (oneshot)//根据oneshot参数选择是否继续扫描一次break;}}spin_unlock(&pagemap_lru_lock);return ret;}
/*
 * do_try_to_free_pages - one round of memory reclaim.
 * @gfp_mask: allocation flags, forwarded to the reclaim helpers
 * @user: forwarded to page_launder() (as its sync argument) and to
 *        refill_inactive()
 *
 * Step 1: if free pages are short, or there are more inactive-dirty pages
 * than free + inactive-clean pages, call page_launder() to clean
 * inactive-dirty pages so they become allocatable.
 *
 * Step 2: if memory is still short, shrink the dentry and inode caches and
 * refill the inactive list from the active list.  Otherwise just reap the
 * slab caches and report success.
 *
 * Returns the accumulated progress count, or 1 on the slab-reap path.
 */
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
	int progress = 0;

	if (free_shortage() ||
	    nr_inactive_dirty_pages > nr_free_pages() + nr_inactive_clean_pages())
		progress += page_launder(gfp_mask, user);

	if (!free_shortage() && !inactive_shortage()) {
		/* Nothing is short any more: reclaim unused slab cache memory. */
		kmem_cache_reap(gfp_mask);
		return 1;
	}

	/*
	 * Still short.  Move pages from the active list to the inactive
	 * list, and also "eat" pages from the dentry and inode caches.
	 * Pages released by those caches are not freed immediately; they
	 * stay on the LRU lists as a reserve.
	 */
	shrink_dcache_memory(6, gfp_mask);
	shrink_icache_memory(6, gfp_mask);
	progress += refill_inactive(gfp_mask, user);

	return progress;
}
int page_launder(int gfp_mask, int sync){int launder_loop, maxscan, cleaned_pages, maxlaunder;int can_get_io_locks;struct list_head * page_lru;struct page * page;/** We can only grab the IO locks (eg. for flushing dirty* buffers to disk) if __GFP_IO is set.*/can_get_io_locks = gfp_mask & __GFP_IO;launder_loop = 0;maxlaunder = 0;cleaned_pages = 0;dirty_page_rescan:spin_lock(&pagemap_lru_lock);maxscan = nr_inactive_dirty_pages;//避免重复处理同一页面,设定的变量//对不活跃脏页面队列扫描while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&maxscan-- > 0) {page = list_entry(page_lru, struct page, lru);/* Wrong page on list?! (list corruption, should not happen) */if (!PageInactiveDirty(page)) {检查其标志是否为1printk("VM: page_launder, wrong page on list.\n");list_del(page_lru);//从队列中删除nr_inactive_dirty_pages--;page->zone->inactive_dirty_pages--;continue;}/* 到了脏队列,由于可能受到访问,就会放入活跃页面队列Page is or was in use? Move it to the active list. */if (PageTestandClearReferenced(page) || page->age > 0 ||(!page->buffers && page_count(page) > 1) ||page_ramdisk(page)) {del_page_from_inactive_dirty_list(page);//删除非活跃队列add_page_to_active_list(page);//加入到活跃队列中continue;}/*页面是否被锁住,是的话表示把它移到队列尾部* The page is locked. IO in progress?* Move it to the back of the list.*/if (TryLockPage(page)) {list_del(page_lru);list_add(page_lru, &inactive_dirty_list);continue;}/** Dirty swap-cache page? Write it out if* last copy..*/if (PageDirty(page)) {//是脏页面int (*writepage)(struct page *) = page->mapping->a_ops->writepage;int result;if (!writepage)//如果没有提供具体写swp的函数,则放入活跃队列中goto page_active;/*判断是否是第一次扫描,是的话就移到队列尾部,继续 First time through? Move it to the back of the list */if (!launder_loop) {list_del(page_lru);list_add(page_lru, &inactive_dirty_list);UnlockPage(page);continue;}/* OK, do a physical asynchronous write to swap. 
*/ClearPageDirty(page);//清除page结构的_dirty位,防止再次写入page_cache_get(page);//增加page->count表示多了一个用户操作此//页面,因为kswap线程把这个页面写出到swp设备中spin_unlock(&pagemap_lru_lock);result = writepage(page);page_cache_release(page);//count--完成了写入操作//所以就用户--了/* And re-start the thing.. */spin_lock(&pagemap_lru_lock);if (result != 1)//写入失败的话continue;/* writepage refused to do anything */set_page_dirty(page);//又设置为脏页goto page_active;}/** 如果页面不是脏的然后又是用于缓存文件读写的页面*/if (page->buffers) {int wait, clearedbuf;int freed_page = 0;/** Since we might be doing disk IO, we have to* drop the spinlock and take an extra reference* on the page so it doesn't go away from under us.*/del_page_from_inactive_dirty_list(page);//脱离脏队列page_cache_get(page);//表示kswap进程需要作用于page,count++spin_unlock(&pagemap_lru_lock);/* Will we do (asynchronous) IO? */if (launder_loop && maxlaunder == 0 && sync)wait = 2; /* Synchrounous IO */else if (launder_loop && maxlaunder-- > 0)wait = 1; /* Async IO */elsewait = 0; /* No IO *//*试图将页面释放,这里是count减一 Try to free the page buffers. */clearedbuf = try_to_free_buffers(page, wait);/** Re-take the spinlock. Note that we cannot* unlock the page yet since we're still* accessing the page_struct here...*/spin_lock(&pagemap_lru_lock);/* 不能释放或者说释放失败继续放入脏队列The buffers were not freed. */if (!clearedbuf) {add_page_to_inactive_dirty_list(page);/*/*页面只在buffer cache队列中,而不在某个文件的inode->i_mapping中,这样的页有超级块,索引节点位图等等,它们不属于某个文件,因此我们就成功释放了一个页面*/
如果该页面只用于缓存,而非映射The page was only in the buffer cache. */} else if (!page->mapping) {atomic_dec(&buffermem_pages);freed_page = 1;cleaned_pages++;/* *否则这个页面还在某个文件的inode->i_mapping中,并且还有超过2个用户(the cache and us)在访问它,例如有多个进程映射到该文件如果该页有几个用户,加入到活跃队列中The page has more users besides the cache and us. */} else if (page_count(page) > 2) {add_page_to_active_list(page);/* 最后,只剩下page->mapping && page_count(page) == 2,说明虽然这个页面还在某个inode->i_mapping中,但是已经没有任何用户在访问他们了,因此可以释放该页面OK, we "created" a freeable page. */} else /* page->mapping && page_count(page) == 2 */ {add_page_to_inactive_clean_list(page);cleaned_pages++;}/** Unlock the page and drop the extra reference.* We can only do it here because we ar accessing* the page struct above.*/UnlockPage(page);page_cache_release(page);//最终释放页面到空闲队列缓存中/** If we're freeing buffer cache pages, stop when* we've got enough free memory.释放了一个页面,并且系统内存不再紧缺,那就停止*/if (freed_page && !free_shortage())break;continue;//页面不再是脏页面,并且属于address_space红} else if (page->mapping && !PageDirty(page)) {/** If a page had an extra reference in* deactivate_page(), we will find it here.* Now the page is really freeable, so we* move it to the inactive_clean list.*/del_page_from_inactive_dirty_list(page);//转移到不活跃队列中add_page_to_inactive_clean_list(page);UnlockPage(page);cleaned_pages++;} else {page_active:/** OK, we don't know what to do with the page.* It's no use keeping it here, so we move it to* the active list.*/del_page_from_inactive_dirty_list(page);add_page_to_active_list(page);UnlockPage(page);}}spin_unlock(&pagemap_lru_lock);/** If we don't have enough free pages, we loop back once* to queue the dirty pages for writeout. 
When we were called* by a user process (that /needs/ a free page) and we didn't* free anything yet, we wait synchronously on the writeout of* MAX_SYNC_LAUNDER pages.** We also wake up bdflush, since bdflush should, under most* loads, flush out the dirty pages before we have to wait on* IO.*///如果内存继续紧缺,那就二次扫描一趟if (can_get_io_locks && !launder_loop && free_shortage()) {launder_loop = 1;/* If we cleaned pages, never do synchronous IO. */if (cleaned_pages)sync = 0;/* We only do a few "out of order" flushes. */maxlaunder = MAX_LAUNDER;/* Kflushd takes care of the rest. */wakeup_bdflush(0);goto dirty_page_rescan;}/* Return the number of pages moved to the inactive_clean list. */return cleaned_pages;//返回有多少页面被移到不活跃干净页面中}
/** We need to make the locks finer granularity, but right* now we need this so that we can do page allocations* without holding the kernel lock etc.** We want to try to free "count" pages, and we want to* cluster them so that we get good swap-out behaviour.** OTOH, if we're a user process (and not kswapd), we* really care about latency. In that case we don't try* to free too many pages.*/static int refill_inactive(unsigned int gfp_mask, int user){int priority, count, start_count, made_progress;count = inactive_shortage() + free_shortage();//获取需要的页面数目if (user)count = (1 << page_cluster);start_count = count;/* 任何时候,当页面紧缺时,从slab开始回收Always trim SLAB caches when memory gets low. */kmem_cache_reap(gfp_mask);priority = 6;//从最低优先级别6开始do {made_progress = 0;//每次循环都要检查下当前进程是否被设置被调度,设置了,说明某个中断程序需要调度if (current->need_resched) {__set_current_state(TASK_RUNNING);schedule();}//扫描活跃页面队列,试图从中找出可以转入不活跃状态页面while (refill_inactive_scan(priority, 1)) {made_progress = 1;if (--count <= 0)goto done;}/** don't be too light against the d/i cache since* refill_inactive() almost never fail when there's* really plenty of memory free.*/shrink_dcache_memory(priority, gfp_mask);shrink_icache_memory(priority, gfp_mask);/*试图找出一个进程,扫描其映射表,找到可以转入不活跃状态页面* Then, try to page stuff out..*/while (swap_out(priority, gfp_mask)) {made_progress = 1;if (--count <= 0)goto done;}/** If we either have enough free memory, or if* page_launder() will be able to make enough* free memory, then stop.*/if (!inactive_shortage() || !free_shortage())goto done;/** Only switch to a lower "priority" if we* didn't make any useful progress in the* last loop.*/if (!made_progress)priority--;} while (priority >= 0);/* Always end on a refill_inactive.., may sleep... */while (refill_inactive_scan(0, 1)) {if (--count <= 0)goto done;}done:return (count < start_count);}
static int swap_out(unsigned int priority, int gfp_mask){int counter;//循环次数int __ret = 0;/** We make one or two passes through the task list, indexed by* assign = {0, 1}:* Pass 1: select the swappable task with maximal RSS that has* not yet been swapped out.* Pass 2: re-assign rss swap_cnt values, then select as above.** With this approach, there's no need to remember the last task* swapped out. If the swap-out fails, we clear swap_cnt so the* task won't be selected again until all others have been tried.** Think of swap_cnt as a "shadow rss" - it tells us which process* we want to page out (always try largest first).*///根据内核中进程的个数跟调用swap_out的优先级计算得到的counter = (nr_threads << SWAP_SHIFT) >> priority;if (counter < 1)counter = 1;for (; counter >= 0; counter--) {struct list_head *p;unsigned long max_cnt = 0;struct mm_struct *best = NULL;int assign = 0;int found_task = 0;select:spin_lock(&mmlist_lock);p = init_mm.mmlist.next;for (; p != &init_mm.mmlist; p = p->next) {struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);if (mm->rss <= 0)continue;found_task++;/* Refresh swap_cnt? 
*/if (assign == 1) {////增加这层判断目的是,但我们找不到mm->swap_cnt不为0的mm时候,我们就会设置assign=1,然后再从新扫描一遍,此次就会直接把内存页面数量赋值给尚未考察页面数量,从而从新刷新一次,这样我们就会从最富有的进程开始下手,mm->swap_cnt用于保证我们所说的轮流坐庄,mm->rss则是保证劫富济贫第二轮循环,将mm->rss拷贝到mm_swap_cnt,从最大的开始继续mm->swap_cnt = (mm->rss >> SWAP_SHIFT);//记录一次轮换中尚未内存页面尚未考察的数量if (mm->swap_cnt < SWAP_MIN)mm->swap_cnt = SWAP_MIN;}if (mm->swap_cnt > max_cnt) {max_cnt = mm->swap_cnt;best = mm;}}///从循环退出来,我们就找到了最大的mm->swap_cnt的mm/* Make sure it doesn't disappear */if (best)atomic_inc(&best->mm_users);spin_unlock(&mmlist_lock);/** We have dropped the tasklist_lock, but we* know that "mm" still exists: we are running* with the big kernel lock, and exit_mm()* cannot race with us.*/if (!best) {if (!assign && found_task > 0) {//第一次进入,表示所有进程mm->swap_cnt都为0,第2次不会再进入了,一般不会出现第2次assign = 1;//第二轮循环goto select;}break;} else {//扫出一个最佳换出的进程,调用swap_out_mm__ret = swap_out_mm(best, gfp_mask);mmput(best);break;}}return __ret;}
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 *
 * In this function the only path that returns 1 is the one taken when
 * mm->swap_cnt is already zero; every other path returns 0.
 * gfp_mask is accepted but not used in the body.
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t old_pte;
	swp_entry_t swp;
	struct page *pg;
	int was_active;

	/* Only ptes that map a present page are of interest. */
	old_pte = *page_table;
	if (!pte_present(old_pte))
		return 0;

	/* Skip invalid and reserved pages. */
	pg = pte_page(old_pte);
	if ((!VALID_PAGE(pg)) || PageReserved(pg))
		return 0;

	/* This mm has used up its quota for the current round. */
	if (!mm->swap_cnt)
		return 1;

	/* We are about to examine one page: charge it to the quota. */
	mm->swap_cnt--;

	was_active = PageActive(pg);

	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {
		age_page_up(pg);
		return 0;
	}

	/*
	 * A page that was not on the active list and was not accessed
	 * recently is still not unmapped right away; it is only aged
	 * down here and must reach age 0 first.
	 */
	if (!was_active)
		age_page_down_ageonly(pg);

	/*
	 * If the page is in active use by us, or if the page
	 * is in active use by others, don't unmap it or
	 * (worse) start unneeded IO.
	 */
	if (pg->age > 0)
		return 0;

	if (TryLockPage(pg))
		return 0;

	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	old_pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(pg)) {
		swp.val = pg->index;
		if (pte_dirty(old_pte))
			set_page_dirty(pg);
		goto install_swap_pte;
	}

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	flush_cache_page(vma, address);
	if (!pte_dirty(old_pte))
		goto release_page;

	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 */
	if (pg->mapping) {
		set_page_dirty(pg);
		goto release_page;
	}

	/*
	 * This is a dirty, swappable page.  First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	swp = get_swap_page();
	if (!swp.val) {
		/* No swap space left: put the original pte back. */
		set_pte(page_table, old_pte);
		UnlockPage(pg);
		return 0;
	}

	/* Add it to the swap cache and mark it dirty */
	add_to_swap_cache(pg, swp);
	set_page_dirty(pg);

install_swap_pte:
	/* Take a reference on the swap entry and store it in the pte. */
	swap_duplicate(swp);
	set_pte(page_table, swp_entry_to_pte(swp));

release_page:
	UnlockPage(pg);
	mm->rss--;			/* one less resident page for this mm */
	deactivate_page(pg);
	page_cache_release(pg);		/* drop the reference held via the pte */
	return 0;
}
Linux内核之页面换出详解的更多相关文章
- Linux内核数据结构之kfifo详解
本文分析的原代码版本: 2.6.24.4 kfifo的定义文件: kernel/kfifo.c kfifo的头文件: include/linux/kfifo.h kfifo是内核里面的一个First ...
- Linux 内核参数 arp_ignore & arp_announce 详解
arp_ignore定义了对目标地址为本机IP的ARP询问的不同应答模式. arp_announce对网络接口(网卡)上发出的ARP请求包中的源IP地址作出相应的限制:主机会根据这个参数值的不同选择使 ...
- Linux内核线程kernel thread详解--Linux进程的管理与调度(十)
内核线程 为什么需要内核线程 Linux内核可以看作一个服务进程(管理软硬件资源,响应用户进程的种种合理以及不合理的请求). 内核需要多个执行流并行,为了防止可能的阻塞,支持多线程是必要的. 内核线程 ...
- Linux内核线程kernel thread详解--Linux进程的管理与调度(十)【转】
转自:http://blog.csdn.net/gatieme/article/details/51589205 日期 内核版本 架构 作者 GitHub CSDN 2016-06-02 Linux- ...
- Linux内核中kzalloc函数详解
**************************************************************************************************** ...
- Linux内核中container_of函数详解
http://www.linuxidc.com/Linux/2016-08/134481.htm
- Linux内核源代码目录结构详解
http://blog.csdn.net/u013014440/article/details/44024207
- Linux进程上下文切换过程context_switch详解--Linux进程的管理与调度(二十一)
1 前景回顾 1.1 Linux的调度器组成 2个调度器 可以用两种方法来激活调度 一种是直接的, 比如进程打算睡眠或出于其他原因放弃CPU 另一种是通过周期性的机制, 以固定的频率运行, 不时的检测 ...
- Linux驱动开发必看详解神秘内核(完全转载)
Linux驱动开发必看详解神秘内核 完全转载-链接:http://blog.chinaunix.net/uid-21356596-id-1827434.html IT168 技术文档]在开始步入L ...
随机推荐
- POJ:2976-Dropping tests(二分平均值)
Dropping tests Time Limit: 1000MS Memory Limit: 65536K Total Submissions: 15508 Accepted: 5418 Descr ...
- [Bzoj2246]迷宫探险(概率+DP)
Description 题目链接 Solution 用三进制表示陷阱状态,1表示有害,2表示无害,0表示不知道 用\(f[S][i]\)表示状态为S时陷阱i有害的概率,这个可以预处理出 \(d[S][ ...
- Sqlite客户端的使用
打开一个数据库sqlite3 ${databaseName} 查看当前打开的数据库.database 查看当前打开的数据库中的表.table 查看指定表结构(实际输出是建表语句).schema ${t ...
- PHP.16-PDO
PHP 数据对象 (PDO) 扩展为PHP访问数据库定义了一个轻量级的一致接口.实现 PDO 接口的每个数据库驱动可以公开具体数据库的特性作为标准扩展功能. 注意利用 PDO 扩展自身并不能实现任何数 ...
- echart图表展示数据-简单的柱状图
话不多说,先上几张效果图 给大家看看 1:echart所用到的文件包需要事先引入好具体可见 http://echarts.baidu.com/doc/start.html 2:本例中所有的数据都是通过 ...
- css一些事儿
1. margin和padding 如果边界画一条线,则margin的属于边界外,padding属于边界内 当我们给元素背景色时,margin区域不会被着色,而padding区域会被着色. 当上下两个 ...
- 《Cracking the Coding Interview》——第17章:普通题——题目8
2014-04-28 23:35 题目:最大子数组和问题. 解法:O(n)解法. 代码: // 17.8 Find the consecutive subarray with maximum sum ...
- flask_入门教程之一
一.教程涉及开发语言.脚本.框架.数据库等内容 Python + Flask + requests 通过命令安装:pip install flask 二.创建第一个flask脚本 一个最小的 Flas ...
- C++ Primer 第2章 变量和基本类型
C++ Primer 第2章 变量和基本类型 C Primer 第2章 变量和基本类型 1 基本内置类型 算数类型 类型转换 字面值常量 2 变量 变量定义 3 复合类型 引用d左引用 指针d 4 c ...
- 团队冲刺Alpha(七)
目录 组员情况 组员1(组长):胡绪佩 组员2:胡青元 组员3:庄卉 组员4:家灿 组员5:凯琳 组员6:翟丹丹 组员7:何家伟 组员8:政演 组员9:黄鸿杰 组员10:刘一好 组员11:何宇恒 展示 ...