众所周知,进程调度使用schedule()函数来完成,下面我们从分析该函数开始,代码如下(kernel/sched/core.c):

 asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current; sched_submit_work(tsk);
__schedule();
}
EXPORT_SYMBOL(schedule);

第3行获取当前进程描述符指针,存放在本地变量tsk中。第6行调用__schedule(),代码如下(kernel/sched/core.c)。

 static void __sched __schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
int cpu; need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_note_context_switch(cpu);
prev = rq->curr; schedule_debug(prev); if (sched_feat(HRTICK))
hrtick_clear(rq); /*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock); switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP);
prev->on_rq = ; /*
* If a worker went to sleep, notify and ask workqueue
* whether it wants to wake up a task to maintain
* concurrency.
*/
if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup; to_wakeup = wq_worker_sleeping(prev, cpu);
if (to_wakeup)
try_to_wake_up_local(to_wakeup);
}
}
switch_count = &prev->nvcsw;
} if (prev->on_rq || rq->skip_clock_update < )
update_rq_clock(rq); next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->skip_clock_update = ; if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */
/*
* The context switch have flipped the stack from under us
* and restored the local variables which were saved when
* this task called schedule() in the past. prev == current
* is still correct, but it can be moved to another cpu/rq.
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
} else
raw_spin_unlock_irq(&rq->lock); post_schedule(rq); sched_preempt_enable_no_resched();
if (need_resched())
goto need_resched;
}

第9行禁止内核抢占。第10行获取当前的cpu号。第11行获取当前cpu的进程运行队列。第13行将当前进程的描述符指针保存在prev变量中。第55行将下一个被调度的进程描述符指针存放在next变量中。第56行清除当前进程的内核抢占标记。第60行判断当前进程和下一个调度的是不是同一个进程,如果不是的话,就要进行调度。第65行,对当前进程和下一个进程的上下文进行切换(调度之前要先切换上下文)。下面看看该函数(kernel/sched/core.c):

 context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); mm = next->mm;
oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
* one hypercall.
*/
arch_start_context_switch(prev); if (!mm) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next); if (!prev->mm) {
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
/*
* Since the runqueue lock will be released by the next
* task (which is an invalid locking op but in the case
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
spin_release(&rq->lock.dep_map, , _THIS_IP_);
#endif context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev); barrier();
/*
* this_rq must be evaluated again because prev may have moved
* CPUs since it called schedule(), thus the 'rq' on its stack
* frame will be invalid.
*/
finish_task_switch(this_rq(), prev);
}

上下文切换一般分为两个,一个是硬件上下文切换(指的是cpu寄存器,要把当前进程使用的寄存器内容保存下来,再把下一个程序的寄存器内容恢复),另一个是切换进程的地址空间(说白了就是程序代码)。进程的地址空间(程序代码)主要保存在进程描述符中的struct mm_struct结构体中,因此该函数主要是操作这个结构体。第17行如果被调度的下一个进程地址空间mm为空,说明下个进程是个线程,没有独立的地址空间,共用所属进程的地址空间,因此第18行将上个进程所使用的地址空间active_mm指针赋给下一个进程的该域,下一个进程也使用这个地址空间。第22行,如果下个进程地址空间不为空,说明下个进程有自己的地址空间,那么执行switch_mm切换进程页表。第40行切换进程的硬件上下文。 switch_to函数代码如下(arch/x86/include/asm/switch_to.h):

 #define switch_to(prev, next, last)                    \
do { \
/* \
* Context-switching clobbers all registers, so we clobber \
* them explicitly, via unused output variables. \
* (EAX and EBP is not listed because EBP is saved/restored \
* explicitly for wchan access and EAX is the return value of \
* __switch_to()) \
*/ \
unsigned long ebx, ecx, edx, esi, edi; \
\
asm volatile("pushfl\n\t" /* save flags */ \
"pushl %%ebp\n\t" /* save EBP */ \
"movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
"movl %[next_sp],%%esp\n\t" /* restore ESP */ \
"movl $1f,%[prev_ip]\n\t" /* save EIP */ \
"pushl %[next_ip]\n\t" /* restore EIP */ \
__switch_canary \
"jmp __switch_to\n" /* regparm call */ \
"1:\t" \
"popl %%ebp\n\t" /* restore EBP */ \
"popfl\n" /* restore flags */ \
\
/* output parameters */ \
: [prev_sp] "=m" (prev->thread.sp), \
[prev_ip] "=m" (prev->thread.ip), \
"=a" (last), \
\
/* clobbered output registers: */ \
"=b" (ebx), "=c" (ecx), "=d" (edx), \
"=S" (esi), "=D" (edi) \
\
__switch_canary_oparam \
\
/* input parameters: */ \
: [next_sp] "m" (next->thread.sp), \
[next_ip] "m" (next->thread.ip), \
\
/* regparm parameters for __switch_to(): */ \
[prev] "a" (prev), \
[next] "d" (next) \
\
__switch_canary_iparam \
\
: /* reloaded segment registers */ \
"memory"); \
} while ()

该函数中使用了内联汇编来完成进程的硬件上下文切换。第12-13行将eflags和ebp寄存器的值压栈,因为当进程再次切换回来后要用到这两个寄存器的值。第14行将当前进程的栈顶指针保存到进程的thread_info.sp中。第15行将下个进程的thread_info.sp中的值恢复到esp寄存器中,切换到下个进程的内核栈,至此,进程切换就完成了(进程内核栈的切换是进程切换的标志),后边代码的执行就是在新进程中进行。第16行将标号1所代表的地址存放到上个进程的thread_info.ip中,以后如果切换到上个进程,就从thread_info.ip所指向的代码处执行(实际上,你想让上个进程再次被切换到时从哪个指令开始执行,就将该指令的地址保存在上个进程的thread_info.ip中,进程的现场保护和函数调用时候的现场保护是有区别的,函数调用的现场保护是将寄存器的值压栈(毕竟堆栈没有切换),然后恢复现场时再将寄存器的值弹出来;进程切换的现场保护是将寄存器的值存入进程的thread_info结构中,当被切换掉的进程再次执行时,再从thread_info结构中恢复现场,毕竟进程切换了连内核堆栈都一同换掉了,所以必定要将进程的资源保存在和进程相关的数据结构中,才不会丢失而且容易被恢复)。第17行将当前进程的thread_info.ip压入内核栈中,一会要从这个ip指向的指令开始执行。第19行跳入到__switch_to函数中。下面看下__switch_to函数代码(arch/x86/kernel/process_32.c):

 __visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
fpu_switch_t fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ fpu = switch_fpu_prepare(prev_p, next_p, cpu); /*
* Reload esp0.
*/
load_sp0(tss, next); /*
* Save away %gs. No need to save %fs, as it was saved on the
* stack on entry. No need to save %es and %ds, as those are
* always kernel segments while inside the kernel. Doing this
* before setting the new TLS descriptors avoids the situation
* where we temporarily have non-reloadable segments in %fs
* and %gs. This could be an issue if the NMI handler ever
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
lazy_save_gs(prev->gs); /*
* Load the per-thread Thread-Local Storage descriptor.
*/
load_TLS(next, cpu); /*
* Restore IOPL if needed. In normal use, the flags restore
* in the switch assembly will handle this. But if the kernel
* is running virtualized at a non-zero CPL, the popf will
* not restore flags, so it must be done in a separate step.
*/
if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
set_iopl_mask(next->iopl); /*
* If it were not for PREEMPT_ACTIVE we could guarantee that the
* preempt_count of all tasks was equal here and this would not be
* needed.
*/
task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); /*
* Now maybe handle debug registers and/or IO bitmaps
*/
if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
__switch_to_xtra(prev_p, next_p, tss); /*
* Leave lazy mode, flushing any hypercalls made here.
* This must be done before restoring TLS segments so
* the GDT and LDT are properly updated, and must be
* done before math_state_restore, so the TS bit is up
* to date.
*/
arch_end_context_switch(next_p); this_cpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - KERNEL_STACK_OFFSET); /*
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
lazy_load_gs(next->gs); switch_fpu_finish(next_p, fpu); this_cpu_write(current_task, next_p); return prev_p;
}

该函数主要是对刚切换过来的新进程进一步做些初始化工作。比如第34将该进程使用的线程局部存储段(TLS)装入本地cpu的全局描述符表。第84行返回语句会被编译成两条汇编指令,一条是将返回值prev_p保存到eax寄存器,另外一个是ret指令,将内核栈顶的元素弹出eip寄存器,从这个eip指针处开始执行,也就是上个函数第17行所压入的那个指针。一般情况下,被压入的指针是上个函数第20行那个标号1所代表的地址,那么从__switch_to函数返回后,将从标号1处开始运行。

需要注意的是,对于已经被调度过的进程而言,从__switch_to函数返回后,将从标号1处开始运行;但是对于用fork(),clone()等函数刚创建的新进程(未调度过),将进入ret_from_fork()函数,因为do_fork()函数在创建好进程之后,会给进程的thread_info.ip赋予ret_from_fork函数的地址,而不是标号1的地址,因此它会跳入ret_from_fork函数。后边我们在分析fork系统调用的时候,就会看到。

linux进程调度函数浅析(基于3.16-rc4)的更多相关文章

  1. linux fork函数浅析

    #include <sys/types.h> #include <unistd.h> /* 功能:复制进程 參数:无 返回值: 成功: 父进程:返回子进程id 子进程:返回0 ...

  2. Linux Kernel ‘write_tag_3_packet()’函数本地基于堆的缓冲区溢出漏洞

    漏洞名称: Linux Kernel ‘write_tag_3_packet()’函数本地基于堆的缓冲区溢出漏洞 CNNVD编号: CNNVD-201311-067 发布时间: 2013-11-07 ...

  3. Linux进程调度原理

    Linux进程调度原理 Linux进程调度机制 Linux进程调度的目标 1.高效性:高效意味着在相同的时间下要完成更多的任务.调度程序会被频繁的执行,所以调度程序要尽可能的高效: 2.加强交互性能: ...

  4. Linux系统启动那些事—基于Linux 3.10内核【转】

    转自:https://blog.csdn.net/shichaog/article/details/40218763 Linux系统启动那些事—基于Linux 3.10内核 csdn 我的空间的下载地 ...

  5. Linux进程调度原理【转】

    转自:http://www.cnblogs.com/zhaoyl/archive/2012/09/04/2671156.html Linux进程调度的目标 1.高效性:高效意味着在相同的时间下要完成更 ...

  6. [转载]Linux进程调度原理

    [转载]Linux进程调度原理 Linux进程调度原理 Linux进程调度的目标 1.高效性:高效意味着在相同的时间下要完成更多的任务.调度程序会被频繁的执行,所以调度程序要尽可能的高效: 2.加强交 ...

  7. Linux模块机制浅析

    Linux模块机制浅析   Linux允许用户通过插入模块,实现干预内核的目的.一直以来,对linux的模块机制都不够清晰,因此本文对内核模块的加载机制进行简单地分析. 模块的Hello World! ...

  8. linux进程调度之 FIFO 和 RR 调度策略

    转载 http://blog.chinaunix.net/uid-24774106-id-3379478.html    linux进程调度之 FIFO 和 RR 调度策略 2012-10-19 18 ...

  9. Linux就这个范儿 第16章 谁都可以从头再来--从头开始编译一套Linux系统 nsswitch.conf配置文件

    Linux就这个范儿 第16章 谁都可以从头再来--从头开始编译一套Linux系统  nsswitch.conf配置文件 朋友们,今天我对你们说,在此时此刻,我们虽然遭受种种困难和挫折,我仍然有一个梦 ...

随机推荐

  1. 77. Combinations

    题目: Given two integers n and k, return all possible combinations of k numbers out of 1 ... n. For ex ...

  2. 浅析CDN安全

    目前CDN技术到处可见.像网宿.蓝讯.加速乐等都依靠CDN过活,连安全宝也都使用了CDN技术,当然很多域名空间商现在也提供CDN服务.从以往互联网的发展上看,CDN是个趋势,很多厂商也都多多少少购买了 ...

  3. 值得珍藏的.NET源码,不保存就没机会了

    很早以前,我们通过http://referencesource.microsoft.com/netframework.aspx可以下载到.NET的各版本公开源码,但如今,微软对sscli项目进行了改版 ...

  4. 奇怪的transform bug

    对一个元素使用transform:rotate 进行旋转,造成: 父元素的背景图位置偏移,往下降,背景图也会变模糊一些 造成重绘,导致该元素后面的兄弟元素受到影响,变得模糊,并且无法遮盖住父元素的背景 ...

  5. C Socket Programming for Linux with a Server and Client Example Code

    Typically two processes communicate with each other on a single system through one of the following ...

  6. 完美配置Tomcat的HTTPS

    Tomcat配置HTTPS的文章到处都有,过程也比较简单,随后文中会转一段过来. 但对于启用APR情况下报异常“java.lang.Exception: Connector attribute SSL ...

  7. UserAccountInfo时间倒计时

    界面如下: 代码如下: using System;using System.Collections.Generic;using System.ComponentModel;using System.D ...

  8. Hibernate框架简述

    Hibernate的核心组件在基 于MVC设计模式的JAVA WEB应用中,Hibernate可以作为模型层/数据访问层.它通过配置文件(hibernate.properties或 hibernate ...

  9. 好!recover-binary-search-tree(难)& 两种好的空间O(n)解法 & 空间O(1)解法

    https://leetcode.com/mockinterview/session/result/xyc51it/https://leetcode.com/problems/recover-bina ...

  10. UVa 10305 (拓扑排序) Ordering Tasks

    题意: 经典的拓扑排序.有n个任务,然后某些任务必须安排在某些任务前面完成,输出一种满足要求的序列. 分析: 拓扑排序用离散里面的话来说就是将偏序关系拓展为全序关系.我们将“小于”这种关系看做一条有向 ...