LINUX是一个时间多路复用的系统,就是说,通过把CPU执行时间分成许多片,再分配给进程们使用,造成即使单CPU系统,也貌似允许多个任务在同时执行 。那么,时间片大小假设为100ms,过短过长,过长了有些不灵敏,过短了,连切换进程时可能都要消耗几毫秒的时间 。分给100个进程执行,在所有进程都用完自己的时间片后,需要重新给所有的进程重新分配时间片,怎么分配呢?for循环遍历所有的run状态进程,重设时间片?这个性能无法容忍!太慢了,跟当前系统进程数相关 。那么2.6内核怎么做的呢?它用了上面提到的两个优先级队列active和expired,顾名思义,active是还有时间片的进程队列,而expired是时间片耗尽必须重新分配时间片的进程队列 。
这么设计的好处就是不用再循环一遍所有进程重设时间片了,看看调度函数是怎么玩的:
array = rq->active; if (unlikely(!array->nr_active)) {/** Switch the active and expired arrays.*/schedstat_inc(rq, sched_switch);rq->active = rq->expired;rq->expired = array;array = rq->active;rq->expired_timestamp = 0;rq->best_expired_prio = MAX_PRIO; } elseschedstat_inc(rq, sched_noswitch);当所有运行进程的时间片都用完时,就把active和expired队列互换指针,没有遍历哦,而时间片耗尽的进程在出acitve队列入expired队列时,已经单独的重新分配好新时间片了 。
再看一下schedule(void)调度函数,当某个进程休眠或者被抢占时,系统就开始调试schedule(void)决定接下来运行哪个进程 。上面说过的东东都在这个函数里有体现哈 。
asmlinkage void __sched schedule(void){ long *switch_count; task_t *prev, *next; runqueue_t *rq; prio_array_t *array; struct list_head *queue; unsigned long long now; unsigned long run_time; int cpu, idx;/** Test if we are atomic.Since do_exit() needs to call into* schedule() atomically, we ignore that path for now.* Otherwise, whine if we are scheduling when we should not be.*/ if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) {先看看当前运行进程的状态if (unlikely(in_atomic())) {printk(KERN_ERR "scheduling while atomic: ""%s/0x%08x/%dn",current->comm, preempt_count(), current->pid);dump_stack();} } profile_hit(SCHED_PROFILING, __builtin_return_address(0));need_resched: preempt_disable(); prev = current; release_kernel_lock(prev);need_resched_nonpreemptible: rq = this_rq();这行找到这个CPU对应的runqueue,再次强调,每个CPU有一个自己的runqueue/** The idle thread is not allowed to schedule!* Remove this check after it has been exercised a bit.*/ if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) {printk(KERN_ERR "bad: scheduling from the idle thread!n");dump_stack(); }schedstat_inc(rq, sched_cnt); now = sched_clock(); if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))run_time = now - prev->timestamp; elserun_time = NS_MAX_SLEEP_AVG;/** Tasks with interactive credits get charged less run_time* at high sleep_avg to delay them losing their interactive* status*/ if (HIGH_CREDIT(prev))run_time /= (CURRENT_BONUS(prev) ? : 1);spin_lock_irq(&rq->lock);if (unlikely(current->flags & PF_DEAD))current->state = EXIT_DEAD; /** if entering off of a kernel preemption go straight* to picking the next task.*/ switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {switch_count = &prev->nvcsw;if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&unlikely(signal_pending(prev))))prev->state = TASK_RUNNING;else {if (prev->state == TASK_UNINTERRUPTIBLE)rq->nr_uninterruptible++;deactivate_task(prev, rq);} }cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) {go_idle:idle_balance(cpu, rq);if (!rq->nr_running) {next = rq->idle;rq->expired_timestamp = 0;wake_sleeping_dependent(cpu, rq);/** wake_sleeping_dependent() might have released* the runqueue, so break out if we got new* tasks meanwhile:*/if (!rq->nr_running)goto switch_tasks;} } else {if (dependent_sleeper(cpu, rq)) {next = rq->idle;goto switch_tasks;}/** dependent_sleeper() releases and reacquires the runqueue* lock, hence go into the idle loop if the rq went* empty meanwhile:*/if (unlikely(!rq->nr_running))goto go_idle; }array = rq->active; if (unlikely(!array->nr_active)) {上面说过的,需要重新计算时间片时,就用已经计算好的expired队列了/** Switch the active and expired arrays.*/schedstat_inc(rq, sched_switch);rq->active = rq->expired;rq->expired = array;array = rq->active;rq->expired_timestamp = 0;rq->best_expired_prio = MAX_PRIO; } elseschedstat_inc(rq, sched_noswitch);idx = sched_find_first_bit(array->bitmap);找到优先级最高的队列 queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list);if (!rt_task(next) && next->activated > 0) {unsigned long long delta = now - next->timestamp;if (next->activated == 1)delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;array = next->array;dequeue_task(next, array);recalc_task_prio(next, next->timestamp + delta);enqueue_task(next, array); } next->activated = 0;switch_tasks: if (next == rq->idle)schedstat_inc(rq, sched_goidle); prefetch(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev));prev->sleep_avg -= run_time; if ((long)prev->sleep_avg <= 0) {prev->sleep_avg = 0;if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))prev->interactive_credit--; } prev->timestamp = prev->last_ran = now;sched_info_switch(prev, next); if (likely(prev != next)) {表面现在正在执行的进程,不是选出来的优先级最高的进程next->timestamp = now;rq->nr_switches++;rq->curr = next;++*switch_count;prepare_arch_switch(rq, next);prev = context_switch(rq, prev, next);所以需要完成进程上下文切换,把之前的进程信息CACHE住barrier();finish_task_switch(prev); } elsespin_unlock_irq(&rq->lock);prev = current; if (unlikely(reacquire_kernel_lock(prev) < 0))goto need_resched_nonpreemptible; preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))goto need_resched;}
推荐阅读
- 图解Linux网络包接收过程
- 理解了Linux I/O机制,才能真的明白“什么是多线程”
- Linux网络配置
- 带你重新认识Linux系统的inode
- 从Linux源码看Socket的listen及连接队列
- linux网络编程常见API详解
- 2 种从 Linux 终端下载文件的方法
- 在 Ubuntu 和其他 Linux 发行版上使用 Yarn
- Linux 软链接和硬链接
- Linux下diff的操作详解
