1 --- linux/fs/proc/proc_misc.c.orig Tue Feb 5 13:51:49 2002
2 +++ linux/fs/proc/proc_misc.c Tue Feb 5 13:52:12 2002
4 a = avenrun[0] + (FIXED_1/200);
5 b = avenrun[1] + (FIXED_1/200);
6 c = avenrun[2] + (FIXED_1/200);
7 - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
8 + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
9 LOAD_INT(a), LOAD_FRAC(a),
10 LOAD_INT(b), LOAD_FRAC(b),
11 LOAD_INT(c), LOAD_FRAC(c),
12 - nr_running, nr_threads, last_pid);
13 + nr_running(), nr_threads, last_pid);
14 return proc_calc_metrics(page, start, off, count, eof, len);
21 - idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
22 + idle = init_task.times.tms_utime + init_task.times.tms_stime;
24 /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
25 that would overflow about every five days at HZ == 100.
29 len += sprintf(page + len,
34 - kstat.context_swtch,
35 + nr_context_switches(),
36 xtime.tv_sec - jif / HZ,
39 --- linux/fs/proc/array.c.orig Tue Feb 5 13:51:45 2002
40 +++ linux/fs/proc/array.c Tue Feb 5 13:52:12 2002
43 /* scale priority and nice values from timeslices to -20..20 */
44 /* to make it look like a "normal" Unix priority/nice value */
45 - priority = task->counter;
46 - priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
48 + priority = task_prio(task);
49 + nice = task_nice(task);
51 read_lock(&tasklist_lock);
52 ppid = task->pid ? task->p_opptr->pid : 0;
62 --- linux/fs/nfs/pagelist.c.orig Tue Feb 5 13:51:50 2002
63 +++ linux/fs/nfs/pagelist.c Tue Feb 5 13:52:12 2002
66 if (signalled() && (server->flags & NFS_MOUNT_INTR))
67 return ERR_PTR(-ERESTARTSYS);
68 - current->policy = SCHED_YIELD;
73 /* Initialize the request struct. Initially, we assume a
74 --- linux/fs/ufs/truncate.c.orig Tue Feb 5 13:51:53 2002
75 +++ linux/fs/ufs/truncate.c Tue Feb 5 13:52:12 2002
77 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
78 ufs_sync_inode (inode);
79 run_task_queue(&tq_disk);
80 - current->policy |= SCHED_YIELD;
86 offset = inode->i_size & uspi->s_fshift;
88 --- linux/fs/reiserfs/buffer2.c.orig Tue Feb 5 13:51:51 2002
89 +++ linux/fs/reiserfs/buffer2.c Tue Feb 5 13:52:12 2002
91 buffer_journal_dirty(bh) ? ' ' : '!');
93 run_task_queue(&tq_disk);
94 - current->policy |= SCHED_YIELD;
98 if (repeat_counter > 30000000) {
99 reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ;
101 struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size)
103 struct buffer_head *result;
104 - PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
105 + PROC_EXP( unsigned int ctx_switches = nr_context_switches(); );
107 result = bread (super -> s_dev, n_block, n_size);
108 PROC_INFO_INC( super, breads );
109 - PROC_EXP( if( kstat.context_swtch != ctx_switches )
110 + PROC_EXP( if( nr_context_switches() != ctx_switches )
111 PROC_INFO_INC( super, bread_miss ) );
114 --- linux/fs/reiserfs/journal.c.orig Tue Feb 5 13:51:53 2002
115 +++ linux/fs/reiserfs/journal.c Tue Feb 5 13:52:12 2002
118 bn = allocate_bitmap_node(p_s_sb) ;
120 - current->policy |= SCHED_YIELD ;
126 --- linux/fs/jffs2/background.c.orig Tue Feb 5 13:51:47 2002
127 +++ linux/fs/jffs2/background.c Tue Feb 5 13:52:12 2002
130 sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
132 - /* FIXME in the 2.2 backport */
133 - current->nice = 10;
136 spin_lock_irq(&current->sigmask_lock);
137 siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
138 --- linux/fs/jbd/journal.c.orig Tue Feb 5 13:51:53 2002
139 +++ linux/fs/jbd/journal.c Tue Feb 5 13:52:12 2002
141 printk (KERN_NOTICE __FUNCTION__
142 ": ENOMEM at get_unused_buffer_head, "
144 - current->policy |= SCHED_YIELD;
149 /* keep subsequent assertions sane */
150 @@ -1541,8 +1540,7 @@
151 last_warning = jiffies;
154 - current->policy |= SCHED_YIELD;
160 @@ -1600,8 +1598,7 @@
161 last_warning = jiffies;
164 - current->policy |= SCHED_YIELD;
167 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
170 --- linux/fs/jbd/revoke.c.orig Tue Feb 5 13:51:53 2002
171 +++ linux/fs/jbd/revoke.c Tue Feb 5 13:52:12 2002
173 if (!journal_oom_retry)
175 jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
176 - current->policy |= SCHED_YIELD;
182 --- linux/fs/jbd/transaction.c.orig Tue Feb 5 13:51:53 2002
183 +++ linux/fs/jbd/transaction.c Tue Feb 5 13:52:12 2002
184 @@ -1379,8 +1379,7 @@
186 old_handle_count = transaction->t_handle_count;
187 set_current_state(TASK_RUNNING);
188 - current->policy |= SCHED_YIELD;
191 } while (old_handle_count != transaction->t_handle_count);
194 --- linux/fs/binfmt_elf.c.orig Tue Feb 5 13:51:53 2002
195 +++ linux/fs/binfmt_elf.c Tue Feb 5 13:52:12 2002
196 @@ -1135,7 +1135,7 @@
198 psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
199 psinfo.pr_zomb = psinfo.pr_sname == 'Z';
200 - psinfo.pr_nice = current->nice;
201 + psinfo.pr_nice = task_nice(current);
202 psinfo.pr_flag = current->flags;
203 psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
204 psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
205 --- linux/fs/buffer.c.orig Tue Feb 5 13:51:53 2002
206 +++ linux/fs/buffer.c Tue Feb 5 13:52:12 2002
209 try_to_free_pages(zone, GFP_NOFS, 0);
210 run_task_queue(&tq_disk);
211 - current->policy |= SCHED_YIELD;
212 __set_current_state(TASK_RUNNING);
217 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
218 --- linux/fs/locks.c.orig Tue Feb 5 13:51:45 2002
219 +++ linux/fs/locks.c Tue Feb 5 13:52:12 2002
221 /* Let the blocked process remove waiter from the
222 * block list when it gets scheduled.
224 - current->policy |= SCHED_YIELD;
228 /* Remove waiter from the block list, because by the
229 * time it wakes up blocker won't exist any more.
230 --- linux/init/main.c.orig Tue Feb 5 13:51:53 2002
231 +++ linux/init/main.c Tue Feb 5 13:52:12 2002
233 extern void setup_arch(char **);
234 extern void cpu_idle(void);
236 -unsigned long wait_init_idle;
240 #ifdef CONFIG_X86_LOCAL_APIC
241 @@ -495,34 +493,24 @@
242 APIC_init_uniprocessor();
245 -#define smp_init() do { } while (0)
246 +#define smp_init() do { } while (0)
252 /* Called by boot processor to activate the rest. */
253 static void __init smp_init(void)
255 /* Get other processors into their bootup holding patterns. */
257 - wait_init_idle = cpu_online_map;
258 - clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
263 - /* Wait for the other cpus to set up their idle processes */
264 - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
265 - while (wait_init_idle) {
269 - printk("All processors have done init_idle\n");
276 * We need to finalize in a non-__init function or else race conditions
277 * between the root thread and the init thread may cause start_kernel to
280 kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
282 - current->need_resched = 1;
289 * Activate the first processor.
290 @@ -617,14 +604,18 @@
295 printk("POSIX conformance testing by UNIFIX\n");
298 - * We count on the initial thread going ok
299 - * Like idlers init is an unlocked kernel thread, which will
300 - * make syscalls (and thus be locked).
301 + init_idle(current, smp_processor_id());
303 + * We count on the initial thread going ok
304 + * Like idlers init is an unlocked kernel thread, which will
305 + * make syscalls (and thus be locked).
309 + /* Do the rest non-__init'ed, we're now alive */
316 pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
318 - while (pid != wait(&i)) {
319 - current->policy |= SCHED_YIELD;
324 + while (pid != wait(&i))
326 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
327 || MINOR(real_root_dev) != 0) {
328 error = change_root(real_root_dev,"/initrd");
329 --- linux/kernel/sched.c.orig Tue Feb 5 13:51:51 2002
330 +++ linux/kernel/sched.c Tue Feb 5 13:52:12 2002
331 @@ -12,333 +12,306 @@
332 * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
336 - * 'sched.c' is the main kernel file. It contains scheduling primitives
337 - * (sleep_on, wakeup, schedule etc) as well as a number of simple system
338 - * call functions (type getpid()), which just extract a field from
342 -#include <linux/config.h>
343 #include <linux/mm.h>
344 +#include <linux/nmi.h>
345 #include <linux/init.h>
346 +#include <asm/uaccess.h>
347 #include <linux/smp_lock.h>
348 -#include <linux/nmi.h>
349 #include <linux/interrupt.h>
350 -#include <linux/kernel_stat.h>
351 -#include <linux/completion.h>
352 -#include <linux/prefetch.h>
353 -#include <linux/compiler.h>
355 -#include <asm/uaccess.h>
356 #include <asm/mmu_context.h>
358 -extern void timer_bh(void);
359 -extern void tqueue_bh(void);
360 -extern void immediate_bh(void);
361 +#include <linux/kernel_stat.h>
364 - * scheduler variables
365 + * Priority of a process goes from 0 to 139. The 0-99
366 + * priority range is allocated to RT tasks, the 100-139
367 + * range is for SCHED_OTHER tasks. Priority values are
368 + * inverted: lower p->prio value means higher priority.
371 -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
373 -extern void mem_use(void);
374 +#define MAX_RT_PRIO 100
375 +#define MAX_PRIO (MAX_RT_PRIO + 40)
378 - * Scheduling quanta.
380 - * NOTE! The unix "nice" value influences how long a process
381 - * gets. The nice value ranges from -20 to +19, where a -20
382 - * is a "high-priority" task, and a "+10" is a low-priority
385 - * We want the time-slice to be around 50ms or so, so this
386 - * calculation depends on the value of HZ.
387 + * Convert user-nice values [ -20 ... 0 ... 19 ]
388 + * to static priority [ 100 ... 139 (MAX_PRIO-1) ],
392 -#define TICK_SCALE(x) ((x) >> 2)
394 -#define TICK_SCALE(x) ((x) >> 1)
396 -#define TICK_SCALE(x) (x)
398 -#define TICK_SCALE(x) ((x) << 1)
400 -#define TICK_SCALE(x) ((x) << 2)
403 -#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
404 +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
405 +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
406 +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
409 + * 'User priority' is the nice value converted to something we
410 + * can work with better when scaling various scheduler parameters,
411 + * it's a [ 0 ... 39 ] range.
413 +#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
414 +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
415 +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
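As a quick sanity check of the conversion macros above (taking MAX_RT_PRIO == 100 and MAX_PRIO == 140 exactly as defined):

	/*
	 * NICE_TO_PRIO(-20) == 100	PRIO_TO_NICE(100) == -20
	 * NICE_TO_PRIO(  0) == 120	PRIO_TO_NICE(120) ==   0
	 * NICE_TO_PRIO(+19) == 139	PRIO_TO_NICE(139) == +19
	 *
	 * so SCHED_OTHER static priorities span [100 ... 139], and
	 * USER_PRIO() maps them onto [0 ... 39] (MAX_USER_PRIO == 40).
	 */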
418 - * Init task must be ok at boot for the ix86 as we will check its signals
419 - * via the SMP irq return path.
420 + * These are the 'tuning knobs' of the scheduler:
422 + * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
423 + * maximum timeslice is 300 msecs. Timeslices get refilled after
427 -struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
428 +#define MIN_TIMESLICE ( 10 * HZ / 1000)
429 +#define MAX_TIMESLICE (300 * HZ / 1000)
430 +#define CHILD_PENALTY 95
431 +#define PARENT_PENALTY 100
432 +#define EXIT_WEIGHT 3
433 +#define PRIO_BONUS_RATIO 25
434 +#define INTERACTIVE_DELTA 2
435 +#define MAX_SLEEP_AVG (2*HZ)
436 +#define STARVATION_LIMIT (2*HZ)
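For concreteness, on an HZ == 100 kernel (the usual 2.4-era default) these knobs work out to:

	/*
	 * MIN_TIMESLICE    ==   1 tick  ( 10 ms)
	 * MAX_TIMESLICE    ==  30 ticks (300 ms)
	 * MAX_SLEEP_AVG    == 200 ticks (  2 s)
	 * STARVATION_LIMIT == 200 ticks (  2 s)
	 */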
439 - * The tasklist_lock protects the linked list of processes.
440 + * If a task is 'interactive' then we reinsert it in the active
441 + * array after it has expired its current timeslice. (it will not
442 + * continue to run immediately, it will still roundrobin with
443 + * other interactive tasks.)
445 - * The runqueue_lock locks the parts that actually access
446 - * and change the run-queues, and have to be interrupt-safe.
447 + * This part scales the interactivity limit depending on niceness.
449 - * If both locks are to be concurrently held, the runqueue_lock
450 - * nests inside the tasklist_lock.
451 + * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
452 + * Here are a few examples of different nice levels:
454 - * task->alloc_lock nests inside tasklist_lock.
455 + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
456 + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
457 + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
458 + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
459 + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
461 + * (the X axis represents the possible -5 ... 0 ... +5 dynamic
462 + * priority range a task can explore, a value of '1' means the
463 + * task is rated interactive.)
465 + * Ie. nice +19 tasks can never get 'interactive' enough to be
466 + * reinserted into the active array. And only heavily CPU-hog nice -20
467 + * tasks will be expired. Default nice 0 tasks are somewhere between,
468 + * it takes some effort for them to get interactive, but it's not
471 -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
472 -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
474 -static LIST_HEAD(runqueue_head);
475 +#define SCALE(v1,v1_max,v2_max) \
476 + (v1) * (v2_max) / (v1_max)
479 - * We align per-CPU scheduling data on cacheline boundaries,
480 - * to prevent cacheline ping-pong.
483 - struct schedule_data {
484 - struct task_struct * curr;
485 - cycles_t last_schedule;
487 - char __pad [SMP_CACHE_BYTES];
488 -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
490 + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
493 -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
494 -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
495 +#define TASK_INTERACTIVE(p) \
496 + ((p)->prio <= (p)->static_prio - DELTA(p))
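Assuming the elided tail of DELTA() simply adds INTERACTIVE_DELTA to the scaled nice value, a default nice-0 task (static_prio 120) should work out as:

	/*
	 * DELTA(p) == SCALE(0, 40, 40*25/100) + 2 == 0 + 2 == 2
	 * TASK_INTERACTIVE(p)  <=>  p->prio <= 120 - 2 == 118
	 *
	 * i.e. only dynamic priorities 115..118 (a sleep bonus of +2 or
	 * better) are rated interactive - matching the four leading 1s in
	 * the TASK_INTERACTIVE(0) row above.
	 */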
498 -struct kernel_stat kstat;
499 -extern struct task_struct *child_reaper;
501 + * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
502 + * to time slice values.
504 + * The higher a process's priority, the bigger timeslices
505 + * it gets during one round of execution. But even the lowest
506 + * priority process gets MIN_TIMESLICE worth of execution time.
510 +#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
511 + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
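Plugging the nice levels back in (again assuming HZ == 100, so MIN_TIMESLICE == 1 tick and MAX_TIMESLICE == 30 ticks):

	/*
	 * nice -20 (static_prio 100): 1 + 29*39/39 == 30 ticks (300 ms)
	 * nice   0 (static_prio 120): 1 + 29*19/39 == 15 ticks (150 ms)
	 * nice +19 (static_prio 139): 1 + 29* 0/39 ==  1 tick  ( 10 ms)
	 *
	 * which reproduces the 10/150/300 msec figures quoted in the
	 * tuning-knobs comment above.
	 */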
513 -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
514 -#define can_schedule(p,cpu) \
515 - ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
517 + * These are the runqueue data structures:
521 +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
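Worked out, BITMAP_SIZE leaves room for MAX_PRIO priority bits plus the delimiter bit that sched_init() sets at MAX_PRIO:

	/*
	 * bytes needed:  (140 + 1 + 7) / 8            == 18
	 * 32-bit longs:  (18 + 4 - 1) / 4 == 5  longs == 160 bits
	 * 64-bit longs:  (18 + 8 - 1) / 8 == 3  longs == 192 bits
	 *
	 * either way comfortably covering the 141 bits required.
	 */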
523 -#define idle_task(cpu) (&init_task)
524 -#define can_schedule(p,cpu) (1)
525 +typedef struct runqueue runqueue_t;
529 -void scheduling_functions_start_here(void) { }
534 + unsigned long bitmap[BITMAP_SIZE];
535 + list_t queue[MAX_PRIO];
539 - * This is the function that decides how desirable a process is..
540 - * You can weigh different processes against each other depending
541 - * on what CPU they've run on lately etc to try to handle cache
542 - * and TLB miss penalties.
543 + * This is the main, per-CPU runqueue data structure.
546 - * -1000: never select this
547 - * 0: out of time, recalculate counters (but it might still be
549 - * +ve: "goodness" value (the larger, the better)
550 - * +1000: realtime process, select this.
551 + * Locking rule: those places that want to lock multiple runqueues
552 + * (such as the load balancing or the process migration code), lock
553 + * acquire operations must be ordered by ascending &runqueue.
557 + unsigned long nr_running, nr_switches, expired_timestamp;
558 + task_t *curr, *idle;
559 + prio_array_t *active, *expired, arrays[2];
560 + int prev_nr_running[NR_CPUS];
561 +} ____cacheline_aligned;
563 -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
566 +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
569 - * select the current process after every other
570 - * runnable process, but before the idle thread.
571 - * Also, dont trigger a counter recalculation.
574 - if (p->policy & SCHED_YIELD)
576 +#define cpu_rq(cpu) (runqueues + (cpu))
577 +#define this_rq() cpu_rq(smp_processor_id())
578 +#define task_rq(p) cpu_rq((p)->cpu)
579 +#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
580 +#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
583 - * Non-RT process - normal case first.
585 - if (p->policy == SCHED_OTHER) {
587 - * Give the process a first-approximation goodness value
588 - * according to the number of clock-ticks it has left.
590 - * Don't do any other calculations if the time slice is
593 - weight = p->counter;
598 - /* Give a largish advantage to the same processor... */
599 - /* (this is equivalent to penalizing other processors) */
600 - if (p->processor == this_cpu)
601 - weight += PROC_CHANGE_PENALTY;
603 +static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags)
605 + struct runqueue *__rq;
607 - /* .. and a slight advantage to the current MM */
608 - if (p->mm == this_mm || !p->mm)
610 - weight += 20 - p->nice;
614 + spin_lock_irqsave(&__rq->lock, *flags);
615 + if (unlikely(__rq != task_rq(p))) {
616 + spin_unlock_irqrestore(&__rq->lock, *flags);
617 + goto repeat_lock_task;
623 - * Realtime process, select the first one on the
624 - * runqueue (taking priorities within processes
627 - weight = 1000 + p->rt_priority;
630 +static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags)
632 + spin_unlock_irqrestore(&rq->lock, *flags);
636 - * the 'goodness value' of replacing a process on a given CPU.
637 - * positive value means 'replace', zero or negative means 'dont'.
638 + * Adding/removing a task to/from a priority array:
640 -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
641 +static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
643 - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
644 + array->nr_active--;
645 + list_del_init(&p->run_list);
646 + if (list_empty(array->queue + p->prio))
647 + __clear_bit(p->prio, array->bitmap);
651 - * This is ugly, but reschedule_idle() is very timing-critical.
652 - * We are called with the runqueue spinlock held and we must
653 - * not claim the tasklist_lock.
655 -static FASTCALL(void reschedule_idle(struct task_struct * p));
656 +static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
658 + list_add_tail(&p->run_list, array->queue + p->prio);
659 + __set_bit(p->prio, array->bitmap);
660 + array->nr_active++;
664 -static void reschedule_idle(struct task_struct * p)
665 +static inline int effective_prio(task_t *p)
668 - int this_cpu = smp_processor_id();
669 - struct task_struct *tsk, *target_tsk;
670 - int cpu, best_cpu, i, max_prio;
671 - cycles_t oldest_idle;
674 - * shortcut if the woken up task's last CPU is
677 - best_cpu = p->processor;
678 - if (can_schedule(p, best_cpu)) {
679 - tsk = idle_task(best_cpu);
680 - if (cpu_curr(best_cpu) == tsk) {
684 - * If need_resched == -1 then we can skip sending
685 - * the IPI altogether, tsk->need_resched is
686 - * actively watched by the idle thread.
688 - need_resched = tsk->need_resched;
689 - tsk->need_resched = 1;
690 - if ((best_cpu != this_cpu) && !need_resched)
691 - smp_send_reschedule(best_cpu);
698 - * We know that the preferred CPU has a cache-affine current
699 - * process, lets try to find a new idle CPU for the woken-up
700 - * process. Select the least recently active idle CPU. (that
701 - * one will have the least active cache context.) Also find
702 - * the executing process which has the least priority.
704 - oldest_idle = (cycles_t) -1;
707 + * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
708 + * into the -5 ... 0 ... +5 bonus/penalty range.
710 + * We use 25% of the full 0...39 priority range so that:
712 + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
713 + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
715 + * Both properties are important to certain workloads.
717 + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
718 + MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
720 - for (i = 0; i < smp_num_cpus; i++) {
721 - cpu = cpu_logical_map(i);
722 - if (!can_schedule(p, cpu))
724 - tsk = cpu_curr(cpu);
725 + prio = p->static_prio - bonus;
726 + if (prio < MAX_RT_PRIO)
727 + prio = MAX_RT_PRIO;
728 + if (prio > MAX_PRIO-1)
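The bonus term above reduces to 10 * p->sleep_avg / MAX_SLEEP_AVG - 5, so for a nice-0 task (static_prio 120), assuming the elided lines clamp prio into [MAX_RT_PRIO, MAX_PRIO-1]:

	/*
	 * sleep_avg == 0               -> bonus == -5 -> prio == 125 (CPU hog)
	 * sleep_avg == MAX_SLEEP_AVG/2 -> bonus ==  0 -> prio == 120
	 * sleep_avg == MAX_SLEEP_AVG   -> bonus == +5 -> prio == 115 (interactive)
	 */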
733 +static inline void activate_task(task_t *p, runqueue_t *rq)
735 + unsigned long sleep_time = jiffies - p->sleep_timestamp;
736 + prio_array_t *array = rq->active;
738 + if (!rt_task(p) && sleep_time) {
740 - * We use the first available idle CPU. This creates
741 - * a priority list between idle CPUs, but this is not
743 + * This code gives a bonus to interactive tasks. We update
744 + * an 'average sleep time' value here, based on
745 + * sleep_timestamp. The more time a task spends sleeping,
746 + * the higher the average gets - and the higher the priority
747 + * boost gets as well.
749 - if (tsk == idle_task(cpu)) {
750 -#if defined(__i386__) && defined(CONFIG_SMP)
752 - * Check if two siblings are idle in the same
753 - * physical package. Use them if found.
755 - if (smp_num_siblings == 2) {
756 - if (cpu_curr(cpu_sibling_map[cpu]) ==
757 - idle_task(cpu_sibling_map[cpu])) {
758 - oldest_idle = last_schedule(cpu);
765 - if (last_schedule(cpu) < oldest_idle) {
766 - oldest_idle = last_schedule(cpu);
770 - if (oldest_idle == -1ULL) {
771 - int prio = preemption_goodness(tsk, p, cpu);
773 - if (prio > max_prio) {
782 - if (oldest_idle != -1ULL) {
783 - best_cpu = tsk->processor;
784 - goto send_now_idle;
786 - tsk->need_resched = 1;
787 - if (tsk->processor != this_cpu)
788 - smp_send_reschedule(tsk->processor);
789 + p->sleep_avg += sleep_time;
790 + if (p->sleep_avg > MAX_SLEEP_AVG)
791 + p->sleep_avg = MAX_SLEEP_AVG;
792 + p->prio = effective_prio(p);
796 + enqueue_task(p, array);
801 - int this_cpu = smp_processor_id();
802 - struct task_struct *tsk;
804 - tsk = cpu_curr(this_cpu);
805 - if (preemption_goodness(tsk, p, this_cpu) > 0)
806 - tsk->need_resched = 1;
808 +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
811 + dequeue_task(p, p->array);
815 +static inline void resched_task(task_t *p)
819 + need_resched = p->need_resched;
821 + p->need_resched = 1;
822 + if (!need_resched && (p->cpu != smp_processor_id()))
823 + smp_send_reschedule(p->cpu);
831 - * This has to add the process to the _beginning_ of the
832 - * run-queue, not the end. See the comment about "This is
833 - * subtle" in the scheduler proper..
834 + * Wait for a process to unschedule. This is used by the exit() and
837 -static inline void add_to_runqueue(struct task_struct * p)
838 +void wait_task_inactive(task_t * p)
840 - list_add(&p->run_list, &runqueue_head);
842 + unsigned long flags;
847 + while (unlikely(rq->curr == p)) {
851 + rq = lock_task_rq(p, &flags);
852 + if (unlikely(rq->curr == p)) {
853 + unlock_task_rq(rq, &flags);
856 + unlock_task_rq(rq, &flags);
859 -static inline void move_last_runqueue(struct task_struct * p)
861 + * The SMP message passing code calls this function whenever
862 + * the new task has arrived at the target CPU. We move the
863 + * new task into the local runqueue.
865 + * This function must be called with interrupts disabled.
867 +void sched_task_migrated(task_t *new_task)
869 - list_del(&p->run_list);
870 - list_add_tail(&p->run_list, &runqueue_head);
871 + wait_task_inactive(new_task);
872 + new_task->cpu = smp_processor_id();
873 + wake_up_process(new_task);
876 -static inline void move_first_runqueue(struct task_struct * p)
878 + * Kick the remote CPU if the task is running currently,
879 + * this code is used by the signal code to signal tasks
880 + * which are in user-mode as quickly as possible.
882 + * (Note that we do this lockless - if the task does anything
883 + * while the message is in flight then it will notice the
884 + * sigpending condition anyway.)
886 +void kick_if_running(task_t * p)
888 - list_del(&p->run_list);
889 - list_add(&p->run_list, &runqueue_head);
890 + if (p == task_rq(p)->curr)
896 * Wake up a process. Put it on the run-queue if it's not
897 @@ -348,392 +321,528 @@
898 * "current->state = TASK_RUNNING" to mark yourself runnable
899 * without the overhead of this.
901 -static inline int try_to_wake_up(struct task_struct * p, int synchronous)
902 +static int try_to_wake_up(task_t * p, int synchronous)
909 - * We want the common case fall through straight, thus the goto.
911 - spin_lock_irqsave(&runqueue_lock, flags);
912 + rq = lock_task_rq(p, &flags);
913 p->state = TASK_RUNNING;
914 - if (task_on_runqueue(p))
916 - add_to_runqueue(p);
917 - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
918 - reschedule_idle(p);
921 - spin_unlock_irqrestore(&runqueue_lock, flags);
923 + activate_task(p, rq);
924 + if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio))
925 + resched_task(rq->curr);
928 + unlock_task_rq(rq, &flags);
932 -inline int wake_up_process(struct task_struct * p)
933 +int wake_up_process(task_t * p)
935 return try_to_wake_up(p, 0);
938 -static void process_timeout(unsigned long __data)
939 +void wake_up_forked_process(task_t * p)
941 - struct task_struct * p = (struct task_struct *) __data;
942 + runqueue_t *rq = this_rq();
944 - wake_up_process(p);
945 + p->state = TASK_RUNNING;
948 + * We decrease the sleep average of forking parents
949 + * and children as well, to keep max-interactive tasks
950 + * from forking tasks that are max-interactive.
952 + current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
953 + p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
954 + p->prio = effective_prio(p);
956 + spin_lock_irq(&rq->lock);
957 + p->cpu = smp_processor_id();
958 + activate_task(p, rq);
959 + spin_unlock_irq(&rq->lock);
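With the defaults above (PARENT_PENALTY == 100, CHILD_PENALTY == 95) the parent's sleep_avg is left as-is and the child starts with 95% of it: a parent sitting at the MAX_SLEEP_AVG cap (200 ticks at HZ == 100) hands its child sleep_avg == 190, i.e. a +4 rather than +5 bonus, so freshly forked children start just below maximum interactivity.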
963 - * schedule_timeout - sleep until timeout
964 - * @timeout: timeout value in jiffies
966 - * Make the current task sleep until @timeout jiffies have
967 - * elapsed. The routine will return immediately unless
968 - * the current task state has been set (see set_current_state()).
970 - * You can set the task state as follows -
972 - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
973 - * pass before the routine returns. The routine will return 0
975 - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
976 - * delivered to the current task. In this case the remaining time
977 - * in jiffies will be returned, or 0 if the timer expired in time
979 - * The current task state is guaranteed to be TASK_RUNNING when this
982 - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
983 - * the CPU away without a bound on the timeout. In this case the return
984 - * value will be %MAX_SCHEDULE_TIMEOUT.
986 + * Potentially available exiting-child timeslices are
987 + * retrieved here - this way the parent does not get
988 + * penalized for creating too many processes.
990 - * In all cases the return value is guaranteed to be non-negative.
991 + * (this cannot be used to 'generate' timeslices
992 + * artificially, because any timeslice recovered here
993 + * was given away by the parent in the first place.)
995 -signed long schedule_timeout(signed long timeout)
996 +void sched_exit(task_t * p)
998 - struct timer_list timer;
999 - unsigned long expire;
1001 + current->time_slice += p->time_slice;
1002 + if (unlikely(current->time_slice > MAX_TIMESLICE))
1003 + current->time_slice = MAX_TIMESLICE;
1006 + * If the child was a (relative-) CPU hog then decrease
1007 + * the sleep_avg of the parent as well.
1009 + if (p->sleep_avg < current->sleep_avg)
1010 + current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
1011 + p->sleep_avg) / (EXIT_WEIGHT + 1);
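With EXIT_WEIGHT == 3 this is a 3:1 weighted average; for example a parent with sleep_avg == 200 reaping a CPU-hog child with sleep_avg == 40 drops to (200*3 + 40) / 4 == 160 - the parent pays for children that burned CPU, but only gradually.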
1016 - case MAX_SCHEDULE_TIMEOUT:
1018 - * These two special cases are useful to be comfortable
1019 - * in the caller. Nothing more. We could take
1020 - * MAX_SCHEDULE_TIMEOUT from one of the negative value
1021 - * but I' d like to return a valid offset (>=0) to allow
1022 - * the caller to do everything it want with the retval.
1028 - * Another bit of PARANOID. Note that the retval will be
1029 - * 0 since no piece of kernel is supposed to do a check
1030 - * for a negative retval of schedule_timeout() (since it
1031 - * should never happens anyway). You just have the printk()
1032 - * that will tell you if something is gone wrong and where.
1036 - printk(KERN_ERR "schedule_timeout: wrong timeout "
1037 - "value %lx from %p\n", timeout,
1038 - __builtin_return_address(0));
1039 - current->state = TASK_RUNNING;
1044 +asmlinkage void schedule_tail(task_t *prev)
1046 + spin_unlock_irq(&this_rq()->lock);
1050 - expire = timeout + jiffies;
1051 +static inline void context_switch(task_t *prev, task_t *next)
1053 + struct mm_struct *mm = next->mm;
1054 + struct mm_struct *oldmm = prev->active_mm;
1056 - init_timer(&timer);
1057 - timer.expires = expire;
1058 - timer.data = (unsigned long) current;
1059 - timer.function = process_timeout;
1060 + prepare_to_switch();
1062 - add_timer(&timer);
1064 - del_timer_sync(&timer);
1065 + if (unlikely(!mm)) {
1066 + next->active_mm = oldmm;
1067 + atomic_inc(&oldmm->mm_count);
1068 + enter_lazy_tlb(oldmm, next, smp_processor_id());
1070 + switch_mm(oldmm, mm, next, smp_processor_id());
1072 - timeout = expire - jiffies;
1073 + if (unlikely(!prev->mm)) {
1074 + prev->active_mm = NULL;
1079 - return timeout < 0 ? 0 : timeout;
1081 + * Here we just switch the register state and the stack. There are
1082 + * 3 processes affected by a context switch:
1084 + * prev ==> .... ==> (last => next)
1086 + * It's the 'much more previous' 'prev' that is on next's stack,
1087 + * but prev is set to (the just run) 'last' process by switch_to().
1088 + * This might sound slightly confusing but makes tons of sense.
1090 + switch_to(prev, next, prev);
1094 - * schedule_tail() is getting called from the fork return path. This
1095 - * cleans up all remaining scheduler things, without impacting the
1098 -static inline void __schedule_tail(struct task_struct *prev)
1099 +unsigned long nr_running(void)
1105 - * prev->policy can be written from here only before `prev'
1106 - * can be scheduled (before setting prev->cpus_runnable to ~0UL).
1107 - * Of course it must also be read before allowing prev
1108 - * to be rescheduled, but since the write depends on the read
1109 - * to complete, wmb() is enough. (the spin_lock() acquired
1110 - * before setting cpus_runnable is not enough because the spin_lock()
1111 - * common code semantics allows code outside the critical section
1112 - * to enter inside the critical section)
1114 - policy = prev->policy;
1115 - prev->policy = policy & ~SCHED_YIELD;
1117 + unsigned long i, sum = 0;
1120 - * fast path falls through. We have to clear cpus_runnable before
1121 - * checking prev->state to avoid a wakeup race. Protect against
1122 - * the task exiting early.
1125 - task_release_cpu(prev);
1127 - if (prev->state == TASK_RUNNING)
1128 - goto needs_resched;
1129 + for (i = 0; i < smp_num_cpus; i++)
1130 + sum += cpu_rq(cpu_logical_map(i))->nr_running;
1133 - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
1139 - * Slow path - we 'push' the previous process and
1140 - * reschedule_idle() will attempt to find a new
1141 - * processor for it. (but it might preempt the
1142 - * current process as well.) We must take the runqueue
1143 - * lock and re-check prev->state to be correct. It might
1144 - * still happen that this process has a preemption
1145 - * 'in progress' already - but this is not a problem and
1146 - * might happen in other circumstances as well.
1150 - unsigned long flags;
1151 +unsigned long nr_context_switches(void)
1153 + unsigned long i, sum = 0;
1156 - * Avoid taking the runqueue lock in cases where
1157 - * no preemption-check is necessery:
1159 - if ((prev == idle_task(smp_processor_id())) ||
1160 - (policy & SCHED_YIELD))
1162 + for (i = 0; i < smp_num_cpus; i++)
1163 + sum += cpu_rq(cpu_logical_map(i))->nr_switches;
1165 - spin_lock_irqsave(&runqueue_lock, flags);
1166 - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
1167 - reschedule_idle(prev);
1168 - spin_unlock_irqrestore(&runqueue_lock, flags);
1172 - prev->policy &= ~SCHED_YIELD;
1173 -#endif /* CONFIG_SMP */
1177 -asmlinkage void schedule_tail(struct task_struct *prev)
1180 + * Lock the busiest runqueue as well, this_rq is locked already.
1181 + * Recalculate nr_running if we have to drop the runqueue lock.
1183 +static inline unsigned int double_lock_balance(runqueue_t *this_rq,
1184 + runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
1186 - __schedule_tail(prev);
1187 + if (unlikely(!spin_trylock(&busiest->lock))) {
1188 + if (busiest < this_rq) {
1189 + spin_unlock(&this_rq->lock);
1190 + spin_lock(&busiest->lock);
1191 + spin_lock(&this_rq->lock);
1192 + /* Need to recalculate nr_running */
1193 + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1194 + nr_running = this_rq->nr_running;
1196 + nr_running = this_rq->prev_nr_running[this_cpu];
1198 + spin_lock(&busiest->lock);
1200 + return nr_running;
1204 - * 'schedule()' is the scheduler function. It's a very simple and nice
1205 - * scheduler: it's not perfect, but certainly works for most things.
1207 - * The goto is "interesting".
1208 + * Current runqueue is empty, or rebalance tick: if there is an
1209 + * imbalance (current runqueue is too short) then pull from
1210 + * busiest runqueue(s).
1212 - * NOTE!! Task 0 is the 'idle' task, which gets called when no other
1213 - * tasks can run. It can not be killed, and it cannot sleep. The 'state'
1214 - * information in task[0] is never used.
1215 + * We call this with the current runqueue locked,
1218 -asmlinkage void schedule(void)
1219 +static void load_balance(runqueue_t *this_rq, int idle)
1221 - struct schedule_data * sched_data;
1222 - struct task_struct *prev, *next, *p;
1223 - struct list_head *tmp;
1225 + int imbalance, nr_running, load, max_load,
1226 + idx, i, this_cpu = smp_processor_id();
1227 + task_t *next = this_rq->idle, *tmp;
1228 + runqueue_t *busiest, *rq_src;
1229 + prio_array_t *array;
1230 + list_t *head, *curr;
1233 + * We search all runqueues to find the most busy one.
1234 + * We do this lockless to reduce cache-bouncing overhead,
1235 + * we re-check the 'best' source CPU later on again, with
1238 + * We fend off statistical fluctuations in runqueue lengths by
1239 + * saving the runqueue length during the previous load-balancing
1240 + * operation and using the smaller one of the current and saved lengths.
1241 + * If a runqueue is long enough for a longer amount of time then
1242 + * we recognize it and pull tasks from it.
1244 + * The 'current runqueue length' is a statistical maximum variable,
1245 + * for that one we take the longer one - to avoid fluctuations in
1246 + * the other direction. So for a load-balance to happen it needs
1247 + * stable long runqueue on the target CPU and stable short runqueue
1248 + * on the local runqueue.
1250 + * We make an exception if this CPU is about to become idle - in
1251 + * that case we are less picky about moving a task across CPUs and
1252 + * take what can be taken.
1254 + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1255 + nr_running = this_rq->nr_running;
1257 + nr_running = this_rq->prev_nr_running[this_cpu];
1259 - spin_lock_prefetch(&runqueue_lock);
1262 + for (i = 0; i < smp_num_cpus; i++) {
1263 + rq_src = cpu_rq(cpu_logical_map(i));
1264 + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
1265 + load = rq_src->nr_running;
1267 + load = this_rq->prev_nr_running[i];
1268 + this_rq->prev_nr_running[i] = rq_src->nr_running;
1270 + if ((load > max_load) && (rq_src != this_rq)) {
1276 - if (!current->active_mm) BUG();
1279 - this_cpu = prev->processor;
1280 + if (likely(!busiest))
1283 - if (unlikely(in_interrupt())) {
1284 - printk("Scheduling in interrupt\n");
1287 + imbalance = (max_load - nr_running) / 2;
1289 - release_kernel_lock(prev, this_cpu);
1290 + /* It needs an at least ~25% imbalance to trigger balancing. */
1291 + if (!idle && (imbalance < (max_load + 3)/4))
1294 + nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
1296 - * 'sched_data' is protected by the fact that we can run
1297 - * only one process per CPU.
1298 + * Make sure nothing changed since we checked the
1299 + * runqueue length.
1301 - sched_data = & aligned_data[this_cpu].schedule_data;
1302 + if (busiest->nr_running <= this_rq->nr_running + 1)
1305 - spin_lock_irq(&runqueue_lock);
1307 + * We first consider expired tasks. Those will likely not be
1308 + * executed in the near future, and they are most likely to
1309 + * be cache-cold, thus switching CPUs has the least effect
1312 + if (busiest->expired->nr_active)
1313 + array = busiest->expired;
1315 + array = busiest->active;
1317 - /* move an exhausted RR process to be last.. */
1318 - if (unlikely(prev->policy == SCHED_RR))
1319 - if (!prev->counter) {
1320 - prev->counter = NICE_TO_TICKS(prev->nice);
1321 - move_last_runqueue(prev);
1323 + /* Start searching at priority 0: */
1327 + idx = sched_find_first_bit(array->bitmap);
1329 + idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1330 + if (idx == MAX_PRIO) {
1331 + if (array == busiest->expired) {
1332 + array = busiest->active;
1336 - switch (prev->state) {
1337 - case TASK_INTERRUPTIBLE:
1338 - if (signal_pending(prev)) {
1339 - prev->state = TASK_RUNNING;
1343 - del_from_runqueue(prev);
1344 - case TASK_RUNNING:;
1347 - prev->need_resched = 0;
1350 - * this is the scheduler proper:
1355 - * Default process to select..
1357 - next = idle_task(this_cpu);
1359 - list_for_each(tmp, &runqueue_head) {
1360 - p = list_entry(tmp, struct task_struct, run_list);
1361 - if (can_schedule(p, this_cpu)) {
1362 - int weight = goodness(p, this_cpu, prev->active_mm);
1364 - c = weight, next = p;
1365 + head = array->queue + idx;
1366 + curr = head->prev;
1368 + tmp = list_entry(curr, task_t, run_list);
1371 + * We do not migrate tasks that are:
1372 + * 1) running (obviously), or
1373 + * 2) cannot be migrated to this CPU due to cpus_allowed, or
1374 + * 3) are cache-hot on their current CPU.
1377 +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
1378 + ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
1379 + ((p) != (rq)->curr) && \
1380 + (tmp->cpus_allowed & (1 << (this_cpu))))
1382 + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
1383 + curr = curr->next;
1391 + * take the task out of the other runqueue and
1392 + * put it into this one:
1394 + dequeue_task(next, array);
1395 + busiest->nr_running--;
1396 + next->cpu = this_cpu;
1397 + this_rq->nr_running++;
1398 + enqueue_task(next, this_rq->active);
1399 + if (next->prio < current->prio)
1400 + current->need_resched = 1;
1401 + if (!idle && --imbalance) {
1402 + if (array == busiest->expired) {
1403 + array = busiest->active;
1408 + spin_unlock(&busiest->lock);
1412 + * One of the idle_cpu_tick() or the busy_cpu_tick() functions will
1413 + * get called every timer tick, on every CPU. Our balancing action
1414 + * frequency and balancing aggressiveness depend on whether the CPU is
1417 + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
1418 + * systems with HZ=100, every 10 msecs.)
1420 +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
1421 +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
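At HZ == 100 this gives BUSY_REBALANCE_TICK == 25 (250 ms) and, since HZ/1000 truncates to 0, the GCC ?: 1 fallback makes IDLE_REBALANCE_TICK == 1 (10 ms, as the comment notes) - which also keeps the jiffies % tests below from ever taking a modulo by zero.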
1423 +static inline void idle_tick(void)
1425 + if (jiffies % IDLE_REBALANCE_TICK)
1427 + spin_lock(&this_rq()->lock);
1428 + load_balance(this_rq(), 1);
1429 + spin_unlock(&this_rq()->lock);
1434 - /* Do we need to re-calculate counters? */
1435 - if (unlikely(!c)) {
1436 - struct task_struct *p;
1438 - spin_unlock_irq(&runqueue_lock);
1439 - read_lock(&tasklist_lock);
1441 - p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
1442 - read_unlock(&tasklist_lock);
1443 - spin_lock_irq(&runqueue_lock);
1444 - goto repeat_schedule;
1446 + * We place interactive tasks back into the active array, if possible.
1448 + * To guarantee that this does not starve expired tasks we ignore the
1449 + * interactivity of a task if the first expired task had to wait more
1450 + * than a 'reasonable' amount of time. This deadline timeout is
1451 + * load-dependent, as the frequency of array switches decreases with
1452 + * increasing number of running tasks:
1454 +#define EXPIRED_STARVING(rq) \
1455 + ((rq)->expired_timestamp && \
1456 + (jiffies - (rq)->expired_timestamp >= \
1457 + STARVATION_LIMIT * ((rq)->nr_running) + 1))
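For example, with STARVATION_LIMIT == 2*HZ and four runnable tasks, interactivity is ignored (so even interactive tasks land in the expired array, letting the arrays switch) once the first expired task has waited 8*HZ + 1 jiffies, about 8 seconds; with a single runnable task the deadline is roughly 2 seconds.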
1460 + * This function gets called by the timer code, with HZ frequency.
1461 + * We call it with interrupts disabled.
1463 +void scheduler_tick(int user_tick, int system)
1465 + int cpu = smp_processor_id();
1466 + runqueue_t *rq = this_rq();
1467 + task_t *p = current;
1469 + if (p == rq->idle) {
1470 + if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
1471 + kstat.per_cpu_system[cpu] += system;
1477 + if (TASK_NICE(p) > 0)
1478 + kstat.per_cpu_nice[cpu] += user_tick;
1480 + kstat.per_cpu_user[cpu] += user_tick;
1481 + kstat.per_cpu_system[cpu] += system;
1483 + /* Task might have expired already, but not scheduled off yet */
1484 + if (p->array != rq->active) {
1485 + p->need_resched = 1;
1488 + spin_lock(&rq->lock);
1489 + if (unlikely(rt_task(p))) {
1491 + * RR tasks need a special form of timeslice management.
1492 + * FIFO tasks have no timeslices.
1494 + if ((p->policy == SCHED_RR) && !--p->time_slice) {
1495 + p->time_slice = TASK_TIMESLICE(p);
1496 + p->need_resched = 1;
1498 + /* put it at the end of the queue: */
1499 + dequeue_task(p, rq->active);
1500 + enqueue_task(p, rq->active);
1505 - * from this point on nothing can prevent us from
1506 - * switching to the next task, save this fact in
1509 - sched_data->curr = next;
1510 - task_set_cpu(next, this_cpu);
1511 - spin_unlock_irq(&runqueue_lock);
1513 - if (unlikely(prev == next)) {
1514 - /* We won't go through the normal tail, so do this by hand */
1515 - prev->policy &= ~SCHED_YIELD;
1516 - goto same_process;
1517 + * The task was running during this tick - update the
1518 + * time slice counter and the sleep average. Note: we
1519 + * do not update a process's priority until it either
1520 + * goes to sleep or uses up its timeslice. This makes
1521 + * it possible for interactive tasks to use up their
1522 + * timeslices at their highest priority levels.
1526 + if (!--p->time_slice) {
1527 + dequeue_task(p, rq->active);
1528 + p->need_resched = 1;
1529 + p->prio = effective_prio(p);
1530 + p->time_slice = TASK_TIMESLICE(p);
1532 + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1533 + if (!rq->expired_timestamp)
1534 + rq->expired_timestamp = jiffies;
1535 + enqueue_task(p, rq->expired);
1537 + enqueue_task(p, rq->active);
1541 + if (!(jiffies % BUSY_REBALANCE_TICK))
1542 + load_balance(rq, 0);
1544 + spin_unlock(&rq->lock);
1549 - * maintain the per-process 'last schedule' value.
1550 - * (this has to be recalculated even if we reschedule to
1551 - * the same process) Currently this is only used on SMP,
1552 - * and it's approximate, so we do not have to maintain
1553 - * it while holding the runqueue spinlock.
1555 - sched_data->last_schedule = get_cycles();
1556 +void scheduling_functions_start_here(void) { }
1559 - * We drop the scheduler lock early (it's a global spinlock),
1560 - * thus we have to lock the previous process from getting
1561 - * rescheduled during switch_to().
1564 + * 'schedule()' is the main scheduler function.
1566 +asmlinkage void schedule(void)
1568 + task_t *prev = current, *next;
1569 + runqueue_t *rq = this_rq();
1570 + prio_array_t *array;
1574 -#endif /* CONFIG_SMP */
1575 + if (unlikely(in_interrupt()))
1577 + release_kernel_lock(prev, smp_processor_id());
1578 + prev->sleep_timestamp = jiffies;
1579 + spin_lock_irq(&rq->lock);
1581 - kstat.context_swtch++;
1583 - * there are 3 processes which are affected by a context switch:
1585 - * prev == .... ==> (last => next)
1587 - * It's the 'much more previous' 'prev' that is on next's stack,
1588 - * but prev is set to (the just run) 'last' process by switch_to().
1589 - * This might sound slightly confusing but makes tons of sense.
1591 - prepare_to_switch();
1593 - struct mm_struct *mm = next->mm;
1594 - struct mm_struct *oldmm = prev->active_mm;
1596 - if (next->active_mm) BUG();
1597 - next->active_mm = oldmm;
1598 - atomic_inc(&oldmm->mm_count);
1599 - enter_lazy_tlb(oldmm, next, this_cpu);
1601 - if (next->active_mm != mm) BUG();
1602 - switch_mm(oldmm, mm, next, this_cpu);
1603 + switch (prev->state) {
1604 + case TASK_INTERRUPTIBLE:
1605 + if (unlikely(signal_pending(prev))) {
1606 + prev->state = TASK_RUNNING;
1610 + deactivate_task(prev, rq);
1611 + case TASK_RUNNING:
1617 + if (unlikely(!rq->nr_running)) {
1619 + load_balance(rq, 1);
1620 + if (rq->nr_running)
1621 + goto pick_next_task;
1624 + rq->expired_timestamp = 0;
1625 + goto switch_tasks;
1629 - prev->active_mm = NULL;
1632 + array = rq->active;
1633 + if (unlikely(!array->nr_active)) {
1635 + * Switch the active and expired arrays.
1637 + rq->active = rq->expired;
1638 + rq->expired = array;
1639 + array = rq->active;
1640 + rq->expired_timestamp = 0;
1644 - * This just switches the register state and the
1647 - switch_to(prev, next, prev);
1648 - __schedule_tail(prev);
1649 + idx = sched_find_first_bit(array->bitmap);
1650 + queue = array->queue + idx;
1651 + next = list_entry(queue->next, task_t, run_list);
1655 + prev->need_resched = 0;
1657 + if (likely(prev != next)) {
1658 + rq->nr_switches++;
1660 + context_switch(prev, next);
1662 + * The runqueue pointer might be from another CPU
1663 + * if the new task was last running on a different
1664 + * CPU - thus re-load it.
1669 + spin_unlock_irq(&rq->lock);
1672 reacquire_kernel_lock(current);
1673 - if (current->need_resched)
1674 - goto need_resched_back;
1679 - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything
1680 - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
1681 - * non-exclusive tasks and one exclusive task.
1682 + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
1683 + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
1684 + * number) then we wake all the non-exclusive tasks and one exclusive task.
1686 * There are circumstances in which we can try to wake a task which has already
1687 - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
1688 - * in this (rare) case, and we handle it by contonuing to scan the queue.
1689 + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
1690 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
1692 static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
1693 int nr_exclusive, const int sync)
1695 struct list_head *tmp;
1696 - struct task_struct *p;
1699 - CHECK_MAGIC_WQHEAD(q);
1700 - WQ_CHECK_LIST_HEAD(&q->task_list);
1702 list_for_each(tmp,&q->task_list) {
1704 - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1705 + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1707 - CHECK_MAGIC(curr->__magic);
1710 - if (state & mode) {
1711 - WQ_NOTE_WAKER(curr);
1712 - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
1715 + if ((state & mode) &&
1716 + try_to_wake_up(p, sync) &&
1717 + ((curr->flags & WQ_FLAG_EXCLUSIVE) &&
1723 @@ -850,8 +959,71 @@
1728 + * Change the current task's CPU affinity. Migrate the process to a
1729 + * proper CPU and schedule away if the current CPU is removed from
1730 + * the allowed bitmask.
1732 +void set_cpus_allowed(task_t *p, unsigned long new_mask)
1734 + new_mask &= cpu_online_map;
1740 + p->cpus_allowed = new_mask;
1742 + * Can the task run on the current CPU? If not then
1743 + * migrate the process off to a proper CPU.
1745 + if (new_mask & (1UL << smp_processor_id()))
1748 + current->state = TASK_UNINTERRUPTIBLE;
1749 + smp_migrate_task(__ffs(new_mask), current);
1755 void scheduling_functions_end_here(void) { }
1757 +void set_user_nice(task_t *p, long nice)
1759 + unsigned long flags;
1760 + prio_array_t *array;
1763 + if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1766 + * We have to be careful, if called from sys_setpriority(),
1767 + * the task might be in the middle of scheduling on another CPU.
1769 + rq = lock_task_rq(p, &flags);
1771 + p->static_prio = NICE_TO_PRIO(nice);
1776 + dequeue_task(p, array);
1777 + p->static_prio = NICE_TO_PRIO(nice);
1778 + p->prio = NICE_TO_PRIO(nice);
1780 + enqueue_task(p, array);
1782 + * If the task is running and lowered its priority,
1783 + * or increased its priority then reschedule its CPU:
1785 + if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr))
1786 + resched_task(rq->curr);
1789 + unlock_task_rq(rq, &flags);
1795 @@ -862,7 +1034,7 @@
1797 asmlinkage long sys_nice(int increment)
1803 * Setpriority might change our priority at the same moment.
1804 @@ -878,32 +1050,46 @@
1808 - newprio = current->nice + increment;
1809 - if (newprio < -20)
1813 - current->nice = newprio;
1814 + nice = PRIO_TO_NICE(current->static_prio) + increment;
1819 + set_user_nice(current, nice);
1825 -static inline struct task_struct *find_process_by_pid(pid_t pid)
1827 + * This is the priority value as seen by users in /proc
1829 + * RT tasks are offset by -200. Normal tasks are centered
1830 + * around 0, value goes from -16 to +15.
1832 +int task_prio(task_t *p)
1834 - struct task_struct *tsk = current;
1835 + return p->prio - 100;
1839 - tsk = find_task_by_pid(pid);
1841 +int task_nice(task_t *p)
1843 + return TASK_NICE(p);
1846 +static inline task_t *find_process_by_pid(pid_t pid)
1848 + return pid ? find_task_by_pid(pid) : current;
1851 -static int setscheduler(pid_t pid, int policy,
1852 - struct sched_param *param)
1853 +static int setscheduler(pid_t pid, int policy, struct sched_param *param)
1855 struct sched_param lp;
1856 - struct task_struct *p;
1857 + prio_array_t *array;
1858 + unsigned long flags;
1864 if (!param || pid < 0)
1865 @@ -917,14 +1103,19 @@
1866 * We play safe to avoid deadlocks.
1868 read_lock_irq(&tasklist_lock);
1869 - spin_lock(&runqueue_lock);
1871 p = find_process_by_pid(pid);
1877 + goto out_unlock_tasklist;
1880 + * To be able to change p->policy safely, the appropriate
1881 + * runqueue lock must be held.
1883 + rq = lock_task_rq(p, &flags);
1888 @@ -945,30 +1136,36 @@
1892 - if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1893 + if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1894 !capable(CAP_SYS_NICE))
1896 if ((current->euid != p->euid) && (current->euid != p->uid) &&
1897 !capable(CAP_SYS_NICE))
1902 + deactivate_task(p, task_rq(p));
1905 p->rt_priority = lp.sched_priority;
1906 - if (task_on_runqueue(p))
1907 - move_first_runqueue(p);
1909 - current->need_resched = 1;
1911 + p->prio = 99 - p->rt_priority;
1913 + p->prio = p->static_prio;
1915 + activate_task(p, task_rq(p));
1918 - spin_unlock(&runqueue_lock);
1919 + unlock_task_rq(rq, &flags);
1920 +out_unlock_tasklist:
1921 read_unlock_irq(&tasklist_lock);
1927 -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1928 +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1929 struct sched_param *param)
1931 return setscheduler(pid, policy, param);
1932 @@ -981,7 +1178,7 @@
1934 asmlinkage long sys_sched_getscheduler(pid_t pid)
1936 - struct task_struct *p;
1941 @@ -992,7 +1189,7 @@
1942 read_lock(&tasklist_lock);
1943 p = find_process_by_pid(pid);
1945 - retval = p->policy & ~SCHED_YIELD;
1946 + retval = p->policy;
1947 read_unlock(&tasklist_lock);
1950 @@ -1001,7 +1198,7 @@
1952 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
1954 - struct task_struct *p;
1956 struct sched_param lp;
1959 @@ -1032,42 +1229,64 @@
1961 asmlinkage long sys_sched_yield(void)
1963 + task_t *prev = current, *next;
1964 + runqueue_t *rq = this_rq();
1965 + prio_array_t *array;
1968 + if (unlikely(prev->state != TASK_RUNNING)) {
1972 + release_kernel_lock(prev, smp_processor_id());
1973 + prev->sleep_timestamp = jiffies;
1975 - * Trick. sched_yield() first counts the number of truly
1976 - * 'pending' runnable processes, then returns if it's
1977 - * only the current processes. (This test does not have
1978 - * to be atomic.) In threaded applications this optimization
1979 - * gets triggered quite often.
1980 + * Decrease the yielding task's priority by one, to avoid
1981 + * livelocks. This priority loss is temporary, it's recovered
1982 + * once the current timeslice expires.
1984 + * If priority is already MAX_PRIO-1 then we still
1985 + * roundrobin the task within the runlist.
1987 + spin_lock_irq(&rq->lock);
1988 + array = current->array;
1990 + * If the task has reached maximum priority (or is a RT task)
1991 + * then just requeue the task to the end of the runqueue:
1993 + if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
1994 + list_del(&current->run_list);
1995 + list_add_tail(&current->run_list, array->queue + current->prio);
1997 + list_del(&current->run_list);
1998 + if (list_empty(array->queue + current->prio))
1999 + __clear_bit(current->prio, array->bitmap);
2001 + list_add_tail(&current->run_list, array->queue + current->prio);
2002 + __set_bit(current->prio, array->bitmap);
2005 + * Context-switch manually. This is equivalent to
2006 + * calling schedule(), but faster, because yield()
2007 + * knows lots of things that can be optimized away
2008 + * from the generic scheduler path:
2010 + queue = array->queue + sched_find_first_bit(array->bitmap);
2011 + next = list_entry(queue->next, task_t, run_list);
2014 - int nr_pending = nr_running;
2019 - // Subtract non-idle processes running on other CPUs.
2020 - for (i = 0; i < smp_num_cpus; i++) {
2021 - int cpu = cpu_logical_map(i);
2022 - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
2024 + prev->need_resched = 0;
2025 + if (likely(prev != next)) {
2026 + rq->nr_switches++;
2028 + context_switch(prev, next);
2033 - // on UP this process is on the runqueue as well
2038 - * This process can only be rescheduled by us,
2039 - * so this is safe without any locking.
2041 - if (current->policy == SCHED_OTHER)
2042 - current->policy |= SCHED_YIELD;
2043 - current->need_resched = 1;
2044 + spin_unlock_irq(&rq->lock);
2046 + reacquire_kernel_lock(current);
2048 - spin_lock_irq(&runqueue_lock);
2049 - move_last_runqueue(current);
2050 - spin_unlock_irq(&runqueue_lock);
2055 @@ -1105,7 +1324,7 @@
2056 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
2059 - struct task_struct *p;
2061 int retval = -EINVAL;
2064 @@ -1115,8 +1334,8 @@
2065 read_lock(&tasklist_lock);
2066 p = find_process_by_pid(pid);
2068 - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
2070 + jiffies_to_timespec(p->policy & SCHED_FIFO ?
2071 + 0 : TASK_TIMESLICE(p), &t);
2072 read_unlock(&tasklist_lock);
2074 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2075 @@ -1124,14 +1343,14 @@
2079 -static void show_task(struct task_struct * p)
2080 +static void show_task(task_t * p)
2082 unsigned long free = 0;
2084 static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
2086 printk("%-13.13s ", p->comm);
2087 - state = p->state ? ffz(~p->state) + 1 : 0;
2088 + state = p->state ? __ffs(p->state) + 1 : 0;
2089 if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
2090 printk(stat_nam[state]);
2092 @@ -1172,7 +1391,7 @@
2093 printk(" (NOTLB)\n");
2096 - extern void show_trace_task(struct task_struct *tsk);
2097 + extern void show_trace_task(task_t *tsk);
2101 @@ -1194,7 +1413,7 @@
2103 void show_state(void)
2105 - struct task_struct *p;
2108 #if (BITS_PER_LONG == 32)
2110 @@ -1217,121 +1436,88 @@
2111 read_unlock(&tasklist_lock);
2115 - * reparent_to_init() - Reparent the calling kernel thread to the init task.
2117 - * If a kernel thread is launched as a result of a system call, or if
2118 - * it ever exits, it should generally reparent itself to init so that
2119 - * it is correctly cleaned up on exit.
2121 - * The various task state such as scheduling policy and priority may have
2122 - * been inherited fro a user process, so we reset them to sane values here.
2124 - * NOTE that reparent_to_init() gives the caller full capabilities.
2126 -void reparent_to_init(void)
2127 +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
2129 - struct task_struct *this_task = current;
2131 - write_lock_irq(&tasklist_lock);
2133 - /* Reparent to init */
2134 - REMOVE_LINKS(this_task);
2135 - this_task->p_pptr = child_reaper;
2136 - this_task->p_opptr = child_reaper;
2137 - SET_LINKS(this_task);
2139 - /* Set the exit signal to SIGCHLD so we signal init on exit */
2140 - this_task->exit_signal = SIGCHLD;
2142 - /* We also take the runqueue_lock while altering task fields
2143 - * which affect scheduling decisions */
2144 - spin_lock(&runqueue_lock);
2146 - this_task->ptrace = 0;
2147 - this_task->nice = DEF_NICE;
2148 - this_task->policy = SCHED_OTHER;
2149 - /* cpus_allowed? */
2150 - /* rt_priority? */
2152 - this_task->cap_effective = CAP_INIT_EFF_SET;
2153 - this_task->cap_inheritable = CAP_INIT_INH_SET;
2154 - this_task->cap_permitted = CAP_FULL_SET;
2155 - this_task->keep_capabilities = 0;
2156 - memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
2157 - this_task->user = INIT_USER;
2159 - spin_unlock(&runqueue_lock);
2160 - write_unlock_irq(&tasklist_lock);
2162 + spin_lock(&rq1->lock);
2165 + spin_lock(&rq1->lock);
2166 + spin_lock(&rq2->lock);
2168 + spin_lock(&rq2->lock);
2169 + spin_lock(&rq1->lock);
2175 - * Put all the gunge required to become a kernel thread without
2176 - * attached user resources in one place where it belongs.
2179 -void daemonize(void)
2180 +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
2182 - struct fs_struct *fs;
2186 - * If we were started as result of loading a module, close all of the
2187 - * user space pages. We don't need them, and if we didn't close them
2188 - * they would be locked into memory.
2192 - current->session = 1;
2193 - current->pgrp = 1;
2194 - current->tty = NULL;
2196 - /* Become as one with the init task */
2198 - exit_fs(current); /* current->fs->count--; */
2199 - fs = init_task.fs;
2201 - atomic_inc(&fs->count);
2202 - exit_files(current);
2203 - current->files = init_task.files;
2204 - atomic_inc(&current->files->count);
2205 + spin_unlock(&rq1->lock);
2207 + spin_unlock(&rq2->lock);
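double_rq_lock()/double_rq_unlock() take two runqueue locks at once; the branches shown acquire rq1 and rq2 in opposite orders, so the elided conditionals presumably impose a fixed ordering (for example by address) to rule out AB-BA deadlock between two CPUs. A sketch of that rule with pthread spinlocks standing in for rq->lock (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

struct runqueue { pthread_spinlock_t lock; };

/* take the lower-addressed lock first so no two CPUs can acquire the pair
 * in opposite orders (the rule the kernel code above presumably encodes) */
static void double_rq_lock(struct runqueue *rq1, struct runqueue *rq2)
{
        if (rq1 == rq2) {
                pthread_spin_lock(&rq1->lock);
        } else if (rq1 < rq2) {
                pthread_spin_lock(&rq1->lock);
                pthread_spin_lock(&rq2->lock);
        } else {
                pthread_spin_lock(&rq2->lock);
                pthread_spin_lock(&rq1->lock);
        }
}

static void double_rq_unlock(struct runqueue *rq1, struct runqueue *rq2)
{
        pthread_spin_unlock(&rq1->lock);
        if (rq1 != rq2)
                pthread_spin_unlock(&rq2->lock);
}

int main(void)
{
        struct runqueue rq[2];

        pthread_spin_init(&rq[0].lock, PTHREAD_PROCESS_PRIVATE);
        pthread_spin_init(&rq[1].lock, PTHREAD_PROCESS_PRIVATE);
        double_rq_lock(&rq[1], &rq[0]);         /* either argument order is safe */
        double_rq_unlock(&rq[1], &rq[0]);
        puts("ok");
        return 0;
}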
2210 -extern unsigned long wait_init_idle;
2212 -void __init init_idle(void)
2213 +void __init init_idle(task_t *idle, int cpu)
2215 - struct schedule_data * sched_data;
2216 - sched_data = &aligned_data[smp_processor_id()].schedule_data;
2217 + runqueue_t *idle_rq = cpu_rq(cpu), *rq = idle->array->rq;
2218 + unsigned long flags;
2220 - if (current != &init_task && task_on_runqueue(current)) {
2221 - printk("UGH! (%d:%d) was on the runqueue, removing.\n",
2222 - smp_processor_id(), current->pid);
2223 - del_from_runqueue(current);
2225 - sched_data->curr = current;
2226 - sched_data->last_schedule = get_cycles();
2227 - clear_bit(current->processor, &wait_init_idle);
2228 + __save_flags(flags);
2230 + double_rq_lock(idle_rq, rq);
2232 + idle_rq->curr = idle_rq->idle = idle;
2233 + deactivate_task(idle, rq);
2234 + idle->array = NULL;
2235 + idle->prio = MAX_PRIO;
2236 + idle->state = TASK_RUNNING;
2238 + double_rq_unlock(idle_rq, rq);
2239 + idle->need_resched = 1;
2240 + __restore_flags(flags);
2243 -extern void init_timervecs (void);
2244 +extern void init_timervecs(void);
2245 +extern void timer_bh(void);
2246 +extern void tqueue_bh(void);
2247 +extern void immediate_bh(void);
2249 void __init sched_init(void)
2254 + for (i = 0; i < NR_CPUS; i++) {
2255 + runqueue_t *rq = cpu_rq(i);
2256 + prio_array_t *array;
2258 + rq->active = rq->arrays + 0;
2259 + rq->expired = rq->arrays + 1;
2260 + spin_lock_init(&rq->lock);
2262 + for (j = 0; j < 2; j++) {
2263 + array = rq->arrays + j;
2265 + array->lock = &rq->lock;
2266 + for (k = 0; k < MAX_PRIO; k++) {
2267 + INIT_LIST_HEAD(array->queue + k);
2268 + __clear_bit(k, array->bitmap);
2270 + // delimiter for bitsearch
2271 + __set_bit(MAX_PRIO, array->bitmap);
2275 * We have to do a little magic to get the first
2276 * process right in SMP mode.
2278 - int cpu = smp_processor_id();
2281 - init_task.processor = cpu;
2283 - for(nr = 0; nr < PIDHASH_SZ; nr++)
2284 - pidhash[nr] = NULL;
2286 + rq->curr = current;
2287 + rq->idle = current;
2288 + wake_up_process(current);
2292 init_bh(TIMER_BH, timer_bh);
2293 init_bh(TQUEUE_BH, tqueue_bh);
2294 init_bh(IMMEDIATE_BH, immediate_bh);
2295 @@ -1340,5 +1526,5 @@
2296 * The boot idle thread does lazy MMU switching as well:
2298 atomic_inc(&init_mm.mm_count);
2299 - enter_lazy_tlb(&init_mm, current, cpu);
2300 + enter_lazy_tlb(&init_mm, current, smp_processor_id());
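sched_init() now builds one runqueue per CPU, each with two priority arrays and a permanently set bit at MAX_PRIO as a search delimiter; in the full scheduler the active/expired pair is what lets a fresh round of timeslices be handed out by swapping two pointers (the swap itself is not in this excerpt). A condensed user-space model of the initialization, with stand-in types:

#include <string.h>

#define MAX_PRIO  140                          /* assumed, as in the earlier sketch */
#define NR_CPUS   4
#define WORD_BITS (8 * (int)sizeof(unsigned long))

struct prio_array {
        unsigned long bitmap[(MAX_PRIO + 1 + WORD_BITS - 1) / WORD_BITS];
        /* one list head per priority level in the kernel */
};

struct runqueue {
        struct prio_array arrays[2];
        struct prio_array *active, *expired;
} runqueues[NR_CPUS];

static void sched_init_model(void)
{
        int i, j;

        for (i = 0; i < NR_CPUS; i++) {
                struct runqueue *rq = &runqueues[i];

                rq->active  = rq->arrays + 0;
                rq->expired = rq->arrays + 1;
                for (j = 0; j < 2; j++) {
                        memset(&rq->arrays[j], 0, sizeof(rq->arrays[j]));
                        /* delimiter for the bit search: even an empty array
                         * has bit MAX_PRIO set, so the search always stops */
                        rq->arrays[j].bitmap[MAX_PRIO / WORD_BITS] |=
                                1UL << (MAX_PRIO % WORD_BITS);
                }
        }
}

int main(void)
{
        sched_init_model();
        return 0;
}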
2302 --- linux/kernel/exit.c.orig Tue Feb 5 13:51:53 2002
2303 +++ linux/kernel/exit.c Tue Feb 5 13:52:12 2002
2306 static void release_task(struct task_struct * p)
2308 - if (p != current) {
2313 - * Wait to make sure the process isn't on the
2314 - * runqueue (active on some other CPU still)
2318 - if (!task_has_cpu(p))
2324 - } while (task_has_cpu(p));
2327 + wait_task_inactive(p);
2329 - atomic_dec(&p->user->processes);
2330 - free_uid(p->user);
2331 - unhash_process(p);
2333 - release_thread(p);
2334 - current->cmin_flt += p->min_flt + p->cmin_flt;
2335 - current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2336 - current->cnswap += p->nswap + p->cnswap;
2338 - * Potentially available timeslices are retrieved
2339 - * here - this way the parent does not get penalized
2340 - * for creating too many processes.
2342 - * (this cannot be used to artificially 'generate'
2343 - * timeslices, because any timeslice recovered here
2344 - * was given away by the parent in the first place.)
2346 - current->counter += p->counter;
2347 - if (current->counter >= MAX_COUNTER)
2348 - current->counter = MAX_COUNTER;
2350 - free_task_struct(p);
2352 - printk("task releasing itself\n");
2354 + atomic_dec(&p->user->processes);
2355 + free_uid(p->user);
2356 + unhash_process(p);
2358 + release_thread(p);
2359 + current->cmin_flt += p->min_flt + p->cmin_flt;
2360 + current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2361 + current->cnswap += p->nswap + p->cnswap;
2364 + free_task_struct(p);
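release_task() no longer open-codes the wait for the child to get off its CPU; that loop becomes wait_task_inactive(), declared in the sched.h changes further down, and ptrace.c is converted the same way. The shape of the wait is simply a poll on whether some runqueue still has the task as its current one; a user-space sketch of the idea (an atomic flag stands in for the runqueue check, this is not the kernel implementation):

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

/* a per-task "still on some CPU" marker; the kernel checks the runqueue's
 * curr pointer under rq->lock instead of a flag like this */
struct task_model { atomic_bool running; };

static void wait_task_inactive_model(struct task_model *p)
{
        while (atomic_load(&p->running))
                sched_yield();          /* the kernel relaxes/spins here */
}

int main(void)
{
        struct task_model child = { false };

        wait_task_inactive_model(&child);       /* returns at once: not running */
        return 0;
}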
2368 @@ -147,6 +120,79 @@
2370 read_unlock(&tasklist_lock);
2375 + * reparent_to_init() - Reparent the calling kernel thread to the init task.
2377 + * If a kernel thread is launched as a result of a system call, or if
2378 + * it ever exits, it should generally reparent itself to init so that
2379 + * it is correctly cleaned up on exit.
2381 + * The various task state such as scheduling policy and priority may have
2382 + * been inherited from a user process, so we reset them to sane values here.
2384 + * NOTE that reparent_to_init() gives the caller full capabilities.
2386 +void reparent_to_init(void)
2388 + write_lock_irq(&tasklist_lock);
2390 + /* Reparent to init */
2391 + REMOVE_LINKS(current);
2392 + current->p_pptr = child_reaper;
2393 + current->p_opptr = child_reaper;
2394 + SET_LINKS(current);
2396 + /* Set the exit signal to SIGCHLD so we signal init on exit */
2397 + current->exit_signal = SIGCHLD;
2399 + current->ptrace = 0;
2400 + if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
2401 + set_user_nice(current, 0);
2402 + /* cpus_allowed? */
2403 + /* rt_priority? */
2405 + current->cap_effective = CAP_INIT_EFF_SET;
2406 + current->cap_inheritable = CAP_INIT_INH_SET;
2407 + current->cap_permitted = CAP_FULL_SET;
2408 + current->keep_capabilities = 0;
2409 + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
2410 + current->user = INIT_USER;
2412 + write_unlock_irq(&tasklist_lock);
2416 + * Put all the gunge required to become a kernel thread without
2417 + * attached user resources in one place where it belongs.
2420 +void daemonize(void)
2422 + struct fs_struct *fs;
2426 + * If we were started as result of loading a module, close all of the
2427 + * user space pages. We don't need them, and if we didn't close them
2428 + * they would be locked into memory.
2432 + current->session = 1;
2433 + current->pgrp = 1;
2434 + current->tty = NULL;
2436 + /* Become as one with the init task */
2438 + exit_fs(current); /* current->fs->count--; */
2439 + fs = init_task.fs;
2441 + atomic_inc(&fs->count);
2442 + exit_files(current);
2443 + current->files = init_task.files;
2444 + atomic_inc(&current->files->count);
2448 --- linux/kernel/capability.c.orig Sat Jun 24 06:06:37 2000
2449 +++ linux/kernel/capability.c Tue Feb 5 13:52:12 2002
2451 #include <linux/mm.h>
2452 #include <asm/uaccess.h>
2454 +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
2456 kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
2458 /* Note: never hold tasklist_lock while spinning for this one */
2459 --- linux/kernel/timer.c.orig Tue Feb 5 13:51:43 2002
2460 +++ linux/kernel/timer.c Tue Feb 5 13:52:12 2002
2463 #include <asm/uaccess.h>
2465 +struct kernel_stat kstat;
2468 * Timekeeping variables
2470 @@ -582,18 +584,7 @@
2471 int cpu = smp_processor_id(), system = user_tick ^ 1;
2473 update_one_process(p, user_tick, system, cpu);
2475 - if (--p->counter <= 0) {
2477 - p->need_resched = 1;
2480 - kstat.per_cpu_nice[cpu] += user_tick;
2482 - kstat.per_cpu_user[cpu] += user_tick;
2483 - kstat.per_cpu_system[cpu] += system;
2484 - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
2485 - kstat.per_cpu_system[cpu] += system;
2486 + scheduler_tick(user_tick, system);
2490 @@ -794,6 +785,89 @@
2494 +static void process_timeout(unsigned long __data)
2496 + wake_up_process((task_t *)__data);
2500 + * schedule_timeout - sleep until timeout
2501 + * @timeout: timeout value in jiffies
2503 + * Make the current task sleep until @timeout jiffies have
2504 + * elapsed. The routine will return immediately unless
2505 + * the current task state has been set (see set_current_state()).
2507 + * You can set the task state as follows -
2509 + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
2510 + * pass before the routine returns. The routine will return 0
2512 + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2513 + * delivered to the current task. In this case the remaining time
2514 + * in jiffies will be returned, or 0 if the timer expired in time
2516 + * The current task state is guaranteed to be TASK_RUNNING when this
2517 + * routine returns.
2519 + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
2520 + * the CPU away without a bound on the timeout. In this case the return
2521 + * value will be %MAX_SCHEDULE_TIMEOUT.
2523 + * In all cases the return value is guaranteed to be non-negative.
2525 +signed long schedule_timeout(signed long timeout)
2527 + struct timer_list timer;
2528 + unsigned long expire;
2532 + case MAX_SCHEDULE_TIMEOUT:
2534 + * These two special cases are useful to be comfortable
2535 + * in the caller. Nothing more. We could take
2536 + * MAX_SCHEDULE_TIMEOUT from one of the negative value
2537 + * but I' d like to return a valid offset (>=0) to allow
2538 + * the caller to do everything it want with the retval.
2544 + * Another bit of PARANOID. Note that the retval will be
2545 + * 0 since no piece of kernel is supposed to do a check
2546 + * for a negative retval of schedule_timeout() (since it
2547 + * should never happens anyway). You just have the printk()
2548 + * that will tell you if something is gone wrong and where.
2552 + printk(KERN_ERR "schedule_timeout: wrong timeout "
2553 + "value %lx from %p\n", timeout,
2554 + __builtin_return_address(0));
2555 + current->state = TASK_RUNNING;
2560 + expire = timeout + jiffies;
2562 + init_timer(&timer);
2563 + timer.expires = expire;
2564 + timer.data = (unsigned long) current;
2565 + timer.function = process_timeout;
2567 + add_timer(&timer);
2569 + del_timer_sync(&timer);
2571 + timeout = expire - jiffies;
2574 + return timeout < 0 ? 0 : timeout;
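schedule_timeout() and its process_timeout() helper appear to move here from sched.c essentially unchanged, and the kerneldoc above states the contract. A minimal caller sketch (kernel-style fragment, illustrative only; the function name is made up):

#include <linux/sched.h>

/* sleep for roughly two seconds, honouring signals: returns the jiffies that
 * were still left if a signal cut the sleep short, 0 if the time fully passed */
static signed long nap_two_seconds(void)
{
        set_current_state(TASK_INTERRUPTIBLE);
        return schedule_timeout(2 * HZ);
}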
2577 /* Thread ID - the internal kernel "pid" */
2578 asmlinkage long sys_gettid(void)
2585 --- linux/kernel/fork.c.orig Tue Feb 5 13:51:53 2002
2586 +++ linux/kernel/fork.c Tue Feb 5 13:52:12 2002
2589 /* The idle threads do not count.. */
2594 unsigned long total_forks; /* Handle normal Linux uptimes. */
2597 struct task_struct *pidhash[PIDHASH_SZ];
2599 +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
2601 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
2603 unsigned long flags;
2605 struct pt_regs *regs, unsigned long stack_size)
2608 + unsigned long flags;
2609 struct task_struct *p;
2610 struct completion vfork;
2613 copy_flags(clone_flags, p);
2614 p->pid = get_pid(clone_flags);
2616 - p->run_list.next = NULL;
2617 - p->run_list.prev = NULL;
2618 + INIT_LIST_HEAD(&p->run_list);
2621 init_waitqueue_head(&p->wait_chldexit);
2622 @@ -646,14 +647,15 @@
2626 - p->cpus_runnable = ~0UL;
2627 - p->processor = current->processor;
2629 /* ?? should we just memset this ?? */
2630 for(i = 0; i < smp_num_cpus; i++)
2631 - p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
2632 + p->per_cpu_utime[cpu_logical_map(i)] =
2633 + p->per_cpu_stime[cpu_logical_map(i)] = 0;
2634 spin_lock_init(&p->sigmask_lock);
2638 p->lock_depth = -1; /* -1 = no lock */
2639 p->start_time = jiffies;
2641 @@ -685,15 +687,27 @@
2642 p->pdeath_signal = 0;
2645 - * "share" dynamic priority between parent and child, thus the
2646 - * total amount of dynamic priorities in the system doesnt change,
2647 - * more scheduling fairness. This is only important in the first
2648 - * timeslice, on the long run the scheduling behaviour is unchanged.
2650 - p->counter = (current->counter + 1) >> 1;
2651 - current->counter >>= 1;
2652 - if (!current->counter)
2653 - current->need_resched = 1;
2654 + * Share the timeslice between parent and child, thus the
2655 + * total amount of pending timeslices in the system doesnt change,
2656 + * resulting in more scheduling fairness.
2658 + __save_flags(flags);
2660 + if (!current->time_slice)
2662 + p->time_slice = (current->time_slice + 1) >> 1;
2663 + current->time_slice >>= 1;
2664 + if (!current->time_slice) {
2666 + * This case is rare, it happens when the parent has only
2667 + * a single jiffy left from its timeslice. Taking the
2668 + * runqueue lock is not a problem.
2670 + current->time_slice = 1;
2671 + scheduler_tick(0,0);
2673 + p->sleep_timestamp = jiffies;
2674 + __restore_flags(flags);
2677 * Ok, add it to the run-queues and make it
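The replacement above splits the parent's remaining timeslice with the child instead of the old counter sharing, so fork() cannot manufacture extra CPU time; the rare case where the parent ends up with nothing is settled by charging it one tick through scheduler_tick(). A tiny user-space check of the arithmetic:

#include <stdio.h>

/* same arithmetic as the fork-time split above: the child gets half of the
 * parent's remaining slice rounded up, the parent keeps the rest */
static void split_time_slice(unsigned int *parent, unsigned int *child)
{
        *child = (*parent + 1) >> 1;
        *parent >>= 1;
}

int main(void)
{
        unsigned int parent = 7, child;

        split_time_slice(&parent, &child);
        /* 7 ticks -> child 4, parent 3: never more than the original total */
        printf("child=%u parent=%u\n", child, parent);
        return 0;
}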
2678 @@ -730,10 +744,23 @@
2679 if (p->ptrace & PT_PTRACED)
2680 send_sig(SIGSTOP, p, 1);
2682 +#define RUN_CHILD_FIRST 1
2683 +#if RUN_CHILD_FIRST
2684 + wake_up_forked_process(p); /* do this last */
2686 wake_up_process(p); /* do this last */
2689 if (clone_flags & CLONE_VFORK)
2690 wait_for_completion(&vfork);
2691 +#if RUN_CHILD_FIRST
2694 + * Let the child process run first, to avoid most of the
2695 + * COW overhead when the child exec()s afterwards.
2697 + current->need_resched = 1;
2702 --- linux/kernel/softirq.c.orig Tue Feb 5 13:51:47 2002
2703 +++ linux/kernel/softirq.c Tue Feb 5 13:52:12 2002
2704 @@ -259,10 +259,9 @@
2706 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
2707 current->state = TASK_RUNNING;
2709 - current->policy |= SCHED_YIELD;
2711 - } while (test_bit(TASKLET_STATE_SCHED, &t->state));
2713 + sys_sched_yield();
2714 + while (test_bit(TASKLET_STATE_SCHED, &t->state));
2716 tasklet_unlock_wait(t);
2717 clear_bit(TASKLET_STATE_SCHED, &t->state);
2718 @@ -365,13 +364,13 @@
2719 int cpu = cpu_logical_map(bind_cpu);
2722 - current->nice = 19;
2723 + set_user_nice(current, 19);
2724 sigfillset(&current->blocked);
2726 /* Migrate to the right CPU */
2727 - current->cpus_allowed = 1UL << cpu;
2728 - while (smp_processor_id() != cpu)
2730 + set_cpus_allowed(current, 1UL << cpu);
2734 sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
2740 -static __init int spawn_ksoftirqd(void)
2741 +__init int spawn_ksoftirqd(void)
2745 @@ -405,14 +404,12 @@
2746 CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
2747 printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
2749 - while (!ksoftirqd_task(cpu_logical_map(cpu))) {
2750 - current->policy |= SCHED_YIELD;
2753 + while (!ksoftirqd_task(cpu_logical_map(cpu)))
2754 + sys_sched_yield();
2761 -__initcall(spawn_ksoftirqd);
2762 +__initcall(spawn_ksoftirqd);
2763 --- linux/kernel/ptrace.c.orig Tue Feb 5 13:51:53 2002
2764 +++ linux/kernel/ptrace.c Tue Feb 5 13:52:12 2002
2766 if (child->state != TASK_STOPPED)
2769 - /* Make sure the child gets off its CPU.. */
2772 - if (!task_has_cpu(child))
2774 - task_unlock(child);
2776 - if (child->state != TASK_STOPPED)
2780 - } while (task_has_cpu(child));
2782 - task_unlock(child);
2783 + wait_task_inactive(child);
2787 --- linux/kernel/sys.c.orig Tue Feb 5 13:51:53 2002
2788 +++ linux/kernel/sys.c Tue Feb 5 13:52:12 2002
2789 @@ -220,10 +220,10 @@
2791 if (error == -ESRCH)
2793 - if (niceval < p->nice && !capable(CAP_SYS_NICE))
2794 + if (niceval < task_nice(p) && !capable(CAP_SYS_NICE))
2797 - p->nice = niceval;
2798 + set_user_nice(p, niceval);
2800 read_unlock(&tasklist_lock);
2804 if (!proc_sel(p, which, who))
2806 - niceval = 20 - p->nice;
2807 + niceval = 20 - task_nice(p);
2808 if (niceval > retval)
2811 --- linux/kernel/signal.c.orig Tue Feb 5 13:51:49 2002
2812 +++ linux/kernel/signal.c Tue Feb 5 13:52:12 2002
2813 @@ -478,12 +478,9 @@
2814 * process of changing - but no harm is done by that
2815 * other than doing an extra (lightweight) IPI interrupt.
2817 - spin_lock(&runqueue_lock);
2818 - if (task_has_cpu(t) && t->processor != smp_processor_id())
2819 - smp_send_reschedule(t->processor);
2820 - spin_unlock(&runqueue_lock);
2821 -#endif /* CONFIG_SMP */
2823 + if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
2824 + kick_if_running(t);
2826 if (t->state & TASK_INTERRUPTIBLE) {
2829 --- linux/kernel/printk.c.orig Tue Feb 5 13:51:53 2002
2830 +++ linux/kernel/printk.c Tue Feb 5 13:52:12 2002
2832 #include <linux/module.h>
2833 #include <linux/interrupt.h> /* For in_interrupt() */
2834 #include <linux/config.h>
2835 +#include <linux/delay.h>
2837 #include <asm/uaccess.h>
2839 --- linux/kernel/ksyms.c.orig Tue Feb 5 13:51:53 2002
2840 +++ linux/kernel/ksyms.c Tue Feb 5 13:52:12 2002
2842 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2843 EXPORT_SYMBOL(schedule);
2844 EXPORT_SYMBOL(schedule_timeout);
2845 +EXPORT_SYMBOL(sys_sched_yield);
2846 +EXPORT_SYMBOL(set_user_nice);
2847 +EXPORT_SYMBOL(set_cpus_allowed);
2848 EXPORT_SYMBOL(jiffies);
2849 EXPORT_SYMBOL(xtime);
2850 EXPORT_SYMBOL(do_gettimeofday);
2853 EXPORT_SYMBOL(kstat);
2854 EXPORT_SYMBOL(nr_running);
2855 +EXPORT_SYMBOL(nr_context_switches);
2858 EXPORT_SYMBOL(panic);
2859 --- linux/mm/oom_kill.c.orig Tue Feb 5 13:51:47 2002
2860 +++ linux/mm/oom_kill.c Tue Feb 5 13:52:12 2002
2862 * Niced processes are most likely less important, so double
2863 * their badness points.
2866 + if (task_nice(p) > 0)
2871 * all the memory it needs. That way it should be able to
2872 * exit() and clear out its resources quickly...
2874 - p->counter = 5 * HZ;
2875 + p->time_slice = HZ;
2876 p->flags |= PF_MEMALLOC | PF_MEMDIE;
2878 /* This process has hardware access, be more careful. */
2880 * killing itself before someone else gets the chance to ask
2883 - current->policy |= SCHED_YIELD;
2889 --- linux/mm/page_alloc.c.orig Tue Feb 5 13:51:53 2002
2890 +++ linux/mm/page_alloc.c Tue Feb 5 13:52:12 2002
2894 /* Yield for kswapd, and try again */
2895 - current->policy |= SCHED_YIELD;
2896 __set_current_state(TASK_RUNNING);
2902 --- linux/mm/highmem.c.orig Tue Feb 5 13:51:51 2002
2903 +++ linux/mm/highmem.c Tue Feb 5 13:52:12 2002
2905 /* we need to wait I/O completion */
2906 run_task_queue(&tq_disk);
2908 - current->policy |= SCHED_YIELD;
2909 __set_current_state(TASK_RUNNING);
2916 /* we need to wait I/O completion */
2917 run_task_queue(&tq_disk);
2919 - current->policy |= SCHED_YIELD;
2920 __set_current_state(TASK_RUNNING);
2926 --- linux/include/linux/sched.h.orig Tue Feb 5 13:51:51 2002
2927 +++ linux/include/linux/sched.h Tue Feb 5 13:52:12 2002
2929 extern unsigned long event;
2931 #include <linux/config.h>
2932 +#include <linux/compiler.h>
2933 #include <linux/binfmts.h>
2934 #include <linux/threads.h>
2935 #include <linux/kernel.h>
2937 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
2938 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
2939 #define CLONE_THREAD 0x00010000 /* Same thread group? */
2940 +#define CLONE_NEWNS 0x00020000 /* New namespace group? */
2942 #define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD)
2945 #define CT_TO_SECS(x) ((x) / HZ)
2946 #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
2948 -extern int nr_running, nr_threads;
2949 +extern int nr_threads;
2950 extern int last_pid;
2951 +extern unsigned long nr_running(void);
2953 #include <linux/fs.h>
2954 #include <linux/time.h>
2955 @@ -116,12 +119,6 @@
2956 #define SCHED_FIFO 1
2960 - * This is an additional bit set when we want to
2961 - * yield the CPU for one re-schedule..
2963 -#define SCHED_YIELD 0x10
2965 struct sched_param {
2968 @@ -139,17 +136,22 @@
2971 extern rwlock_t tasklist_lock;
2972 -extern spinlock_t runqueue_lock;
2973 extern spinlock_t mmlist_lock;
2975 +typedef struct task_struct task_t;
2977 extern void sched_init(void);
2978 -extern void init_idle(void);
2979 +extern void init_idle(task_t *idle, int cpu);
2980 extern void show_state(void);
2981 extern void cpu_init (void);
2982 extern void trap_init(void);
2983 extern void update_process_times(int user);
2984 -extern void update_one_process(struct task_struct *p, unsigned long user,
2985 +extern void update_one_process(task_t *p, unsigned long user,
2986 unsigned long system, int cpu);
2987 +extern void scheduler_tick(int user_tick, int system);
2988 +extern void sched_task_migrated(task_t *p);
2989 +extern void smp_migrate_task(int cpu, task_t *task);
2990 +extern unsigned long cache_decay_ticks;
2992 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
2993 extern signed long FASTCALL(schedule_timeout(signed long timeout));
2996 #define NR_OPEN_DEFAULT BITS_PER_LONG
3000 * Open file table structure
3003 extern struct user_struct root_user;
3004 #define INIT_USER (&root_user)
3006 +typedef struct prio_array prio_array_t;
3008 struct task_struct {
3010 * offsets of these are hardcoded elsewhere - touch with care
3011 @@ -295,35 +300,26 @@
3013 int lock_depth; /* Lock depth */
3016 - * offset 32 begins here on 32-bit platforms. We keep
3017 - * all fields in a single cacheline that are needed for
3018 - * the goodness() loop in schedule().
3022 - unsigned long policy;
3023 - struct mm_struct *mm;
3026 - * cpus_runnable is ~0 if the process is not running on any
3027 - * CPU. It's (1 << cpu) if it's running on a CPU. This mask
3028 - * is updated under the runqueue lock.
3030 - * To determine whether a process might run on a CPU, this
3031 - * mask is AND-ed with cpus_allowed.
3032 + * offset 32 begins here on 32-bit platforms.
3034 - unsigned long cpus_runnable, cpus_allowed;
3036 - * (only the 'next' pointer fits into the cacheline, but
3037 - * that's just fine.)
3039 - struct list_head run_list;
3040 - unsigned long sleep_time;
3042 + int prio, static_prio;
3044 + prio_array_t *array;
3046 + unsigned long sleep_avg;
3047 + unsigned long sleep_timestamp;
3049 + unsigned long policy;
3050 + unsigned long cpus_allowed;
3051 + unsigned int time_slice;
3053 + task_t *next_task, *prev_task;
3055 - struct task_struct *next_task, *prev_task;
3056 - struct mm_struct *active_mm;
3057 + struct mm_struct *mm, *active_mm;
3058 struct list_head local_pages;
3060 unsigned int allocation_order, nr_local_pages;
3063 @@ -345,12 +341,12 @@
3064 * older sibling, respectively. (p->father can be replaced with
3067 - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
3068 + task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
3069 struct list_head thread_group;
3071 /* PID hash table linkage. */
3072 - struct task_struct *pidhash_next;
3073 - struct task_struct **pidhash_pprev;
3074 + task_t *pidhash_next;
3075 + task_t **pidhash_pprev;
3077 wait_queue_head_t wait_chldexit; /* for wait4() */
3078 struct completion *vfork_done; /* for vfork() */
3080 struct fs_struct *fs;
3081 /* open file information */
3082 struct files_struct *files;
3084 + struct namespace *namespace;
3085 /* signal handlers */
3086 spinlock_t sigmask_lock; /* Protects signal and blocked */
3087 struct signal_struct *sig;
3088 @@ -446,10 +444,13 @@
3090 #define _STK_LIM (8*1024*1024)
3092 -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */
3093 -#define MAX_COUNTER (20*HZ/100)
3094 -#define DEF_NICE (0)
3095 +extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
3096 +extern void set_user_nice(task_t *p, long nice);
3097 +extern int task_prio(task_t *p);
3098 +extern int task_nice(task_t *p);
3100 +asmlinkage long sys_sched_yield(void);
3101 +#define yield() sys_sched_yield()
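Fields that used to be poked directly (p->nice, p->cpus_allowed, the SCHED_YIELD bit) now go through the helpers declared above; the ksoftirqd conversion later in this patch is the in-tree example. An illustrative kernel-thread fragment under those assumptions (daemon_loop and the CPU mask are made up):

#include <linux/sched.h>

static int daemon_loop(void *unused)
{
        set_user_nice(current, 19);             /* was: current->nice = 19 */
        set_cpus_allowed(current, 1UL << 0);    /* pin to CPU 0, migrating if needed */

        while (!signal_pending(current)) {
                /* ... background work ... */
                yield();                        /* the macro expands to sys_sched_yield() */
        }
        return 0;
}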
3104 * The default (Linux) execution domain.
3105 @@ -468,14 +469,14 @@
3106 addr_limit: KERNEL_DS, \
3107 exec_domain: &default_exec_domain, \
3109 - counter: DEF_COUNTER, \
3112 + static_prio: 120, \
3113 policy: SCHED_OTHER, \
3114 + cpus_allowed: -1, \
3116 active_mm: &init_mm, \
3117 - cpus_runnable: -1, \
3118 - cpus_allowed: -1, \
3119 run_list: LIST_HEAD_INIT(tsk.run_list), \
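static_prio: 120 above is the new INIT_TASK default. Assuming the 140-priority layout referenced elsewhere in this patch (100 real-time levels followed by the 40 nice levels; the macro names below are mine, not quoted from the diff), that default corresponds to nice 0:

#include <stdio.h>

#define MAX_RT_PRIO        100                  /* assumed value, not quoted from the patch */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)

int main(void)
{
        printf("nice   0 -> static_prio %d\n", NICE_TO_PRIO(0));    /* 120, the INIT_TASK value */
        printf("nice -20 -> static_prio %d\n", NICE_TO_PRIO(-20));  /* 100 */
        printf("nice  19 -> static_prio %d\n", NICE_TO_PRIO(19));   /* 139 */
        printf("static_prio 120 -> nice %d\n", PRIO_TO_NICE(120));  /* 0 */
        return 0;
}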
3124 @@ -509,24 +510,24 @@
3128 - struct task_struct task;
3130 unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
3133 extern union task_union init_task_union;
3135 extern struct mm_struct init_mm;
3136 -extern struct task_struct *init_tasks[NR_CPUS];
3137 +extern task_t *init_tasks[NR_CPUS];
3139 /* PID hashing. (shouldnt this be dynamic?) */
3140 #define PIDHASH_SZ (4096 >> 2)
3141 -extern struct task_struct *pidhash[PIDHASH_SZ];
3142 +extern task_t *pidhash[PIDHASH_SZ];
3144 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
3146 -static inline void hash_pid(struct task_struct *p)
3147 +static inline void hash_pid(task_t *p)
3149 - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
3150 + task_t **htable = &pidhash[pid_hashfn(p->pid)];
3152 if((p->pidhash_next = *htable) != NULL)
3153 (*htable)->pidhash_pprev = &p->pidhash_next;
3154 @@ -534,16 +535,16 @@
3155 p->pidhash_pprev = htable;
3158 -static inline void unhash_pid(struct task_struct *p)
3159 +static inline void unhash_pid(task_t *p)
3162 p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
3163 *p->pidhash_pprev = p->pidhash_next;
3166 -static inline struct task_struct *find_task_by_pid(int pid)
3167 +static inline task_t *find_task_by_pid(int pid)
3169 - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
3170 + task_t *p, **htable = &pidhash[pid_hashfn(pid)];
3172 for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
3174 @@ -551,19 +552,6 @@
3178 -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
3180 -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
3182 - tsk->processor = cpu;
3183 - tsk->cpus_runnable = 1UL << cpu;
3186 -static inline void task_release_cpu(struct task_struct *tsk)
3188 - tsk->cpus_runnable = ~0UL;
3191 /* per-UID process charging. */
3192 extern struct user_struct * alloc_uid(uid_t);
3193 extern void free_uid(struct user_struct *);
3195 extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
3196 extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
3197 signed long timeout));
3198 -extern int FASTCALL(wake_up_process(struct task_struct * tsk));
3199 +extern int FASTCALL(wake_up_process(task_t * tsk));
3200 +extern void FASTCALL(wake_up_forked_process(task_t * tsk));
3201 +extern void FASTCALL(sched_exit(task_t * p));
3203 #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
3204 #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
3205 @@ -608,28 +598,28 @@
3206 extern int in_egroup_p(gid_t);
3208 extern void proc_caches_init(void);
3209 -extern void flush_signals(struct task_struct *);
3210 -extern void flush_signal_handlers(struct task_struct *);
3211 +extern void flush_signals(task_t *);
3212 +extern void flush_signal_handlers(task_t *);
3213 extern int dequeue_signal(sigset_t *, siginfo_t *);
3214 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
3216 extern void unblock_all_signals(void);
3217 -extern int send_sig_info(int, struct siginfo *, struct task_struct *);
3218 -extern int force_sig_info(int, struct siginfo *, struct task_struct *);
3219 +extern int send_sig_info(int, struct siginfo *, task_t *);
3220 +extern int force_sig_info(int, struct siginfo *, task_t *);
3221 extern int kill_pg_info(int, struct siginfo *, pid_t);
3222 extern int kill_sl_info(int, struct siginfo *, pid_t);
3223 extern int kill_proc_info(int, struct siginfo *, pid_t);
3224 -extern void notify_parent(struct task_struct *, int);
3225 -extern void do_notify_parent(struct task_struct *, int);
3226 -extern void force_sig(int, struct task_struct *);
3227 -extern int send_sig(int, struct task_struct *, int);
3228 +extern void notify_parent(task_t *, int);
3229 +extern void do_notify_parent(task_t *, int);
3230 +extern void force_sig(int, task_t *);
3231 +extern int send_sig(int, task_t *, int);
3232 extern int kill_pg(pid_t, int, int);
3233 extern int kill_sl(pid_t, int, int);
3234 extern int kill_proc(pid_t, int, int);
3235 extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
3236 extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
3238 -static inline int signal_pending(struct task_struct *p)
3239 +static inline int signal_pending(task_t *p)
3241 return (p->sigpending != 0);
3244 This is required every time the blocked sigset_t changes.
3245 All callers should have t->sigmask_lock. */
3247 -static inline void recalc_sigpending(struct task_struct *t)
3248 +static inline void recalc_sigpending(task_t *t)
3250 t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
3252 @@ -775,16 +765,17 @@
3253 extern int expand_fdset(struct files_struct *, int nr);
3254 extern void free_fdset(fd_set *, int);
3256 -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
3257 +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *);
3258 extern void flush_thread(void);
3259 extern void exit_thread(void);
3261 -extern void exit_mm(struct task_struct *);
3262 -extern void exit_files(struct task_struct *);
3263 -extern void exit_sighand(struct task_struct *);
3264 +extern void exit_mm(task_t *);
3265 +extern void exit_files(task_t *);
3266 +extern void exit_sighand(task_t *);
3268 extern void reparent_to_init(void);
3269 extern void daemonize(void);
3270 +extern task_t *child_reaper;
3272 extern int do_execve(char *, char **, char **, struct pt_regs *);
3273 extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
3275 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
3276 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
3278 +extern void wait_task_inactive(task_t * p);
3279 +extern void kick_if_running(task_t * p);
3281 #define __wait_event(wq, condition) \
3283 wait_queue_t __wait; \
3284 @@ -871,24 +865,10 @@
3285 for (p = &init_task ; (p = p->next_task) != &init_task ; )
3287 #define next_thread(p) \
3288 - list_entry((p)->thread_group.next, struct task_struct, thread_group)
3290 -static inline void del_from_runqueue(struct task_struct * p)
3293 - p->sleep_time = jiffies;
3294 - list_del(&p->run_list);
3295 - p->run_list.next = NULL;
3298 -static inline int task_on_runqueue(struct task_struct *p)
3300 - return (p->run_list.next != NULL);
3302 + list_entry((p)->thread_group.next, task_t, thread_group)
3304 -static inline void unhash_process(struct task_struct *p)
3305 +static inline void unhash_process(task_t *p)
3307 - if (task_on_runqueue(p)) BUG();
3308 write_lock_irq(&tasklist_lock);
3311 @@ -898,12 +878,12 @@
3314 /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
3315 -static inline void task_lock(struct task_struct *p)
3316 +static inline void task_lock(task_t *p)
3318 spin_lock(&p->alloc_lock);
3321 -static inline void task_unlock(struct task_struct *p)
3322 +static inline void task_unlock(task_t *p)
3324 spin_unlock(&p->alloc_lock);
3326 --- linux/include/linux/list.h.orig Tue Feb 5 13:51:51 2002
3327 +++ linux/include/linux/list.h Tue Feb 5 13:52:12 2002
3329 struct list_head *next, *prev;
3332 +typedef struct list_head list_t;
3334 #define LIST_HEAD_INIT(name) { &(name), &(name) }
3336 #define LIST_HEAD(name) \
3337 --- linux/include/linux/kernel_stat.h.orig Tue Aug 21 14:26:23 2001
3338 +++ linux/include/linux/kernel_stat.h Tue Feb 5 13:52:12 2002
3340 unsigned int ipackets, opackets;
3341 unsigned int ierrors, oerrors;
3342 unsigned int collisions;
3343 - unsigned int context_swtch;
3346 extern struct kernel_stat kstat;
3348 +extern unsigned long nr_context_switches(void);
3350 #if !defined(CONFIG_ARCH_S390)
3352 --- linux/include/linux/smp.h.orig Sun Dec 31 20:10:17 2000
3353 +++ linux/include/linux/smp.h Tue Feb 5 13:52:12 2002
3355 #define cpu_number_map(cpu) 0
3356 #define smp_call_function(func,info,retry,wait) ({ 0; })
3357 #define cpu_online_map 1
3358 +static inline void smp_send_reschedule(int cpu) { }
3359 +static inline void smp_send_reschedule_all(void) { }
3364 + * Common definitions:
3366 +#define cpu() smp_processor_id()
3369 --- linux/include/asm-i386/smp.h.orig Tue Feb 5 13:51:51 2002
3370 +++ linux/include/asm-i386/smp.h Tue Feb 5 13:52:12 2002
3372 extern void smp_flush_tlb(void);
3373 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
3374 extern void smp_send_reschedule(int cpu);
3375 +extern void smp_send_reschedule_all(void);
3376 extern void smp_invalidate_rcv(void); /* Process an NMI */
3377 extern void (*mtrr_hook) (void);
3378 extern void zap_low_mappings (void);
3380 * so this is correct in the x86 case.
3383 -#define smp_processor_id() (current->processor)
3384 +#define smp_processor_id() (current->cpu)
3386 static __inline int hard_smp_processor_id(void)
3388 @@ -121,18 +122,6 @@
3389 #endif /* !__ASSEMBLY__ */
3391 #define NO_PROC_ID 0xFF /* No processor magic marker */
3394 - * This magic constant controls our willingness to transfer
3395 - * a process across CPUs. Such a transfer incurs misses on the L1
3396 - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
3397 - * gut feeling is this will vary by board in value. For a board
3398 - * with separate L2 cache it probably depends also on the RSS, and
3399 - * for a board with shared L2 cache it ought to decay fast as other
3400 - * processes are run.
3403 -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
3407 --- linux/include/asm-i386/bitops.h.orig Tue Aug 21 14:26:16 2001
3408 +++ linux/include/asm-i386/bitops.h Tue Feb 5 13:52:12 2002
3414 +static __inline__ void __clear_bit(int nr, volatile void * addr)
3416 + __asm__ __volatile__(
3421 #define smp_mb__before_clear_bit() barrier()
3422 #define smp_mb__after_clear_bit() barrier()
3424 @@ -284,6 +292,34 @@
3428 + * find_first_bit - find the first set bit in a memory region
3429 + * @addr: The address to start the search at
3430 + * @size: The maximum size to search
3432 + * Returns the bit-number of the first set bit, not the number of the byte
3433 + * containing a bit.
3435 +static __inline__ int find_first_bit(void * addr, unsigned size)
3440 + /* This looks at memory. Mark it volatile to tell gcc not to move it around */
3441 + __asm__ __volatile__(
3442 + "xorl %%eax,%%eax\n\t"
3445 + "leal -4(%%edi),%%edi\n\t"
3446 + "bsfl (%%edi),%%eax\n"
3447 + "1:\tsubl %%ebx,%%edi\n\t"
3448 + "shll $3,%%edi\n\t"
3449 + "addl %%edi,%%eax"
3450 + :"=a" (res), "=&c" (d0), "=&D" (d1)
3451 + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr));
3456 * find_next_zero_bit - find the first zero bit in a memory region
3457 * @addr: The address to base the search on
3458 * @offset: The bitnumber to start searching at
3463 - * Look for zero in first byte
3464 + * Look for zero in the first 32 bits.
3466 __asm__("bsfl %1,%0\n\t"
3468 @@ -317,6 +353,39 @@
3472 + * find_next_bit - find the first set bit in a memory region
3473 + * @addr: The address to base the search on
3474 + * @offset: The bitnumber to start searching at
3475 + * @size: The maximum size to search
3477 +static __inline__ int find_next_bit (void * addr, int size, int offset)
3479 + unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
3480 + int set = 0, bit = offset & 31, res;
3484 + * Look for nonzero in the first 32 bits:
3486 + __asm__("bsfl %1,%0\n\t"
3491 + : "r" (*p >> bit));
3492 + if (set < (32 - bit))
3493 + return set + offset;
3498 + * No set bit yet, search remaining full words for a bit
3500 + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
3501 + return (offset + set + res);
3505 * ffz - find first zero in word.
3506 * @word: The word to search
3508 @@ -327,6 +396,20 @@
3509 __asm__("bsfl %1,%0"
3516 + * __ffs - find first bit in word.
3517 + * @word: The word to search
3519 + * Undefined if no bit exists, so code should check against 0 first.
3521 +static __inline__ unsigned long __ffs(unsigned long word)
3523 + __asm__("bsfl %1,%0"
3529 --- linux/include/asm-i386/pgalloc.h.orig Tue Feb 5 13:51:51 2002
3530 +++ linux/include/asm-i386/pgalloc.h Tue Feb 5 13:52:12 2002
3533 struct mm_struct *active_mm;
3535 + char __cacheline_padding[24];
3537 extern struct tlb_state cpu_tlbstate[NR_CPUS];
3539 --- linux/include/asm-i386/mmu_context.h.orig Tue Aug 21 14:26:23 2001
3540 +++ linux/include/asm-i386/mmu_context.h Tue Feb 5 13:52:12 2002
3542 #include <asm/pgalloc.h>
3545 + * Every architecture must define this function. It's the fastest
3546 + * way of searching a 140-bit bitmap where the first 100 bits are
3547 + * unlikely to be set. It's guaranteed that at least one of the 140
3548 + * bits is cleared.
3550 +static inline int sched_find_first_bit(unsigned long *b)
3552 + if (unlikely(b[0]))
3553 + return __ffs(b[0]);
3554 + if (unlikely(b[1]))
3555 + return __ffs(b[1]) + 32;
3556 + if (unlikely(b[2]))
3557 + return __ffs(b[2]) + 64;
3558 + if (unlikely(b[3]))
3559 + return __ffs(b[3]) + 96;
3560 + return __ffs(b[4]) + 128;
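sched_find_first_bit() unrolls the five-word search so the common case, no bits set among the first 100 (no runnable RT tasks), costs a couple of predictable branches plus one bsfl. A user-space rendering with a portable __ffs in place of the bsfl wrapper added to bitops.h above (32-bit words, as on i386):

#include <stdio.h>

/* portable stand-in for the x86 __ffs(): index of the lowest set bit,
 * undefined for word == 0, exactly like the bsfl version */
static int my_ffs(unsigned long word)
{
        int bit = 0;

        while (!(word & 1)) {
                word >>= 1;
                bit++;
        }
        return bit;
}

static int my_sched_find_first_bit(unsigned long *b)
{
        if (b[0])
                return my_ffs(b[0]);
        if (b[1])
                return my_ffs(b[1]) + 32;
        if (b[2])
                return my_ffs(b[2]) + 64;
        if (b[3])
                return my_ffs(b[3]) + 96;
        return my_ffs(b[4]) + 128;
}

int main(void)
{
        /* only priority 120 (a nice-0 task) is runnable: the RT words are empty */
        unsigned long bitmap[5] = { 0, 0, 0, 1UL << (120 - 96), 0 };

        printf("first set bit: %d\n", my_sched_find_first_bit(bitmap));  /* 120 */
        return 0;
}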
3564 * possibly do the LDT unload here?
3566 #define destroy_context(mm) do { } while(0)
3569 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
3571 - if (prev != next) {
3572 + if (likely(prev != next)) {
3573 /* stop flush ipis for the previous mm */
3574 clear_bit(cpu, &prev->cpu_vm_mask);
3576 * Re-load LDT if necessary
3578 - if (prev->context.segments != next->context.segments)
3579 + if (unlikely(prev->context.segments != next->context.segments))
3582 cpu_tlbstate[cpu].state = TLBSTATE_OK;
3583 --- linux/include/asm-i386/hw_irq.h.orig Tue Feb 5 13:51:40 2002
3584 +++ linux/include/asm-i386/hw_irq.h Tue Feb 5 13:52:12 2002
3586 #define ERROR_APIC_VECTOR 0xfe
3587 #define INVALIDATE_TLB_VECTOR 0xfd
3588 #define RESCHEDULE_VECTOR 0xfc
3589 -#define CALL_FUNCTION_VECTOR 0xfb
3590 +#define TASK_MIGRATION_VECTOR 0xfb
3591 +#define CALL_FUNCTION_VECTOR 0xfa
3594 * Local APIC timer IRQ vector is on a different priority level,
3595 --- linux/include/asm-i386/apic.h.orig Tue Feb 5 13:51:43 2002
3596 +++ linux/include/asm-i386/apic.h Tue Feb 5 13:52:12 2002
3598 extern void setup_apic_nmi_watchdog (void);
3599 extern inline void nmi_watchdog_tick (struct pt_regs * regs);
3600 extern int APIC_init_uniprocessor (void);
3601 +extern void disable_APIC_timer(void);
3602 +extern void enable_APIC_timer(void);
3604 extern struct pm_dev *apic_pm_register(pm_dev_t, unsigned long, pm_callback);
3605 extern void apic_pm_unregister(struct pm_dev*);
3606 --- linux/net/unix/af_unix.c.orig Tue Feb 5 13:51:53 2002
3607 +++ linux/net/unix/af_unix.c Tue Feb 5 13:52:12 2002
3608 @@ -565,10 +565,8 @@
3610 write_unlock(&unix_table_lock);
3611 /* Sanity yield. It is unusual case, but yet... */
3612 - if (!(ordernum&0xFF)) {
3613 - current->policy |= SCHED_YIELD;
3616 + if (!(ordernum&0xFF))
3620 addr->hash ^= sk->type;
3621 --- linux/net/ipv4/tcp_output.c.orig Tue Feb 5 13:51:51 2002
3622 +++ linux/net/ipv4/tcp_output.c Tue Feb 5 13:52:12 2002
3623 @@ -1009,8 +1009,7 @@
3624 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
3627 - current->policy |= SCHED_YIELD;
3632 /* Reserve space for headers and prepare control bits. */
3633 --- linux/net/sunrpc/sched.c.orig Tue Feb 5 13:51:53 2002
3634 +++ linux/net/sunrpc/sched.c Tue Feb 5 13:52:12 2002
3637 if (flags & RPC_TASK_ASYNC)
3639 - current->policy |= SCHED_YIELD;
3642 } while (!signalled());
3645 @@ -1115,8 +1114,7 @@
3648 dprintk("rpciod_killall: waiting for tasks to exit\n");
3649 - current->policy |= SCHED_YIELD;
3655 @@ -1186,8 +1184,7 @@
3656 * wait briefly before checking the process id.
3658 current->sigpending = 0;
3659 - current->policy |= SCHED_YIELD;
3663 * Display a message if we're going to wait longer.
3665 --- linux/net/sched/sch_generic.c.orig Fri Aug 18 19:26:25 2000
3666 +++ linux/net/sched/sch_generic.c Tue Feb 5 13:52:12 2002
3667 @@ -475,10 +475,8 @@
3669 dev_watchdog_down(dev);
3671 - while (test_bit(__LINK_STATE_SCHED, &dev->state)) {
3672 - current->policy |= SCHED_YIELD;
3675 + while (test_bit(__LINK_STATE_SCHED, &dev->state))
3678 spin_unlock_wait(&dev->xmit_lock);
3680 --- linux/net/socket.c.orig Tue Feb 5 13:51:51 2002
3681 +++ linux/net/socket.c Tue Feb 5 13:52:12 2002
3683 while (atomic_read(&net_family_lockct) != 0) {
3684 spin_unlock(&net_family_lock);
3686 - current->policy |= SCHED_YIELD;
3690 spin_lock(&net_family_lock);
3692 --- linux/drivers/net/slip.c.orig Tue Feb 5 13:51:52 2002
3693 +++ linux/drivers/net/slip.c Tue Feb 5 13:52:12 2002
3694 @@ -1393,10 +1393,8 @@
3695 /* First of all: check for active disciplines and hangup them.
3699 - current->counter = 0;
3703 + sys_sched_yield();
3707 --- linux/drivers/block/loop.c.orig Tue Feb 5 13:51:50 2002
3708 +++ linux/drivers/block/loop.c Tue Feb 5 13:52:12 2002
3710 flush_signals(current);
3711 spin_unlock_irq(&current->sigmask_lock);
3713 - current->policy = SCHED_OTHER;
3714 - current->nice = -20;
3716 spin_lock_irq(&lo->lo_lock);
3717 lo->lo_state = Lo_bound;
3718 atomic_inc(&lo->lo_pending);
3719 --- linux/drivers/char/mwave/mwavedd.c.orig Tue Feb 5 13:51:44 2002
3720 +++ linux/drivers/char/mwave/mwavedd.c Tue Feb 5 13:52:12 2002
3722 pDrvData->IPCs[ipcnum].bIsHere = FALSE;
3723 pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
3724 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
3725 - current->nice = -20; /* boost to provide priority timing */
3727 current->priority = 0x28; /* boost to provide priority timing */
3729 --- linux/drivers/char/drm-4.0/ffb_drv.c.orig Tue Feb 5 13:51:51 2002
3730 +++ linux/drivers/char/drm-4.0/ffb_drv.c Tue Feb 5 13:52:12 2002
3733 atomic_inc(&dev->total_sleeps);
3734 current->state = TASK_INTERRUPTIBLE;
3735 - current->policy |= SCHED_YIELD;
3738 if (signal_pending(current)) {
3741 --- linux/drivers/char/drm-4.0/tdfx_drv.c.orig Tue Feb 5 13:51:52 2002
3742 +++ linux/drivers/char/drm-4.0/tdfx_drv.c Tue Feb 5 13:52:12 2002
3744 lock.context, current->pid, j,
3745 dev->lock.lock_time, jiffies);
3746 current->state = TASK_INTERRUPTIBLE;
3747 - current->policy |= SCHED_YIELD;
3748 schedule_timeout(DRM_LOCK_SLICE-j);
3749 DRM_DEBUG("jiffies=%d\n", jiffies);
3751 @@ -578,10 +577,7 @@
3754 atomic_inc(&dev->total_sleeps);
3756 - current->policy |= SCHED_YIELD;
3760 if (signal_pending(current)) {
3764 when dev->last_context == lock.context
3765 NOTE WE HOLD THE LOCK THROUGHOUT THIS
3767 - current->policy |= SCHED_YIELD;
3770 current->state = TASK_RUNNING;
3771 remove_wait_queue(&dev->context_wait, &entry);
3772 if (signal_pending(current)) {
3773 --- linux/drivers/ide/ataraid.c.orig Tue Feb 5 13:51:46 2002
3774 +++ linux/drivers/ide/ataraid.c Tue Feb 5 13:52:12 2002
3776 ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO);
3778 __set_current_state(TASK_RUNNING);
3779 - current->policy |= SCHED_YIELD;
3786 ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO);
3788 __set_current_state(TASK_RUNNING);
3789 - current->policy |= SCHED_YIELD;
3795 --- linux/drivers/md/md.c.orig Tue Feb 5 13:51:52 2002
3796 +++ linux/drivers/md/md.c Tue Feb 5 13:52:12 2002
3797 @@ -2936,8 +2936,6 @@
3798 * bdflush, otherwise bdflush will deadlock if there are too
3799 * many dirty RAID5 blocks.
3801 - current->policy = SCHED_OTHER;
3802 - current->nice = -20;
3805 complete(thread->event);
3806 @@ -3387,11 +3385,6 @@
3807 "(but not more than %d KB/sec) for reconstruction.\n",
3808 sysctl_speed_limit_max);
3811 - * Resync has low priority.
3813 - current->nice = 19;
3815 is_mddev_idle(mddev); /* this also initializes IO event counters */
3816 for (m = 0; m < SYNC_MARKS; m++) {
3818 @@ -3469,16 +3462,13 @@
3819 currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
3821 if (currspeed > sysctl_speed_limit_min) {
3822 - current->nice = 19;
3824 if ((currspeed > sysctl_speed_limit_max) ||
3825 !is_mddev_idle(mddev)) {
3826 current->state = TASK_INTERRUPTIBLE;
3827 md_schedule_timeout(HZ/4);
3831 - current->nice = -20;
3834 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3836 --- linux/arch/i386/mm/fault.c.orig Tue Feb 5 13:51:51 2002
3837 +++ linux/arch/i386/mm/fault.c Tue Feb 5 13:52:12 2002
3841 if (current->pid == 1) {
3842 - current->policy |= SCHED_YIELD;
3850 up_read(&mm->mmap_sem);
3851 if (tsk->pid == 1) {
3852 - tsk->policy |= SCHED_YIELD;
3855 down_read(&mm->mmap_sem);
3858 --- linux/arch/i386/kernel/smpboot.c.orig Tue Feb 5 13:51:49 2002
3859 +++ linux/arch/i386/kernel/smpboot.c Tue Feb 5 13:52:12 2002
3860 @@ -308,14 +308,14 @@
3861 if (tsc_values[i] < avg)
3862 realdelta = -realdelta;
3864 - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
3866 + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
3872 printk("passed.\n");
3876 static void __init synchronize_tsc_ap (void)
3878 * (This works even if the APIC is not enabled.)
3880 phys_id = GET_APIC_ID(apic_read(APIC_ID));
3881 - cpuid = current->processor;
3883 if (test_and_set_bit(cpuid, &cpu_online_map)) {
3884 printk("huh, phys CPU#%d, CPU#%d already present??\n",
3888 smp_store_cpu_info(cpuid);
3890 + disable_APIC_timer();
3892 * Allow the master to continue.
3896 while (!atomic_read(&smp_commenced))
3898 + enable_APIC_timer();
3900 * low-memory mappings have been cleared, flush them from
3901 * the local TLBs too.
3902 @@ -803,16 +805,13 @@
3904 panic("No idle process for CPU %d", cpu);
3906 - idle->processor = cpu;
3907 - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
3908 + init_idle(idle, cpu);
3910 map_cpu_to_boot_apicid(cpu, apicid);
3912 idle->thread.eip = (unsigned long) start_secondary;
3914 - del_from_runqueue(idle);
3915 unhash_process(idle);
3916 - init_tasks[cpu] = idle;
3918 /* start_eip had better be page-aligned! */
3919 start_eip = setup_trampoline();
3923 cycles_t cacheflush_time;
3924 +unsigned long cache_decay_ticks;
3926 static void smp_tune_scheduling (void)
3928 @@ -958,9 +958,13 @@
3929 cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
3932 + cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
3934 printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
3935 (long)cacheflush_time/(cpu_khz/1000),
3936 ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
3937 + printk("task migration cache decay timeout: %ld msecs.\n",
3938 + (cache_decay_ticks + 1) * 1000 / HZ);
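smp_tune_scheduling() derives the new cache_decay_ticks from the measured cacheflush_time: cycles divided by cpu_khz gives milliseconds, which are then scaled to timer ticks, and the printk reports (ticks + 1) converted back to milliseconds. A worked example of that arithmetic with made-up figures:

#include <stdio.h>

int main(void)
{
        long cpu_khz = 1000000;         /* 1 GHz CPU: 1,000,000 cycles per millisecond */
        long cacheflush_time = 2000000; /* measured flush cost in cycles (made up) */
        long hz = 100;                  /* timer frequency */

        /* the expression used above: cycles -> milliseconds -> ticks */
        long cache_decay_ticks = cacheflush_time / cpu_khz * hz / 1000;

        printf("cache_decay_ticks = %ld\n", cache_decay_ticks);        /* 2 ms -> 0 ticks */
        printf("reported decay timeout = %ld msecs\n",
               (cache_decay_ticks + 1) * 1000 / hz);                   /* printk shows 10 */
        return 0;
}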
3942 @@ -1020,8 +1024,7 @@
3943 map_cpu_to_boot_apicid(0, boot_cpu_apicid);
3945 global_irq_holder = 0;
3946 - current->processor = 0;
3949 smp_tune_scheduling();
3952 --- linux/arch/i386/kernel/process.c.orig Tue Feb 5 13:51:51 2002
3953 +++ linux/arch/i386/kernel/process.c Tue Feb 5 13:52:12 2002
3954 @@ -123,15 +123,12 @@
3955 void cpu_idle (void)
3957 /* endless idle loop with no priority at all */
3959 - current->nice = 20;
3960 - current->counter = -100;
3963 void (*idle)(void) = pm_idle;
3965 idle = default_idle;
3966 - while (!current->need_resched)
3967 + if (!current->need_resched)
3971 @@ -694,15 +691,17 @@
3972 asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
3975 - * Restore %fs and %gs.
3976 + * Restore %fs and %gs if needed.
3978 - loadsegment(fs, next->fs);
3979 - loadsegment(gs, next->gs);
3980 + if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
3981 + loadsegment(fs, next->fs);
3982 + loadsegment(gs, next->gs);
3986 * Now maybe reload the debug registers
3988 - if (next->debugreg[7]){
3989 + if (unlikely(next->debugreg[7])) {
3997 - if (prev->ioperm || next->ioperm) {
3998 + if (unlikely(prev->ioperm || next->ioperm)) {
4001 * 4 cachelines copy ... not good, but not that
4002 --- linux/arch/i386/kernel/apic.c.orig Tue Feb 5 13:51:51 2002
4003 +++ linux/arch/i386/kernel/apic.c Tue Feb 5 13:52:12 2002
4007 slice = clocks / (smp_num_cpus+1);
4008 - printk("cpu: %d, clocks: %d, slice: %d\n",
4009 - smp_processor_id(), clocks, slice);
4010 + printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice);
4013 * Wait for IRQ0's slice:
4016 __setup_APIC_LVTT(clocks);
4018 - printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n",
4019 - smp_processor_id(), t0, t1, delta, slice, clocks);
4020 + printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", smp_processor_id(), t0, t1, delta, slice, clocks);
4022 __restore_flags(flags);
4024 @@ -922,6 +920,26 @@
4026 /* and update all other cpus */
4027 smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1);
4030 +void __init disable_APIC_timer(void)
4032 + if (using_apic_timer) {
4035 + v = apic_read(APIC_LVTT);
4036 + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
4040 +void enable_APIC_timer(void)
4042 + if (using_apic_timer) {
4045 + v = apic_read(APIC_LVTT);
4046 + apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
4051 --- linux/arch/i386/kernel/nmi.c.orig Tue Feb 5 13:51:36 2002
4052 +++ linux/arch/i386/kernel/nmi.c Tue Feb 5 13:52:12 2002
4054 * to get a message out.
4057 - printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
4058 + printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
4059 show_registers(regs);
4060 printk("console shuts up ...\n");
4062 --- linux/arch/i386/kernel/smp.c.orig Tue Feb 5 13:51:49 2002
4063 +++ linux/arch/i386/kernel/smp.c Tue Feb 5 13:52:12 2002
4065 /* The 'big kernel lock' */
4066 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
4068 -struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }};
4069 +struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }};
4072 * the following functions deal with sending IPIs between CPUs.
4073 @@ -485,15 +485,54 @@
4074 do_flush_tlb_all_local();
4077 +static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED;
4078 +static task_t *new_task;
4081 + * This function sends a 'task migration' IPI to another CPU.
4082 + * Must be called from syscall contexts, with interrupts *enabled*.
4084 +void smp_migrate_task(int cpu, task_t *p)
4087 + * The target CPU will unlock the migration spinlock:
4089 + spin_lock(&migration_lock);
4091 + send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR);
4095 + * Task migration callback.
4097 +asmlinkage void smp_task_migration_interrupt(void)
4103 + spin_unlock(&migration_lock);
4104 + sched_task_migrated(p);
4107 * this function sends a 'reschedule' IPI to another CPU.
4108 * it goes straight through and wastes no time serializing
4109 * anything. Worst case is that we lose a reschedule ...
4112 void smp_send_reschedule(int cpu)
4114 send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
4118 + * this function sends a reschedule IPI to all (other) CPUs.
4119 + * This should only be used if some 'global' task became runnable,
4120 + * such as a RT task, that must be handled now. The first CPU
4121 + * that manages to grab the task will run it.
4123 +void smp_send_reschedule_all(void)
4125 + send_IPI_allbutself(RESCHEDULE_VECTOR);
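smp_migrate_task() hands a task to another CPU through a single shared slot: the sender holds migration_lock across the IPI and it is the target CPU's interrupt handler that releases it, so only one migration is in flight at a time and new_task cannot be overwritten. A user-space model of the handshake, with C11 atomics standing in for the spinlock and the IPI:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag slot_busy = ATOMIC_FLAG_INIT;   /* plays the role of migration_lock */
static _Atomic(void *) slot_task;                  /* plays the role of new_task */

static void smp_migrate_task_model(void *task)     /* sender side */
{
        while (atomic_flag_test_and_set(&slot_busy))
                ;                                  /* spin, like spin_lock() */
        atomic_store(&slot_task, task);
        /* here the kernel sends TASK_MIGRATION_VECTOR to the target CPU */
}

static void task_migration_interrupt_model(void)   /* target CPU side */
{
        void *task = atomic_load(&slot_task);

        atomic_flag_clear(&slot_busy);             /* the target releases the "lock" */
        printf("would activate task %p on this CPU's runqueue\n", task);
}

int main(void)
{
        int t;

        smp_migrate_task_model(&t);
        task_migration_interrupt_model();
        return 0;
}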
4129 --- linux/arch/i386/kernel/i8259.c.orig Tue Feb 5 13:51:36 2002
4130 +++ linux/arch/i386/kernel/i8259.c Tue Feb 5 13:52:12 2002
4132 * through the ICC by us (IPIs)
4135 +BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR)
4136 BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
4137 BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
4138 BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
4140 * IPI, driven by wakeup.
4142 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
4144 + /* IPI for task migration */
4145 + set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt);
4147 /* IPI for invalidation */
4148 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
4149 --- linux/arch/i386/kernel/entry.S.orig Tue Feb 5 13:51:51 2002
4150 +++ linux/arch/i386/kernel/entry.S Tue Feb 5 13:52:12 2002
4160 @@ -176,9 +176,11 @@
4163 ENTRY(ret_from_fork)
4166 call SYMBOL_NAME(schedule_tail)
4170 testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
4172 --- linux/arch/i386/kernel/setup.c.orig Tue Feb 5 13:51:51 2002
4173 +++ linux/arch/i386/kernel/setup.c Tue Feb 5 13:52:12 2002
4174 @@ -2924,9 +2924,10 @@
4179 - * Clear all 6 debug registers:
4181 + /* Clear %fs and %gs. */
4182 + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
4184 + /* Clear all 6 debug registers: */
4186 #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );