--- /dev/null
+--- linux/fs/proc/proc_misc.c.orig Tue Feb 5 13:51:49 2002
++++ linux/fs/proc/proc_misc.c Tue Feb 5 13:52:12 2002
+@@ -85,11 +85,11 @@
+ a = avenrun[0] + (FIXED_1/200);
+ b = avenrun[1] + (FIXED_1/200);
+ c = avenrun[2] + (FIXED_1/200);
+- len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
++ len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
+ LOAD_INT(a), LOAD_FRAC(a),
+ LOAD_INT(b), LOAD_FRAC(b),
+ LOAD_INT(c), LOAD_FRAC(c),
+- nr_running, nr_threads, last_pid);
++ nr_running(), nr_threads, last_pid);
+ return proc_calc_metrics(page, start, off, count, eof, len);
+ }
+
+@@ -101,7 +101,7 @@
+ int len;
+
+ uptime = jiffies;
+- idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
++ idle = init_task.times.tms_utime + init_task.times.tms_stime;
+
+ /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
+ that would overflow about every five days at HZ == 100.
+@@ -303,10 +303,10 @@
+ }
+
+ len += sprintf(page + len,
+- "\nctxt %u\n"
++ "\nctxt %lu\n"
+ "btime %lu\n"
+ "processes %lu\n",
+- kstat.context_swtch,
++ nr_context_switches(),
+ xtime.tv_sec - jif / HZ,
+ total_forks);
+
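For reference: the loadavg line in the context above keeps the kernel's fixed-point representation; what changes is that nr_running is now a function (added in the kernel/sched.c part of this patch) returning an unsigned long sum of the per-CPU runqueue counts, hence the format-string change. The LOAD_INT()/LOAD_FRAC() arithmetic can be reproduced in userspace. This sketch is illustrative only; it assumes the stock FSHIFT = 11 definitions from <linux/sched.h> and an invented avenrun sample, none of which is part of the patch:

#include <stdio.h>

/* assumed to match <linux/sched.h> (FSHIFT is 11 on 2.4/2.5 kernels) */
#define FSHIFT          11
#define FIXED_1         (1 << FSHIFT)
#define LOAD_INT(x)     ((x) >> FSHIFT)
#define LOAD_FRAC(x)    LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long avenrun = 1167;              /* roughly a load of 0.57 */
        unsigned long a = avenrun + FIXED_1 / 200; /* round to the nearest 1/100 */

        printf("%lu.%02lu\n", LOAD_INT(a), LOAD_FRAC(a));  /* prints "0.57" */
        return 0;
}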
+--- linux/fs/proc/array.c.orig Tue Feb 5 13:51:45 2002
++++ linux/fs/proc/array.c Tue Feb 5 13:52:12 2002
+@@ -335,9 +335,8 @@
+
+ /* scale priority and nice values from timeslices to -20..20 */
+ /* to make it look like a "normal" Unix priority/nice value */
+- priority = task->counter;
+- priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
+- nice = task->nice;
++ priority = task_prio(task);
++ nice = task_nice(task);
+
+ read_lock(&tasklist_lock);
+ ppid = task->pid ? task->p_opptr->pid : 0;
+@@ -387,7 +386,7 @@
+ task->nswap,
+ task->cnswap,
+ task->exit_signal,
+- task->processor);
++ task->cpu);
+ if(mm)
+ mmput(mm);
+ return res;
+--- linux/fs/nfs/pagelist.c.orig Tue Feb 5 13:51:50 2002
++++ linux/fs/nfs/pagelist.c Tue Feb 5 13:52:12 2002
+@@ -96,8 +96,7 @@
+ continue;
+ if (signalled() && (server->flags & NFS_MOUNT_INTR))
+ return ERR_PTR(-ERESTARTSYS);
+- current->policy = SCHED_YIELD;
+- schedule();
++ yield();
+ }
+
+ /* Initialize the request struct. Initially, we assume a
+--- linux/fs/ufs/truncate.c.orig Tue Feb 5 13:51:53 2002
++++ linux/fs/ufs/truncate.c Tue Feb 5 13:52:12 2002
+@@ -448,10 +448,7 @@
+ if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
+ ufs_sync_inode (inode);
+ run_task_queue(&tq_disk);
+- current->policy |= SCHED_YIELD;
+- schedule ();
+-
+-
++ yield();
+ }
+ offset = inode->i_size & uspi->s_fshift;
+ if (offset) {
+--- linux/fs/reiserfs/buffer2.c.orig Tue Feb 5 13:51:51 2002
++++ linux/fs/reiserfs/buffer2.c Tue Feb 5 13:52:12 2002
+@@ -33,8 +33,7 @@
+ buffer_journal_dirty(bh) ? ' ' : '!');
+ }
+ run_task_queue(&tq_disk);
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ }
+ if (repeat_counter > 30000000) {
+ reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ;
+@@ -52,11 +51,11 @@
+ struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size)
+ {
+ struct buffer_head *result;
+- PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
++ PROC_EXP( unsigned int ctx_switches = nr_context_switches(); );
+
+ result = bread (super -> s_dev, n_block, n_size);
+ PROC_INFO_INC( super, breads );
+- PROC_EXP( if( kstat.context_swtch != ctx_switches )
++ PROC_EXP( if( nr_context_switches() != ctx_switches )
+ PROC_INFO_INC( super, bread_miss ) );
+ return result;
+ }
+--- linux/fs/reiserfs/journal.c.orig Tue Feb 5 13:51:53 2002
++++ linux/fs/reiserfs/journal.c Tue Feb 5 13:52:12 2002
+@@ -149,8 +149,7 @@
+ }
+ bn = allocate_bitmap_node(p_s_sb) ;
+ if (!bn) {
+- current->policy |= SCHED_YIELD ;
+- schedule() ;
++ yield();
+ goto repeat ;
+ }
+ return bn ;
+--- linux/fs/jffs2/background.c.orig Tue Feb 5 13:51:47 2002
++++ linux/fs/jffs2/background.c Tue Feb 5 13:52:12 2002
+@@ -106,9 +106,6 @@
+
+ sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
+
+- /* FIXME in the 2.2 backport */
+- current->nice = 10;
+-
+ for (;;) {
+ spin_lock_irq(&current->sigmask_lock);
+ siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
+--- linux/fs/jbd/journal.c.orig Tue Feb 5 13:51:53 2002
++++ linux/fs/jbd/journal.c Tue Feb 5 13:52:12 2002
+@@ -460,8 +460,7 @@
+ printk (KERN_NOTICE __FUNCTION__
+ ": ENOMEM at get_unused_buffer_head, "
+ "trying again.\n");
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ }
+ } while (!new_bh);
+ /* keep subsequent assertions sane */
+@@ -1541,8 +1540,7 @@
+ last_warning = jiffies;
+ }
+
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ }
+ }
+
+@@ -1600,8 +1598,7 @@
+ last_warning = jiffies;
+ }
+ while (ret == 0) {
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+ }
+ }
+--- linux/fs/jbd/revoke.c.orig Tue Feb 5 13:51:53 2002
++++ linux/fs/jbd/revoke.c Tue Feb 5 13:52:12 2002
+@@ -137,8 +137,7 @@
+ if (!journal_oom_retry)
+ return -ENOMEM;
+ jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ goto repeat;
+ }
+
+--- linux/fs/jbd/transaction.c.orig Tue Feb 5 13:51:53 2002
++++ linux/fs/jbd/transaction.c Tue Feb 5 13:52:12 2002
+@@ -1379,8 +1379,7 @@
+ do {
+ old_handle_count = transaction->t_handle_count;
+ set_current_state(TASK_RUNNING);
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ } while (old_handle_count != transaction->t_handle_count);
+ }
+
+--- linux/fs/binfmt_elf.c.orig Tue Feb 5 13:51:53 2002
++++ linux/fs/binfmt_elf.c Tue Feb 5 13:52:12 2002
+@@ -1135,7 +1135,7 @@
+ psinfo.pr_state = i;
+ psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
+ psinfo.pr_zomb = psinfo.pr_sname == 'Z';
+- psinfo.pr_nice = current->nice;
++ psinfo.pr_nice = task_nice(current);
+ psinfo.pr_flag = current->flags;
+ psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
+ psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
+--- linux/fs/buffer.c.orig Tue Feb 5 13:51:53 2002
++++ linux/fs/buffer.c Tue Feb 5 13:52:12 2002
+@@ -735,9 +735,8 @@
+ wakeup_bdflush();
+ try_to_free_pages(zone, GFP_NOFS, 0);
+ run_task_queue(&tq_disk);
+- current->policy |= SCHED_YIELD;
+ __set_current_state(TASK_RUNNING);
+- schedule();
++ sys_sched_yield();
+ }
+
+ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+--- linux/fs/locks.c.orig Tue Feb 5 13:51:45 2002
++++ linux/fs/locks.c Tue Feb 5 13:52:12 2002
+@@ -445,8 +445,7 @@
+ /* Let the blocked process remove waiter from the
+ * block list when it gets scheduled.
+ */
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ } else {
+ /* Remove waiter from the block list, because by the
+ * time it wakes up blocker won't exist any more.
+--- linux/init/main.c.orig Tue Feb 5 13:51:53 2002
++++ linux/init/main.c Tue Feb 5 13:52:12 2002
+@@ -485,8 +485,6 @@
+ extern void setup_arch(char **);
+ extern void cpu_idle(void);
+
+-unsigned long wait_init_idle;
+-
+ #ifndef CONFIG_SMP
+
+ #ifdef CONFIG_X86_LOCAL_APIC
+@@ -495,34 +493,24 @@
+ APIC_init_uniprocessor();
+ }
+ #else
+-#define smp_init() do { } while (0)
++#define smp_init() do { } while (0)
+ #endif
+
+ #else
+
+-
+ /* Called by boot processor to activate the rest. */
+ static void __init smp_init(void)
+ {
+ /* Get other processors into their bootup holding patterns. */
+ smp_boot_cpus();
+- wait_init_idle = cpu_online_map;
+- clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
+
+ smp_threads_ready=1;
+ smp_commence();
+-
+- /* Wait for the other cpus to set up their idle processes */
+- printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
+- while (wait_init_idle) {
+- cpu_relax();
+- barrier();
+- }
+- printk("All processors have done init_idle\n");
+ }
+
+ #endif
+
++
+ /*
+ * We need to finalize in a non-__init function or else race conditions
+ * between the root thread and the init thread may cause start_kernel to
+@@ -534,9 +522,8 @@
+ {
+ kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+ unlock_kernel();
+- current->need_resched = 1;
+- cpu_idle();
+-}
++ cpu_idle();
++}
+
+ /*
+ * Activate the first processor.
+@@ -617,14 +604,18 @@
+ ipc_init();
+ #endif
+ check_bugs();
++
+ printk("POSIX conformance testing by UNIFIX\n");
+
+- /*
+- * We count on the initial thread going ok
+- * Like idlers init is an unlocked kernel thread, which will
+- * make syscalls (and thus be locked).
++ init_idle(current, smp_processor_id());
++ /*
++ * We count on the initial thread going ok
++ * Like idlers init is an unlocked kernel thread, which will
++ * make syscalls (and thus be locked).
+ */
+ smp_init();
++
++ /* Do the rest non-__init'ed, we're now alive */
+ rest_init();
+ }
+
+@@ -785,12 +776,9 @@
+ int i, pid;
+
+ pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
+- if (pid > 0) {
+- while (pid != wait(&i)) {
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
+- }
++ if (pid > 0)
++ while (pid != wait(&i))
++ yield();
+ if (MAJOR(real_root_dev) != RAMDISK_MAJOR
+ || MINOR(real_root_dev) != 0) {
+ error = change_root(real_root_dev,"/initrd");
+--- linux/kernel/sched.c.orig Tue Feb 5 13:51:51 2002
++++ linux/kernel/sched.c Tue Feb 5 13:52:12 2002
+@@ -12,333 +12,306 @@
+ * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
+ */
+
+-/*
+- * 'sched.c' is the main kernel file. It contains scheduling primitives
+- * (sleep_on, wakeup, schedule etc) as well as a number of simple system
+- * call functions (type getpid()), which just extract a field from
+- * current-task
+- */
+-
+-#include <linux/config.h>
+ #include <linux/mm.h>
++#include <linux/nmi.h>
+ #include <linux/init.h>
++#include <asm/uaccess.h>
+ #include <linux/smp_lock.h>
+-#include <linux/nmi.h>
+ #include <linux/interrupt.h>
+-#include <linux/kernel_stat.h>
+-#include <linux/completion.h>
+-#include <linux/prefetch.h>
+-#include <linux/compiler.h>
+-
+-#include <asm/uaccess.h>
+ #include <asm/mmu_context.h>
+-
+-extern void timer_bh(void);
+-extern void tqueue_bh(void);
+-extern void immediate_bh(void);
++#include <linux/kernel_stat.h>
+
+ /*
+- * scheduler variables
++ * Priority of a process goes from 0 to 139. The 0-99
++ * priority range is allocated to RT tasks, the 100-139
++ * range is for SCHED_OTHER tasks. Priority values are
++ * inverted: lower p->prio value means higher priority.
+ */
+-
+-unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
+-
+-extern void mem_use(void);
++#define MAX_RT_PRIO 100
++#define MAX_PRIO (MAX_RT_PRIO + 40)
+
+ /*
+- * Scheduling quanta.
+- *
+- * NOTE! The unix "nice" value influences how long a process
+- * gets. The nice value ranges from -20 to +19, where a -20
+- * is a "high-priority" task, and a "+10" is a low-priority
+- * task.
+- *
+- * We want the time-slice to be around 50ms or so, so this
+- * calculation depends on the value of HZ.
++ * Convert user-nice values [ -20 ... 0 ... 19 ]
++ * to static priority [ 100 ... 139 (MAX_PRIO-1) ],
++ * and back.
+ */
+-#if HZ < 200
+-#define TICK_SCALE(x) ((x) >> 2)
+-#elif HZ < 400
+-#define TICK_SCALE(x) ((x) >> 1)
+-#elif HZ < 800
+-#define TICK_SCALE(x) (x)
+-#elif HZ < 1600
+-#define TICK_SCALE(x) ((x) << 1)
+-#else
+-#define TICK_SCALE(x) ((x) << 2)
+-#endif
+-
+-#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
++#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
++#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
++#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
+
++/*
++ * 'User priority' is the nice value converted to something we
++ * can work with better when scaling various scheduler parameters,
++ * it's a [ 0 ... 39 ] range.
++ */
++#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
++#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
++#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+
+ /*
+- * Init task must be ok at boot for the ix86 as we will check its signals
+- * via the SMP irq return path.
++ * These are the 'tuning knobs' of the scheduler:
++ *
++ * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
++ * maximum timeslice is 300 msecs. Timeslices get refilled after
++ * they expire.
+ */
+-
+-struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
++#define MIN_TIMESLICE ( 10 * HZ / 1000)
++#define MAX_TIMESLICE (300 * HZ / 1000)
++#define CHILD_PENALTY 95
++#define PARENT_PENALTY 100
++#define EXIT_WEIGHT 3
++#define PRIO_BONUS_RATIO 25
++#define INTERACTIVE_DELTA 2
++#define MAX_SLEEP_AVG (2*HZ)
++#define STARVATION_LIMIT (2*HZ)
+
+ /*
+- * The tasklist_lock protects the linked list of processes.
++ * If a task is 'interactive' then we reinsert it in the active
++ * array after it has expired its current timeslice. (it will not
++ * continue to run immediately, it will still roundrobin with
++ * other interactive tasks.)
+ *
+- * The runqueue_lock locks the parts that actually access
+- * and change the run-queues, and have to be interrupt-safe.
++ * This part scales the interactivity limit depending on niceness.
+ *
+- * If both locks are to be concurrently held, the runqueue_lock
+- * nests inside the tasklist_lock.
++ * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
++ * Here are a few examples of different nice levels:
+ *
+- * task->alloc_lock nests inside tasklist_lock.
++ * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
++ * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
++ * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
++ * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
++ * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
++ *
++ * (the X axis represents the possible -5 ... 0 ... +5 dynamic
++ * priority range a task can explore, a value of '1' means the
++ * task is rated interactive.)
++ *
++ * Ie. nice +19 tasks can never get 'interactive' enough to be
++ * reinserted into the active array. And only heavy CPU-hog nice -20
++ * tasks will be expired. Default nice 0 tasks are somewhere between,
++ * it takes some effort for them to get interactive, but it's not
++ * too hard.
+ */
+-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
+-rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
+
+-static LIST_HEAD(runqueue_head);
++#define SCALE(v1,v1_max,v2_max) \
++ (v1) * (v2_max) / (v1_max)
+
+-/*
+- * We align per-CPU scheduling data on cacheline boundaries,
+- * to prevent cacheline ping-pong.
+- */
+-static union {
+- struct schedule_data {
+- struct task_struct * curr;
+- cycles_t last_schedule;
+- } schedule_data;
+- char __pad [SMP_CACHE_BYTES];
+-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
++#define DELTA(p) \
++ (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
++ INTERACTIVE_DELTA)
+
+-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
+-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
++#define TASK_INTERACTIVE(p) \
++ ((p)->prio <= (p)->static_prio - DELTA(p))
+
+-struct kernel_stat kstat;
+-extern struct task_struct *child_reaper;
++/*
++ * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
++ * to time slice values.
++ *
++ * The higher a process's priority, the bigger timeslices
++ * it gets during one round of execution. But even the lowest
++ * priority process gets MIN_TIMESLICE worth of execution time.
++ */
+
+-#ifdef CONFIG_SMP
++#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
++ ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
+
+-#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
+-#define can_schedule(p,cpu) \
+- ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
++/*
++ * These are the runqueue data structures:
++ */
+
+-#else
++#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+
+-#define idle_task(cpu) (&init_task)
+-#define can_schedule(p,cpu) (1)
++typedef struct runqueue runqueue_t;
+
+-#endif
+-
+-void scheduling_functions_start_here(void) { }
++struct prio_array {
++ int nr_active;
++ spinlock_t *lock;
++ runqueue_t *rq;
++ unsigned long bitmap[BITMAP_SIZE];
++ list_t queue[MAX_PRIO];
++};
+
+ /*
+- * This is the function that decides how desirable a process is..
+- * You can weigh different processes against each other depending
+- * on what CPU they've run on lately etc to try to handle cache
+- * and TLB miss penalties.
++ * This is the main, per-CPU runqueue data structure.
+ *
+- * Return values:
+- * -1000: never select this
+- * 0: out of time, recalculate counters (but it might still be
+- * selected)
+- * +ve: "goodness" value (the larger, the better)
+- * +1000: realtime process, select this.
++ * Locking rule: code that wants to lock multiple runqueues
++ * (such as the load balancing or the process migration code) must
++ * acquire the locks in ascending &runqueue order.
+ */
++struct runqueue {
++ spinlock_t lock;
++ unsigned long nr_running, nr_switches, expired_timestamp;
++ task_t *curr, *idle;
++ prio_array_t *active, *expired, arrays[2];
++ int prev_nr_running[NR_CPUS];
++} ____cacheline_aligned;
+
+-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+-{
+- int weight;
++static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
+
+- /*
+- * select the current process after every other
+- * runnable process, but before the idle thread.
+- * Also, dont trigger a counter recalculation.
+- */
+- weight = -1;
+- if (p->policy & SCHED_YIELD)
+- goto out;
++#define cpu_rq(cpu) (runqueues + (cpu))
++#define this_rq() cpu_rq(smp_processor_id())
++#define task_rq(p) cpu_rq((p)->cpu)
++#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
++#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
+
+- /*
+- * Non-RT process - normal case first.
+- */
+- if (p->policy == SCHED_OTHER) {
+- /*
+- * Give the process a first-approximation goodness value
+- * according to the number of clock-ticks it has left.
+- *
+- * Don't do any other calculations if the time slice is
+- * over..
+- */
+- weight = p->counter;
+- if (!weight)
+- goto out;
+-
+-#ifdef CONFIG_SMP
+- /* Give a largish advantage to the same processor... */
+- /* (this is equivalent to penalizing other processors) */
+- if (p->processor == this_cpu)
+- weight += PROC_CHANGE_PENALTY;
+-#endif
++static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags)
++{
++ struct runqueue *__rq;
+
+- /* .. and a slight advantage to the current MM */
+- if (p->mm == this_mm || !p->mm)
+- weight += 1;
+- weight += 20 - p->nice;
+- goto out;
++repeat_lock_task:
++ __rq = task_rq(p);
++ spin_lock_irqsave(&__rq->lock, *flags);
++ if (unlikely(__rq != task_rq(p))) {
++ spin_unlock_irqrestore(&__rq->lock, *flags);
++ goto repeat_lock_task;
+ }
++ return __rq;
++}
+
+- /*
+- * Realtime process, select the first one on the
+- * runqueue (taking priorities within processes
+- * into account).
+- */
+- weight = 1000 + p->rt_priority;
+-out:
+- return weight;
++static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags)
++{
++ spin_unlock_irqrestore(&rq->lock, *flags);
+ }
+
+ /*
+- * the 'goodness value' of replacing a process on a given CPU.
+- * positive value means 'replace', zero or negative means 'dont'.
++ * Adding/removing a task to/from a priority array:
+ */
+-static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
++static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
+ {
+- return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
++ array->nr_active--;
++ list_del_init(&p->run_list);
++ if (list_empty(array->queue + p->prio))
++ __clear_bit(p->prio, array->bitmap);
+ }
+
+-/*
+- * This is ugly, but reschedule_idle() is very timing-critical.
+- * We are called with the runqueue spinlock held and we must
+- * not claim the tasklist_lock.
+- */
+-static FASTCALL(void reschedule_idle(struct task_struct * p));
++static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
++{
++ list_add_tail(&p->run_list, array->queue + p->prio);
++ __set_bit(p->prio, array->bitmap);
++ array->nr_active++;
++ p->array = array;
++}
+
+-static void reschedule_idle(struct task_struct * p)
++static inline int effective_prio(task_t *p)
+ {
+-#ifdef CONFIG_SMP
+- int this_cpu = smp_processor_id();
+- struct task_struct *tsk, *target_tsk;
+- int cpu, best_cpu, i, max_prio;
+- cycles_t oldest_idle;
+-
+- /*
+- * shortcut if the woken up task's last CPU is
+- * idle now.
+- */
+- best_cpu = p->processor;
+- if (can_schedule(p, best_cpu)) {
+- tsk = idle_task(best_cpu);
+- if (cpu_curr(best_cpu) == tsk) {
+- int need_resched;
+-send_now_idle:
+- /*
+- * If need_resched == -1 then we can skip sending
+- * the IPI altogether, tsk->need_resched is
+- * actively watched by the idle thread.
+- */
+- need_resched = tsk->need_resched;
+- tsk->need_resched = 1;
+- if ((best_cpu != this_cpu) && !need_resched)
+- smp_send_reschedule(best_cpu);
+- return;
+- }
+- }
++ int bonus, prio;
+
+ /*
+- * We know that the preferred CPU has a cache-affine current
+- * process, lets try to find a new idle CPU for the woken-up
+- * process. Select the least recently active idle CPU. (that
+- * one will have the least active cache context.) Also find
+- * the executing process which has the least priority.
+- */
+- oldest_idle = (cycles_t) -1;
+- target_tsk = NULL;
+- max_prio = 0;
++ * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
++ * into the -5 ... 0 ... +5 bonus/penalty range.
++ *
++ * We use 25% of the full 0...39 priority range so that:
++ *
++ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
++ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
++ *
++ * Both properties are important to certain workloads.
++ */
++ bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
++ MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+
+- for (i = 0; i < smp_num_cpus; i++) {
+- cpu = cpu_logical_map(i);
+- if (!can_schedule(p, cpu))
+- continue;
+- tsk = cpu_curr(cpu);
++ prio = p->static_prio - bonus;
++ if (prio < MAX_RT_PRIO)
++ prio = MAX_RT_PRIO;
++ if (prio > MAX_PRIO-1)
++ prio = MAX_PRIO-1;
++ return prio;
++}
++
++static inline void activate_task(task_t *p, runqueue_t *rq)
++{
++ unsigned long sleep_time = jiffies - p->sleep_timestamp;
++ prio_array_t *array = rq->active;
++
++ if (!rt_task(p) && sleep_time) {
+ /*
+- * We use the first available idle CPU. This creates
+- * a priority list between idle CPUs, but this is not
+- * a problem.
++ * This code gives a bonus to interactive tasks. We update
++ * an 'average sleep time' value here, based on
++ * sleep_timestamp. The more time a task spends sleeping,
++ * the higher the average gets - and the higher the priority
++ * boost gets as well.
+ */
+- if (tsk == idle_task(cpu)) {
+-#if defined(__i386__) && defined(CONFIG_SMP)
+- /*
+- * Check if two siblings are idle in the same
+- * physical package. Use them if found.
+- */
+- if (smp_num_siblings == 2) {
+- if (cpu_curr(cpu_sibling_map[cpu]) ==
+- idle_task(cpu_sibling_map[cpu])) {
+- oldest_idle = last_schedule(cpu);
+- target_tsk = tsk;
+- break;
+- }
+-
+- }
+-#endif
+- if (last_schedule(cpu) < oldest_idle) {
+- oldest_idle = last_schedule(cpu);
+- target_tsk = tsk;
+- }
+- } else {
+- if (oldest_idle == -1ULL) {
+- int prio = preemption_goodness(tsk, p, cpu);
+-
+- if (prio > max_prio) {
+- max_prio = prio;
+- target_tsk = tsk;
+- }
+- }
+- }
+- }
+- tsk = target_tsk;
+- if (tsk) {
+- if (oldest_idle != -1ULL) {
+- best_cpu = tsk->processor;
+- goto send_now_idle;
+- }
+- tsk->need_resched = 1;
+- if (tsk->processor != this_cpu)
+- smp_send_reschedule(tsk->processor);
++ p->sleep_avg += sleep_time;
++ if (p->sleep_avg > MAX_SLEEP_AVG)
++ p->sleep_avg = MAX_SLEEP_AVG;
++ p->prio = effective_prio(p);
+ }
+- return;
+-
++ enqueue_task(p, array);
++ rq->nr_running++;
++}
+
+-#else /* UP */
+- int this_cpu = smp_processor_id();
+- struct task_struct *tsk;
+-
+- tsk = cpu_curr(this_cpu);
+- if (preemption_goodness(tsk, p, this_cpu) > 0)
+- tsk->need_resched = 1;
+-#endif
++static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
++{
++ rq->nr_running--;
++ dequeue_task(p, p->array);
++ p->array = NULL;
+ }
+
++static inline void resched_task(task_t *p)
++{
++ int need_resched;
++
++ need_resched = p->need_resched;
++ wmb();
++ p->need_resched = 1;
++ if (!need_resched && (p->cpu != smp_processor_id()))
++ smp_send_reschedule(p->cpu);
++}
++
++#ifdef CONFIG_SMP
++
+ /*
+- * Careful!
+- *
+- * This has to add the process to the _beginning_ of the
+- * run-queue, not the end. See the comment about "This is
+- * subtle" in the scheduler proper..
++ * Wait for a process to unschedule. This is used by the exit() and
++ * ptrace() code.
+ */
+-static inline void add_to_runqueue(struct task_struct * p)
++void wait_task_inactive(task_t * p)
+ {
+- list_add(&p->run_list, &runqueue_head);
+- nr_running++;
++ unsigned long flags;
++ runqueue_t *rq;
++
++repeat:
++ rq = task_rq(p);
++ while (unlikely(rq->curr == p)) {
++ cpu_relax();
++ barrier();
++ }
++ rq = lock_task_rq(p, &flags);
++ if (unlikely(rq->curr == p)) {
++ unlock_task_rq(rq, &flags);
++ goto repeat;
++ }
++ unlock_task_rq(rq, &flags);
+ }
+
+-static inline void move_last_runqueue(struct task_struct * p)
++/*
++ * The SMP message passing code calls this function whenever
++ * the new task has arrived at the target CPU. We move the
++ * new task into the local runqueue.
++ *
++ * This function must be called with interrupts disabled.
++ */
++void sched_task_migrated(task_t *new_task)
+ {
+- list_del(&p->run_list);
+- list_add_tail(&p->run_list, &runqueue_head);
++ wait_task_inactive(new_task);
++ new_task->cpu = smp_processor_id();
++ wake_up_process(new_task);
+ }
+
+-static inline void move_first_runqueue(struct task_struct * p)
++/*
++ * Kick the remote CPU if the task is running currently,
++ * this code is used by the signal code to signal tasks
++ * which are in user-mode as quickly as possible.
++ *
++ * (Note that we do this lockless - if the task does anything
++ * while the message is in flight then it will notice the
++ * sigpending condition anyway.)
++ */
++void kick_if_running(task_t * p)
+ {
+- list_del(&p->run_list);
+- list_add(&p->run_list, &runqueue_head);
++ if (p == task_rq(p)->curr)
++ resched_task(p);
+ }
++#endif
+
+ /*
+ * Wake up a process. Put it on the run-queue if it's not
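The hunk above defines the whole static-priority and timeslice model. The numbers it produces can be checked with a small userspace program that replicates NICE_TO_PRIO(), TASK_TIMESLICE() and the effective_prio() bonus arithmetic. This is an illustrative sketch only, assuming HZ = 100; nothing in it is part of the patch itself:

#include <stdio.h>

#define HZ               100    /* assumed; substitute the arch's actual value */
#define MAX_RT_PRIO      100
#define MAX_PRIO         (MAX_RT_PRIO + 40)
#define MAX_USER_PRIO    40
#define PRIO_BONUS_RATIO 25
#define MAX_SLEEP_AVG    (2 * HZ)
#define MIN_TIMESLICE    ( 10 * HZ / 1000)
#define MAX_TIMESLICE    (300 * HZ / 1000)

#define NICE_TO_PRIO(nice)  (MAX_RT_PRIO + (nice) + 20)
#define TASK_TIMESLICE(sp)  (MIN_TIMESLICE + \
        ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO - 1 - (sp)) / 39))

/* same arithmetic as effective_prio(), including the [100,139] clamp */
static int eff_prio(int static_prio, int sleep_avg)
{
        int bonus = MAX_USER_PRIO * PRIO_BONUS_RATIO * sleep_avg / MAX_SLEEP_AVG / 100 -
                    MAX_USER_PRIO * PRIO_BONUS_RATIO / 100 / 2;
        int prio = static_prio - bonus;

        if (prio < MAX_RT_PRIO)
                prio = MAX_RT_PRIO;
        if (prio > MAX_PRIO - 1)
                prio = MAX_PRIO - 1;
        return prio;
}

int main(void)
{
        int nices[] = { -20, 0, 19 };
        int i;

        for (i = 0; i < 3; i++) {
                int sp = NICE_TO_PRIO(nices[i]);

                printf("nice %3d: static_prio %3d, timeslice %2d ticks (%3d ms), "
                       "dynamic prio %d..%d\n",
                       nices[i], sp, TASK_TIMESLICE(sp),
                       TASK_TIMESLICE(sp) * 1000 / HZ,
                       eff_prio(sp, MAX_SLEEP_AVG), eff_prio(sp, 0));
        }
        return 0;
}

Under those assumptions a nice 0 task gets a 15-tick (150 ms) slice and a dynamic priority between 115 and 125, and nice -20 and nice +19 tasks get 300 ms and 10 ms slices respectively, matching the tuning-knob comment above.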
+@@ -348,392 +321,528 @@
+ * "current->state = TASK_RUNNING" to mark yourself runnable
+ * without the overhead of this.
+ */
+-static inline int try_to_wake_up(struct task_struct * p, int synchronous)
++static int try_to_wake_up(task_t * p, int synchronous)
+ {
+ unsigned long flags;
+ int success = 0;
++ runqueue_t *rq;
+
+- /*
+- * We want the common case fall through straight, thus the goto.
+- */
+- spin_lock_irqsave(&runqueue_lock, flags);
++ rq = lock_task_rq(p, &flags);
+ p->state = TASK_RUNNING;
+- if (task_on_runqueue(p))
+- goto out;
+- add_to_runqueue(p);
+- if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
+- reschedule_idle(p);
+- success = 1;
+-out:
+- spin_unlock_irqrestore(&runqueue_lock, flags);
++ if (!p->array) {
++ activate_task(p, rq);
++ if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio))
++ resched_task(rq->curr);
++ success = 1;
++ }
++ unlock_task_rq(rq, &flags);
+ return success;
+ }
+
+-inline int wake_up_process(struct task_struct * p)
++int wake_up_process(task_t * p)
+ {
+ return try_to_wake_up(p, 0);
+ }
+
+-static void process_timeout(unsigned long __data)
++void wake_up_forked_process(task_t * p)
+ {
+- struct task_struct * p = (struct task_struct *) __data;
++ runqueue_t *rq = this_rq();
+
+- wake_up_process(p);
++ p->state = TASK_RUNNING;
++ if (!rt_task(p)) {
++ /*
++ * We decrease the sleep average of forking parents
++ * and children as well, to keep max-interactive tasks
++ * from forking tasks that are max-interactive.
++ */
++ current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
++ p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
++ p->prio = effective_prio(p);
++ }
++ spin_lock_irq(&rq->lock);
++ p->cpu = smp_processor_id();
++ activate_task(p, rq);
++ spin_unlock_irq(&rq->lock);
+ }
+
+-/**
+- * schedule_timeout - sleep until timeout
+- * @timeout: timeout value in jiffies
+- *
+- * Make the current task sleep until @timeout jiffies have
+- * elapsed. The routine will return immediately unless
+- * the current task state has been set (see set_current_state()).
+- *
+- * You can set the task state as follows -
+- *
+- * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+- * pass before the routine returns. The routine will return 0
+- *
+- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+- * delivered to the current task. In this case the remaining time
+- * in jiffies will be returned, or 0 if the timer expired in time
+- *
+- * The current task state is guaranteed to be TASK_RUNNING when this
+- * routine returns.
+- *
+- * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+- * the CPU away without a bound on the timeout. In this case the return
+- * value will be %MAX_SCHEDULE_TIMEOUT.
++/*
++ * Potentially available exiting-child timeslices are
++ * retrieved here - this way the parent does not get
++ * penalized for creating too many processes.
+ *
+- * In all cases the return value is guaranteed to be non-negative.
++ * (this cannot be used to 'generate' timeslices
++ * artificially, because any timeslice recovered here
++ * was given away by the parent in the first place.)
+ */
+-signed long schedule_timeout(signed long timeout)
++void sched_exit(task_t * p)
+ {
+- struct timer_list timer;
+- unsigned long expire;
++ __cli();
++ current->time_slice += p->time_slice;
++ if (unlikely(current->time_slice > MAX_TIMESLICE))
++ current->time_slice = MAX_TIMESLICE;
++ __sti();
++ /*
++ * If the child was a (relative-) CPU hog then decrease
++ * the sleep_avg of the parent as well.
++ */
++ if (p->sleep_avg < current->sleep_avg)
++ current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
++ p->sleep_avg) / (EXIT_WEIGHT + 1);
++}
+
+- switch (timeout)
+- {
+- case MAX_SCHEDULE_TIMEOUT:
+- /*
+- * These two special cases are useful to be comfortable
+- * in the caller. Nothing more. We could take
+- * MAX_SCHEDULE_TIMEOUT from one of the negative value
+- * but I' d like to return a valid offset (>=0) to allow
+- * the caller to do everything it want with the retval.
+- */
+- schedule();
+- goto out;
+- default:
+- /*
+- * Another bit of PARANOID. Note that the retval will be
+- * 0 since no piece of kernel is supposed to do a check
+- * for a negative retval of schedule_timeout() (since it
+- * should never happens anyway). You just have the printk()
+- * that will tell you if something is gone wrong and where.
+- */
+- if (timeout < 0)
+- {
+- printk(KERN_ERR "schedule_timeout: wrong timeout "
+- "value %lx from %p\n", timeout,
+- __builtin_return_address(0));
+- current->state = TASK_RUNNING;
+- goto out;
+- }
+- }
++#if CONFIG_SMP
++asmlinkage void schedule_tail(task_t *prev)
++{
++ spin_unlock_irq(&this_rq()->lock);
++}
++#endif
+
+- expire = timeout + jiffies;
++static inline void context_switch(task_t *prev, task_t *next)
++{
++ struct mm_struct *mm = next->mm;
++ struct mm_struct *oldmm = prev->active_mm;
+
+- init_timer(&timer);
+- timer.expires = expire;
+- timer.data = (unsigned long) current;
+- timer.function = process_timeout;
++ prepare_to_switch();
+
+- add_timer(&timer);
+- schedule();
+- del_timer_sync(&timer);
++ if (unlikely(!mm)) {
++ next->active_mm = oldmm;
++ atomic_inc(&oldmm->mm_count);
++ enter_lazy_tlb(oldmm, next, smp_processor_id());
++ } else
++ switch_mm(oldmm, mm, next, smp_processor_id());
+
+- timeout = expire - jiffies;
++ if (unlikely(!prev->mm)) {
++ prev->active_mm = NULL;
++ mmdrop(oldmm);
++ }
+
+- out:
+- return timeout < 0 ? 0 : timeout;
++ /*
++ * Here we just switch the register state and the stack. There are
++ * 3 processes affected by a context switch:
++ *
++ * prev ==> .... ==> (last => next)
++ *
++ * It's the 'much more previous' 'prev' that is on next's stack,
++ * but prev is set to (the just run) 'last' process by switch_to().
++ * This might sound slightly confusing but makes tons of sense.
++ */
++ switch_to(prev, next, prev);
+ }
+
+-/*
+- * schedule_tail() is getting called from the fork return path. This
+- * cleans up all remaining scheduler things, without impacting the
+- * common case.
+- */
+-static inline void __schedule_tail(struct task_struct *prev)
++unsigned long nr_running(void)
+ {
+-#ifdef CONFIG_SMP
+- int policy;
+-
+- /*
+- * prev->policy can be written from here only before `prev'
+- * can be scheduled (before setting prev->cpus_runnable to ~0UL).
+- * Of course it must also be read before allowing prev
+- * to be rescheduled, but since the write depends on the read
+- * to complete, wmb() is enough. (the spin_lock() acquired
+- * before setting cpus_runnable is not enough because the spin_lock()
+- * common code semantics allows code outside the critical section
+- * to enter inside the critical section)
+- */
+- policy = prev->policy;
+- prev->policy = policy & ~SCHED_YIELD;
+- wmb();
++ unsigned long i, sum = 0;
+
+- /*
+- * fast path falls through. We have to clear cpus_runnable before
+- * checking prev->state to avoid a wakeup race. Protect against
+- * the task exiting early.
+- */
+- task_lock(prev);
+- task_release_cpu(prev);
+- mb();
+- if (prev->state == TASK_RUNNING)
+- goto needs_resched;
++ for (i = 0; i < smp_num_cpus; i++)
++ sum += cpu_rq(cpu_logical_map(i))->nr_running;
+
+-out_unlock:
+- task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
+- return;
++ return sum;
++}
+
+- /*
+- * Slow path - we 'push' the previous process and
+- * reschedule_idle() will attempt to find a new
+- * processor for it. (but it might preempt the
+- * current process as well.) We must take the runqueue
+- * lock and re-check prev->state to be correct. It might
+- * still happen that this process has a preemption
+- * 'in progress' already - but this is not a problem and
+- * might happen in other circumstances as well.
+- */
+-needs_resched:
+- {
+- unsigned long flags;
++unsigned long nr_context_switches(void)
++{
++ unsigned long i, sum = 0;
+
+- /*
+- * Avoid taking the runqueue lock in cases where
+- * no preemption-check is necessery:
+- */
+- if ((prev == idle_task(smp_processor_id())) ||
+- (policy & SCHED_YIELD))
+- goto out_unlock;
++ for (i = 0; i < smp_num_cpus; i++)
++ sum += cpu_rq(cpu_logical_map(i))->nr_switches;
+
+- spin_lock_irqsave(&runqueue_lock, flags);
+- if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
+- reschedule_idle(prev);
+- spin_unlock_irqrestore(&runqueue_lock, flags);
+- goto out_unlock;
+- }
+-#else
+- prev->policy &= ~SCHED_YIELD;
+-#endif /* CONFIG_SMP */
++ return sum;
+ }
+
+-asmlinkage void schedule_tail(struct task_struct *prev)
++#if CONFIG_SMP
++/*
++ * Lock the busiest runqueue as well, this_rq is locked already.
++ * Recalculate nr_running if we have to drop the runqueue lock.
++ */
++static inline unsigned int double_lock_balance(runqueue_t *this_rq,
++ runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
+ {
+- __schedule_tail(prev);
++ if (unlikely(!spin_trylock(&busiest->lock))) {
++ if (busiest < this_rq) {
++ spin_unlock(&this_rq->lock);
++ spin_lock(&busiest->lock);
++ spin_lock(&this_rq->lock);
++ /* Need to recalculate nr_running */
++ if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
++ nr_running = this_rq->nr_running;
++ else
++ nr_running = this_rq->prev_nr_running[this_cpu];
++ } else
++ spin_lock(&busiest->lock);
++ }
++ return nr_running;
+ }
+
+ /*
+- * 'schedule()' is the scheduler function. It's a very simple and nice
+- * scheduler: it's not perfect, but certainly works for most things.
+- *
+- * The goto is "interesting".
++ * Current runqueue is empty, or rebalance tick: if there is an
++ * imbalance (current runqueue is too short) then pull from
++ * busiest runqueue(s).
+ *
+- * NOTE!! Task 0 is the 'idle' task, which gets called when no other
+- * tasks can run. It can not be killed, and it cannot sleep. The 'state'
+- * information in task[0] is never used.
++ * We call this with the current runqueue locked,
++ * irqs disabled.
+ */
+-asmlinkage void schedule(void)
++static void load_balance(runqueue_t *this_rq, int idle)
+ {
+- struct schedule_data * sched_data;
+- struct task_struct *prev, *next, *p;
+- struct list_head *tmp;
+- int this_cpu, c;
++ int imbalance, nr_running, load, max_load,
++ idx, i, this_cpu = smp_processor_id();
++ task_t *next = this_rq->idle, *tmp;
++ runqueue_t *busiest, *rq_src;
++ prio_array_t *array;
++ list_t *head, *curr;
+
++ /*
++ * We search all runqueues to find the most busy one.
++ * We do this lockless to reduce cache-bouncing overhead,
++ * we re-check the 'best' source CPU later on again, with
++ * the lock held.
++ *
++ * We fend off statistical fluctuations in runqueue lengths by
++ * saving the runqueue length during the previous load-balancing
++ * operation and using the smaller one of the current and saved lengths.
++ * If a runqueue is long enough for a longer amount of time then
++ * we recognize it and pull tasks from it.
++ *
++ * The 'current runqueue length' is a statistical maximum variable,
++ * for that one we take the longer one - to avoid fluctuations in
++ * the other direction. So for a load-balance to happen it needs
++ * stable long runqueue on the target CPU and stable short runqueue
++ * on the local runqueue.
++ *
++ * We make an exception if this CPU is about to become idle - in
++ * that case we are less picky about moving a task across CPUs and
++ * take what can be taken.
++ */
++ if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
++ nr_running = this_rq->nr_running;
++ else
++ nr_running = this_rq->prev_nr_running[this_cpu];
+
+- spin_lock_prefetch(&runqueue_lock);
++ busiest = NULL;
++ max_load = 1;
++ for (i = 0; i < smp_num_cpus; i++) {
++ rq_src = cpu_rq(cpu_logical_map(i));
++ if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
++ load = rq_src->nr_running;
++ else
++ load = this_rq->prev_nr_running[i];
++ this_rq->prev_nr_running[i] = rq_src->nr_running;
++
++ if ((load > max_load) && (rq_src != this_rq)) {
++ busiest = rq_src;
++ max_load = load;
++ }
++ }
+
+- if (!current->active_mm) BUG();
+-need_resched_back:
+- prev = current;
+- this_cpu = prev->processor;
++ if (likely(!busiest))
++ return;
+
+- if (unlikely(in_interrupt())) {
+- printk("Scheduling in interrupt\n");
+- BUG();
+- }
++ imbalance = (max_load - nr_running) / 2;
+
+- release_kernel_lock(prev, this_cpu);
++ /* It needs at least a ~25% imbalance to trigger balancing. */
++ if (!idle && (imbalance < (max_load + 3)/4))
++ return;
+
++ nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
+ /*
+- * 'sched_data' is protected by the fact that we can run
+- * only one process per CPU.
++ * Make sure nothing changed since we checked the
++ * runqueue length.
+ */
+- sched_data = & aligned_data[this_cpu].schedule_data;
++ if (busiest->nr_running <= this_rq->nr_running + 1)
++ goto out_unlock;
+
+- spin_lock_irq(&runqueue_lock);
++ /*
++ * We first consider expired tasks. Those will likely not be
++ * executed in the near future, and they are most likely to
++ * be cache-cold, thus switching CPUs has the least effect
++ * on them.
++ */
++ if (busiest->expired->nr_active)
++ array = busiest->expired;
++ else
++ array = busiest->active;
+
+- /* move an exhausted RR process to be last.. */
+- if (unlikely(prev->policy == SCHED_RR))
+- if (!prev->counter) {
+- prev->counter = NICE_TO_TICKS(prev->nice);
+- move_last_runqueue(prev);
++new_array:
++ /* Start searching at priority 0: */
++ idx = 0;
++skip_bitmap:
++ if (!idx)
++ idx = sched_find_first_bit(array->bitmap);
++ else
++ idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
++ if (idx == MAX_PRIO) {
++ if (array == busiest->expired) {
++ array = busiest->active;
++ goto new_array;
+ }
+-
+- switch (prev->state) {
+- case TASK_INTERRUPTIBLE:
+- if (signal_pending(prev)) {
+- prev->state = TASK_RUNNING;
+- break;
+- }
+- default:
+- del_from_runqueue(prev);
+- case TASK_RUNNING:;
++ goto out_unlock;
+ }
+- prev->need_resched = 0;
+-
+- /*
+- * this is the scheduler proper:
+- */
+
+-repeat_schedule:
+- /*
+- * Default process to select..
+- */
+- next = idle_task(this_cpu);
+- c = -1000;
+- list_for_each(tmp, &runqueue_head) {
+- p = list_entry(tmp, struct task_struct, run_list);
+- if (can_schedule(p, this_cpu)) {
+- int weight = goodness(p, this_cpu, prev->active_mm);
+- if (weight > c)
+- c = weight, next = p;
++ head = array->queue + idx;
++ curr = head->prev;
++skip_queue:
++ tmp = list_entry(curr, task_t, run_list);
++
++ /*
++ * We do not migrate tasks that are:
++ * 1) running (obviously), or
++ * 2) cannot be migrated to this CPU due to cpus_allowed, or
++ * 3) are cache-hot on their current CPU.
++ */
++
++#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
++ ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
++ ((p) != (rq)->curr) && \
++ (tmp->cpus_allowed & (1 << (this_cpu))))
++
++ if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
++ curr = curr->next;
++ if (curr != head)
++ goto skip_queue;
++ idx++;
++ goto skip_bitmap;
++ }
++ next = tmp;
++ /*
++ * take the task out of the other runqueue and
++ * put it into this one:
++ */
++ dequeue_task(next, array);
++ busiest->nr_running--;
++ next->cpu = this_cpu;
++ this_rq->nr_running++;
++ enqueue_task(next, this_rq->active);
++ if (next->prio < current->prio)
++ current->need_resched = 1;
++ if (!idle && --imbalance) {
++ if (array == busiest->expired) {
++ array = busiest->active;
++ goto new_array;
+ }
+ }
++out_unlock:
++ spin_unlock(&busiest->lock);
++}
++
++/*
++ * Either the idle_cpu_tick() or the busy_cpu_tick() function gets
++ * called every timer tick, on every CPU. Our balancing action
++ * frequency and balancing aggressiveness depend on whether the CPU is
++ * idle or not.
++ *
++ * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
++ * systems with HZ=100, every 10 msecs.)
++ */
++#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
++#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
++
++static inline void idle_tick(void)
++{
++ if (jiffies % IDLE_REBALANCE_TICK)
++ return;
++ spin_lock(&this_rq()->lock);
++ load_balance(this_rq(), 1);
++ spin_unlock(&this_rq()->lock);
++}
++
++#endif
+
+- /* Do we need to re-calculate counters? */
+- if (unlikely(!c)) {
+- struct task_struct *p;
+-
+- spin_unlock_irq(&runqueue_lock);
+- read_lock(&tasklist_lock);
+- for_each_task(p)
+- p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+- read_unlock(&tasklist_lock);
+- spin_lock_irq(&runqueue_lock);
+- goto repeat_schedule;
++/*
++ * We place interactive tasks back into the active array, if possible.
++ *
++ * To guarantee that this does not starve expired tasks we ignore the
++ * interactivity of a task if the first expired task had to wait more
++ * than a 'reasonable' amount of time. This deadline timeout is
++ * load-dependent, as the frequency of array switches decreases with
++ * increasing number of running tasks:
++ */
++#define EXPIRED_STARVING(rq) \
++ ((rq)->expired_timestamp && \
++ (jiffies - (rq)->expired_timestamp >= \
++ STARVATION_LIMIT * ((rq)->nr_running) + 1))
++
++/*
++ * This function gets called by the timer code, with HZ frequency.
++ * We call it with interrupts disabled.
++ */
++void scheduler_tick(int user_tick, int system)
++{
++ int cpu = smp_processor_id();
++ runqueue_t *rq = this_rq();
++ task_t *p = current;
++
++ if (p == rq->idle) {
++ if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
++ kstat.per_cpu_system[cpu] += system;
++#if CONFIG_SMP
++ idle_tick();
++#endif
++ return;
+ }
++ if (TASK_NICE(p) > 0)
++ kstat.per_cpu_nice[cpu] += user_tick;
++ else
++ kstat.per_cpu_user[cpu] += user_tick;
++ kstat.per_cpu_system[cpu] += system;
+
++ /* Task might have expired already, but not scheduled off yet */
++ if (p->array != rq->active) {
++ p->need_resched = 1;
++ return;
++ }
++ spin_lock(&rq->lock);
++ if (unlikely(rt_task(p))) {
++ /*
++ * RR tasks need a special form of timeslice management.
++ * FIFO tasks have no timeslices.
++ */
++ if ((p->policy == SCHED_RR) && !--p->time_slice) {
++ p->time_slice = TASK_TIMESLICE(p);
++ p->need_resched = 1;
++
++ /* put it at the end of the queue: */
++ dequeue_task(p, rq->active);
++ enqueue_task(p, rq->active);
++ }
++ goto out;
++ }
+ /*
+- * from this point on nothing can prevent us from
+- * switching to the next task, save this fact in
+- * sched_data.
+- */
+- sched_data->curr = next;
+- task_set_cpu(next, this_cpu);
+- spin_unlock_irq(&runqueue_lock);
+-
+- if (unlikely(prev == next)) {
+- /* We won't go through the normal tail, so do this by hand */
+- prev->policy &= ~SCHED_YIELD;
+- goto same_process;
++ * The task was running during this tick - update the
++ * time slice counter and the sleep average. Note: we
++ * do not update a process's priority until it either
++ * goes to sleep or uses up its timeslice. This makes
++ * it possible for interactive tasks to use up their
++ * timeslices at their highest priority levels.
++ */
++ if (p->sleep_avg)
++ p->sleep_avg--;
++ if (!--p->time_slice) {
++ dequeue_task(p, rq->active);
++ p->need_resched = 1;
++ p->prio = effective_prio(p);
++ p->time_slice = TASK_TIMESLICE(p);
++
++ if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
++ if (!rq->expired_timestamp)
++ rq->expired_timestamp = jiffies;
++ enqueue_task(p, rq->expired);
++ } else
++ enqueue_task(p, rq->active);
+ }
++out:
++#if CONFIG_SMP
++ if (!(jiffies % BUSY_REBALANCE_TICK))
++ load_balance(rq, 0);
++#endif
++ spin_unlock(&rq->lock);
++}
+
+-#ifdef CONFIG_SMP
+- /*
+- * maintain the per-process 'last schedule' value.
+- * (this has to be recalculated even if we reschedule to
+- * the same process) Currently this is only used on SMP,
+- * and it's approximate, so we do not have to maintain
+- * it while holding the runqueue spinlock.
+- */
+- sched_data->last_schedule = get_cycles();
++void scheduling_functions_start_here(void) { }
+
+- /*
+- * We drop the scheduler lock early (it's a global spinlock),
+- * thus we have to lock the previous process from getting
+- * rescheduled during switch_to().
+- */
++/*
++ * 'schedule()' is the main scheduler function.
++ */
++asmlinkage void schedule(void)
++{
++ task_t *prev = current, *next;
++ runqueue_t *rq = this_rq();
++ prio_array_t *array;
++ list_t *queue;
++ int idx;
+
+-#endif /* CONFIG_SMP */
++ if (unlikely(in_interrupt()))
++ BUG();
++ release_kernel_lock(prev, smp_processor_id());
++ prev->sleep_timestamp = jiffies;
++ spin_lock_irq(&rq->lock);
+
+- kstat.context_swtch++;
+- /*
+- * there are 3 processes which are affected by a context switch:
+- *
+- * prev == .... ==> (last => next)
+- *
+- * It's the 'much more previous' 'prev' that is on next's stack,
+- * but prev is set to (the just run) 'last' process by switch_to().
+- * This might sound slightly confusing but makes tons of sense.
+- */
+- prepare_to_switch();
+- {
+- struct mm_struct *mm = next->mm;
+- struct mm_struct *oldmm = prev->active_mm;
+- if (!mm) {
+- if (next->active_mm) BUG();
+- next->active_mm = oldmm;
+- atomic_inc(&oldmm->mm_count);
+- enter_lazy_tlb(oldmm, next, this_cpu);
+- } else {
+- if (next->active_mm != mm) BUG();
+- switch_mm(oldmm, mm, next, this_cpu);
++ switch (prev->state) {
++ case TASK_INTERRUPTIBLE:
++ if (unlikely(signal_pending(prev))) {
++ prev->state = TASK_RUNNING;
++ break;
+ }
++ default:
++ deactivate_task(prev, rq);
++ case TASK_RUNNING:
++ ;
++ }
++#if CONFIG_SMP
++pick_next_task:
++#endif
++ if (unlikely(!rq->nr_running)) {
++#if CONFIG_SMP
++ load_balance(rq, 1);
++ if (rq->nr_running)
++ goto pick_next_task;
++#endif
++ next = rq->idle;
++ rq->expired_timestamp = 0;
++ goto switch_tasks;
++ }
+
+- if (!prev->mm) {
+- prev->active_mm = NULL;
+- mmdrop(oldmm);
+- }
++ array = rq->active;
++ if (unlikely(!array->nr_active)) {
++ /*
++ * Switch the active and expired arrays.
++ */
++ rq->active = rq->expired;
++ rq->expired = array;
++ array = rq->active;
++ rq->expired_timestamp = 0;
+ }
+
+- /*
+- * This just switches the register state and the
+- * stack.
+- */
+- switch_to(prev, next, prev);
+- __schedule_tail(prev);
++ idx = sched_find_first_bit(array->bitmap);
++ queue = array->queue + idx;
++ next = list_entry(queue->next, task_t, run_list);
++
++switch_tasks:
++ prefetch(next);
++ prev->need_resched = 0;
++
++ if (likely(prev != next)) {
++ rq->nr_switches++;
++ rq->curr = next;
++ context_switch(prev, next);
++ /*
++ * The runqueue pointer might be from another CPU
++ * if the new task was last running on a different
++ * CPU - thus re-load it.
++ */
++ barrier();
++ rq = this_rq();
++ }
++ spin_unlock_irq(&rq->lock);
+
+-same_process:
+ reacquire_kernel_lock(current);
+- if (current->need_resched)
+- goto need_resched_back;
+ return;
+ }
+
+ /*
+- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything
+- * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
+- * non-exclusive tasks and one exclusive task.
++ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
++ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
++ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
+- * in this (rare) case, and we handle it by contonuing to scan the queue.
++ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
++ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+ static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, const int sync)
+ {
+ struct list_head *tmp;
+- struct task_struct *p;
++ task_t *p;
+
+- CHECK_MAGIC_WQHEAD(q);
+- WQ_CHECK_LIST_HEAD(&q->task_list);
+-
+ list_for_each(tmp,&q->task_list) {
+ unsigned int state;
+- wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
++ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+
+- CHECK_MAGIC(curr->__magic);
+ p = curr->task;
+ state = p->state;
+- if (state & mode) {
+- WQ_NOTE_WAKER(curr);
+- if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+- break;
+- }
++ if ((state & mode) &&
++ try_to_wake_up(p, sync) &&
++ ((curr->flags & WQ_FLAG_EXCLUSIVE) &&
++ !--nr_exclusive))
++ break;
+ }
+ }
+
+@@ -850,8 +959,71 @@
+ return timeout;
+ }
+
++/*
++ * Change the current task's CPU affinity. Migrate the process to a
++ * proper CPU and schedule away if the current CPU is removed from
++ * the allowed bitmask.
++ */
++void set_cpus_allowed(task_t *p, unsigned long new_mask)
++{
++ new_mask &= cpu_online_map;
++ if (!new_mask)
++ BUG();
++ if (p != current)
++ BUG();
++
++ p->cpus_allowed = new_mask;
++ /*
++ * Can the task run on the current CPU? If not then
++ * migrate the process off to a proper CPU.
++ */
++ if (new_mask & (1UL << smp_processor_id()))
++ return;
++#if CONFIG_SMP
++ current->state = TASK_UNINTERRUPTIBLE;
++ smp_migrate_task(__ffs(new_mask), current);
++
++ schedule();
++#endif
++}
++
+ void scheduling_functions_end_here(void) { }
+
++void set_user_nice(task_t *p, long nice)
++{
++ unsigned long flags;
++ prio_array_t *array;
++ runqueue_t *rq;
++
++ if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
++ return;
++ /*
++ * We have to be careful, if called from sys_setpriority(),
++ * the task might be in the middle of scheduling on another CPU.
++ */
++ rq = lock_task_rq(p, &flags);
++ if (rt_task(p)) {
++ p->static_prio = NICE_TO_PRIO(nice);
++ goto out_unlock;
++ }
++ array = p->array;
++ if (array)
++ dequeue_task(p, array);
++ p->static_prio = NICE_TO_PRIO(nice);
++ p->prio = NICE_TO_PRIO(nice);
++ if (array) {
++ enqueue_task(p, array);
++ /*
++ * If the task is running and lowered its priority,
++ * or increased its priority then reschedule its CPU:
++ */
++ if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr))
++ resched_task(rq->curr);
++ }
++out_unlock:
++ unlock_task_rq(rq, &flags);
++}
++
+ #ifndef __alpha__
+
+ /*
+@@ -862,7 +1034,7 @@
+
+ asmlinkage long sys_nice(int increment)
+ {
+- long newprio;
++ long nice;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+@@ -878,32 +1050,46 @@
+ if (increment > 40)
+ increment = 40;
+
+- newprio = current->nice + increment;
+- if (newprio < -20)
+- newprio = -20;
+- if (newprio > 19)
+- newprio = 19;
+- current->nice = newprio;
++ nice = PRIO_TO_NICE(current->static_prio) + increment;
++ if (nice < -20)
++ nice = -20;
++ if (nice > 19)
++ nice = 19;
++ set_user_nice(current, nice);
+ return 0;
+ }
+
+ #endif
+
+-static inline struct task_struct *find_process_by_pid(pid_t pid)
++/*
++ * This is the priority value as seen by users in /proc
++ *
++ * RT tasks are offset by -200. Normal tasks are centered
++ * around 0, value goes from -16 to +15.
++ */
++int task_prio(task_t *p)
+ {
+- struct task_struct *tsk = current;
++ return p->prio - 100;
++}
+
+- if (pid)
+- tsk = find_task_by_pid(pid);
+- return tsk;
++int task_nice(task_t *p)
++{
++ return TASK_NICE(p);
++}
++
++static inline task_t *find_process_by_pid(pid_t pid)
++{
++ return pid ? find_task_by_pid(pid) : current;
+ }
+
+-static int setscheduler(pid_t pid, int policy,
+- struct sched_param *param)
++static int setscheduler(pid_t pid, int policy, struct sched_param *param)
+ {
+ struct sched_param lp;
+- struct task_struct *p;
++ prio_array_t *array;
++ unsigned long flags;
++ runqueue_t *rq;
+ int retval;
++ task_t *p;
+
+ retval = -EINVAL;
+ if (!param || pid < 0)
+@@ -917,14 +1103,19 @@
+ * We play safe to avoid deadlocks.
+ */
+ read_lock_irq(&tasklist_lock);
+- spin_lock(&runqueue_lock);
+
+ p = find_process_by_pid(pid);
+
+ retval = -ESRCH;
+ if (!p)
+- goto out_unlock;
+-
++ goto out_unlock_tasklist;
++
++ /*
++ * To be able to change p->policy safely, the appropriate
++ * runqueue lock must be held.
++ */
++ rq = lock_task_rq(p, &flags);
++
+ if (policy < 0)
+ policy = p->policy;
+ else {
+@@ -945,30 +1136,36 @@
+ goto out_unlock;
+
+ retval = -EPERM;
+- if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
++ if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+ !capable(CAP_SYS_NICE))
+ goto out_unlock;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !capable(CAP_SYS_NICE))
+ goto out_unlock;
+
++ array = p->array;
++ if (array)
++ deactivate_task(p, task_rq(p));
+ retval = 0;
+ p->policy = policy;
+ p->rt_priority = lp.sched_priority;
+- if (task_on_runqueue(p))
+- move_first_runqueue(p);
+-
+- current->need_resched = 1;
++ if (rt_task(p))
++ p->prio = 99 - p->rt_priority;
++ else
++ p->prio = p->static_prio;
++ if (array)
++ activate_task(p, task_rq(p));
+
+ out_unlock:
+- spin_unlock(&runqueue_lock);
++ unlock_task_rq(rq, &flags);
++out_unlock_tasklist:
+ read_unlock_irq(&tasklist_lock);
+
+ out_nounlock:
+ return retval;
+ }
+
+-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
++asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+ struct sched_param *param)
+ {
+ return setscheduler(pid, policy, param);
+@@ -981,7 +1178,7 @@
+
+ asmlinkage long sys_sched_getscheduler(pid_t pid)
+ {
+- struct task_struct *p;
++ task_t *p;
+ int retval;
+
+ retval = -EINVAL;
+@@ -992,7 +1189,7 @@
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ if (p)
+- retval = p->policy & ~SCHED_YIELD;
++ retval = p->policy;
+ read_unlock(&tasklist_lock);
+
+ out_nounlock:
+@@ -1001,7 +1198,7 @@
+
+ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
+ {
+- struct task_struct *p;
++ task_t *p;
+ struct sched_param lp;
+ int retval;
+
+@@ -1032,42 +1229,64 @@
+
+ asmlinkage long sys_sched_yield(void)
+ {
++ task_t *prev = current, *next;
++ runqueue_t *rq = this_rq();
++ prio_array_t *array;
++ list_t *queue;
++
++ if (unlikely(prev->state != TASK_RUNNING)) {
++ schedule();
++ return 0;
++ }
++ release_kernel_lock(prev, smp_processor_id());
++ prev->sleep_timestamp = jiffies;
+ /*
+- * Trick. sched_yield() first counts the number of truly
+- * 'pending' runnable processes, then returns if it's
+- * only the current processes. (This test does not have
+- * to be atomic.) In threaded applications this optimization
+- * gets triggered quite often.
++ * Decrease the yielding task's priority by one, to avoid
++ * livelocks. This priority loss is temporary, it's recovered
++ * once the current timeslice expires.
++ *
++ * If priority is already MAX_PRIO-1 then we still
++ * roundrobin the task within the runlist.
+ */
++ spin_lock_irq(&rq->lock);
++ array = current->array;
++ /*
++ * If the task has reached maximum priority (or is a RT task)
++ * then just requeue the task to the end of the runqueue:
++ */
++ if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
++ list_del(&current->run_list);
++ list_add_tail(&current->run_list, array->queue + current->prio);
++ } else {
++ list_del(&current->run_list);
++ if (list_empty(array->queue + current->prio))
++ __clear_bit(current->prio, array->bitmap);
++ current->prio++;
++ list_add_tail(&current->run_list, array->queue + current->prio);
++ __set_bit(current->prio, array->bitmap);
++ }
++ /*
++ * Context-switch manually. This is equivalent to
++ * calling schedule(), but faster, because yield()
++ * knows lots of things that can be optimized away
++ * from the generic scheduler path:
++ */
++ queue = array->queue + sched_find_first_bit(array->bitmap);
++ next = list_entry(queue->next, task_t, run_list);
++ prefetch(next);
+
+- int nr_pending = nr_running;
+-
+-#if CONFIG_SMP
+- int i;
+-
+- // Subtract non-idle processes running on other CPUs.
+- for (i = 0; i < smp_num_cpus; i++) {
+- int cpu = cpu_logical_map(i);
+- if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
+- nr_pending--;
++ prev->need_resched = 0;
++ if (likely(prev != next)) {
++ rq->nr_switches++;
++ rq->curr = next;
++ context_switch(prev, next);
++ barrier();
++ rq = this_rq();
+ }
+-#else
+- // on UP this process is on the runqueue as well
+- nr_pending--;
+-#endif
+- if (nr_pending) {
+- /*
+- * This process can only be rescheduled by us,
+- * so this is safe without any locking.
+- */
+- if (current->policy == SCHED_OTHER)
+- current->policy |= SCHED_YIELD;
+- current->need_resched = 1;
++ spin_unlock_irq(&rq->lock);
++
++ reacquire_kernel_lock(current);
+
+- spin_lock_irq(&runqueue_lock);
+- move_last_runqueue(current);
+- spin_unlock_irq(&runqueue_lock);
+- }
+ return 0;
+ }
+
+@@ -1105,7 +1324,7 @@
+ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
+ {
+ struct timespec t;
+- struct task_struct *p;
++ task_t *p;
+ int retval = -EINVAL;
+
+ if (pid < 0)
+@@ -1115,8 +1334,8 @@
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ if (p)
+- jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
+- &t);
++ jiffies_to_timespec(p->policy & SCHED_FIFO ?
++ 0 : TASK_TIMESLICE(p), &t);
+ read_unlock(&tasklist_lock);
+ if (p)
+ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+@@ -1124,14 +1343,14 @@
+ return retval;
+ }
+
+-static void show_task(struct task_struct * p)
++static void show_task(task_t * p)
+ {
+ unsigned long free = 0;
+ int state;
+ static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
+
+ printk("%-13.13s ", p->comm);
+- state = p->state ? ffz(~p->state) + 1 : 0;
++ state = p->state ? __ffs(p->state) + 1 : 0;
+ if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
+ printk(stat_nam[state]);
+ else
+@@ -1172,7 +1391,7 @@
+ printk(" (NOTLB)\n");
+
+ {
+- extern void show_trace_task(struct task_struct *tsk);
++ extern void show_trace_task(task_t *tsk);
+ show_trace_task(p);
+ }
+ }
+@@ -1194,7 +1413,7 @@
+
+ void show_state(void)
+ {
+- struct task_struct *p;
++ task_t *p;
+
+ #if (BITS_PER_LONG == 32)
+ printk("\n"
+@@ -1217,121 +1436,88 @@
+ read_unlock(&tasklist_lock);
+ }
+
+-/**
+- * reparent_to_init() - Reparent the calling kernel thread to the init task.
+- *
+- * If a kernel thread is launched as a result of a system call, or if
+- * it ever exits, it should generally reparent itself to init so that
+- * it is correctly cleaned up on exit.
+- *
+- * The various task state such as scheduling policy and priority may have
+- * been inherited fro a user process, so we reset them to sane values here.
+- *
+- * NOTE that reparent_to_init() gives the caller full capabilities.
+- */
+-void reparent_to_init(void)
++static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+ {
+- struct task_struct *this_task = current;
+-
+- write_lock_irq(&tasklist_lock);
+-
+- /* Reparent to init */
+- REMOVE_LINKS(this_task);
+- this_task->p_pptr = child_reaper;
+- this_task->p_opptr = child_reaper;
+- SET_LINKS(this_task);
+-
+- /* Set the exit signal to SIGCHLD so we signal init on exit */
+- this_task->exit_signal = SIGCHLD;
+-
+- /* We also take the runqueue_lock while altering task fields
+- * which affect scheduling decisions */
+- spin_lock(&runqueue_lock);
+-
+- this_task->ptrace = 0;
+- this_task->nice = DEF_NICE;
+- this_task->policy = SCHED_OTHER;
+- /* cpus_allowed? */
+- /* rt_priority? */
+- /* signals? */
+- this_task->cap_effective = CAP_INIT_EFF_SET;
+- this_task->cap_inheritable = CAP_INIT_INH_SET;
+- this_task->cap_permitted = CAP_FULL_SET;
+- this_task->keep_capabilities = 0;
+- memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
+- this_task->user = INIT_USER;
+-
+- spin_unlock(&runqueue_lock);
+- write_unlock_irq(&tasklist_lock);
++ if (rq1 == rq2)
++ spin_lock(&rq1->lock);
++ else {
++ if (rq1 < rq2) {
++ spin_lock(&rq1->lock);
++ spin_lock(&rq2->lock);
++ } else {
++ spin_lock(&rq2->lock);
++ spin_lock(&rq1->lock);
++ }
++ }
+ }
+
+-/*
+- * Put all the gunge required to become a kernel thread without
+- * attached user resources in one place where it belongs.
+- */
+-
+-void daemonize(void)
++static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
+ {
+- struct fs_struct *fs;
+-
+-
+- /*
+- * If we were started as result of loading a module, close all of the
+- * user space pages. We don't need them, and if we didn't close them
+- * they would be locked into memory.
+- */
+- exit_mm(current);
+-
+- current->session = 1;
+- current->pgrp = 1;
+- current->tty = NULL;
+-
+- /* Become as one with the init task */
+-
+- exit_fs(current); /* current->fs->count--; */
+- fs = init_task.fs;
+- current->fs = fs;
+- atomic_inc(&fs->count);
+- exit_files(current);
+- current->files = init_task.files;
+- atomic_inc(&current->files->count);
++ spin_unlock(&rq1->lock);
++ if (rq1 != rq2)
++ spin_unlock(&rq2->lock);
+ }
+
+-extern unsigned long wait_init_idle;
+-
+-void __init init_idle(void)
++void __init init_idle(task_t *idle, int cpu)
+ {
+- struct schedule_data * sched_data;
+- sched_data = &aligned_data[smp_processor_id()].schedule_data;
++ runqueue_t *idle_rq = cpu_rq(cpu), *rq = idle->array->rq;
++ unsigned long flags;
+
+- if (current != &init_task && task_on_runqueue(current)) {
+- printk("UGH! (%d:%d) was on the runqueue, removing.\n",
+- smp_processor_id(), current->pid);
+- del_from_runqueue(current);
+- }
+- sched_data->curr = current;
+- sched_data->last_schedule = get_cycles();
+- clear_bit(current->processor, &wait_init_idle);
++ __save_flags(flags);
++ __cli();
++ double_rq_lock(idle_rq, rq);
++
++ idle_rq->curr = idle_rq->idle = idle;
++ deactivate_task(idle, rq);
++ idle->array = NULL;
++ idle->prio = MAX_PRIO;
++ idle->state = TASK_RUNNING;
++ idle->cpu = cpu;
++ double_rq_unlock(idle_rq, rq);
++ idle->need_resched = 1;
++ __restore_flags(flags);
+ }
+
+-extern void init_timervecs (void);
++extern void init_timervecs(void);
++extern void timer_bh(void);
++extern void tqueue_bh(void);
++extern void immediate_bh(void);
+
+ void __init sched_init(void)
+ {
++ runqueue_t *rq;
++ int i, j, k;
++
++ for (i = 0; i < NR_CPUS; i++) {
++ runqueue_t *rq = cpu_rq(i);
++ prio_array_t *array;
++
++ rq->active = rq->arrays + 0;
++ rq->expired = rq->arrays + 1;
++ spin_lock_init(&rq->lock);
++
++ for (j = 0; j < 2; j++) {
++ array = rq->arrays + j;
++ array->rq = rq;
++ array->lock = &rq->lock;
++ for (k = 0; k < MAX_PRIO; k++) {
++ INIT_LIST_HEAD(array->queue + k);
++ __clear_bit(k, array->bitmap);
++ }
++ // delimiter for bitsearch
++ __set_bit(MAX_PRIO, array->bitmap);
++ }
++ }
+ /*
+ * We have to do a little magic to get the first
+ * process right in SMP mode.
+ */
+- int cpu = smp_processor_id();
+- int nr;
+-
+- init_task.processor = cpu;
+-
+- for(nr = 0; nr < PIDHASH_SZ; nr++)
+- pidhash[nr] = NULL;
++ rq = this_rq();
++ rq->curr = current;
++ rq->idle = current;
++ wake_up_process(current);
+
+ init_timervecs();
+-
+ init_bh(TIMER_BH, timer_bh);
+ init_bh(TQUEUE_BH, tqueue_bh);
+ init_bh(IMMEDIATE_BH, immediate_bh);
+@@ -1340,5 +1526,5 @@
+ * The boot idle thread does lazy MMU switching as well:
+ */
+ atomic_inc(&init_mm.mm_count);
+- enter_lazy_tlb(&init_mm, current, cpu);
++ enter_lazy_tlb(&init_mm, current, smp_processor_id());
+ }
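
For reference, here is a stand-alone user-space sketch (not part of the patch) of the prio_array bookkeeping that sched_init() and sys_sched_yield() above rely on: one counter-plus-bitmap per priority level, with a delimiter bit at MAX_PRIO, searched in constant time. Helper names and the queue representation are illustrative; only MAX_PRIO = 140, the delimiter bit, and the drop-one-level behaviour of yield are taken from the hunks above.

/* Illustrative user-space sketch only -- not part of the patch. */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO        140     /* 0..99 RT, 100..139 nice levels, as above */
#define BITMAP_WORDS    ((MAX_PRIO + 1 + 31) / 32)

struct prio_array {
        unsigned int nr_active;
        unsigned long bitmap[BITMAP_WORDS];
        int queue[MAX_PRIO];    /* stand-in for the per-priority task lists */
};

/* portable stand-in for sched_find_first_bit() (defined later, in mmu_context.h) */
static int find_first_set(unsigned long *b)
{
        int bit;

        for (bit = 0; bit <= MAX_PRIO; bit++)
                if (b[bit / 32] & (1UL << (bit % 32)))
                        return bit;
        return MAX_PRIO;
}

static void enqueue(struct prio_array *array, int prio)
{
        array->queue[prio]++;   /* list_add_tail() in the real code */
        array->bitmap[prio / 32] |= 1UL << (prio % 32);
        array->nr_active++;
}

static void dequeue(struct prio_array *array, int prio)
{
        if (--array->queue[prio] == 0)  /* list_del() plus __clear_bit() */
                array->bitmap[prio / 32] &= ~(1UL << (prio % 32));
        array->nr_active--;
}

int main(void)
{
        struct prio_array array;

        memset(&array, 0, sizeof(array));
        array.bitmap[MAX_PRIO / 32] |= 1UL << (MAX_PRIO % 32);  /* delimiter for bitsearch */

        enqueue(&array, 120);   /* a nice-0 task (static_prio 120, as in INIT_TASK) */
        enqueue(&array, 139);   /* a nice-19 task */

        /* yield: move the caller down one level, as sys_sched_yield() does above */
        dequeue(&array, 120);
        enqueue(&array, 121);

        printf("next runnable priority: %d\n", find_first_set(array.bitmap));  /* 121 */
        return 0;
}

Requeueing the yielder one priority level down, at the tail of that list, is what keeps repeated sched_yield() callers from livelocking against each other, as the comment in sys_sched_yield() notes.
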
+--- linux/kernel/exit.c.orig Tue Feb 5 13:51:53 2002
++++ linux/kernel/exit.c Tue Feb 5 13:52:12 2002
+@@ -27,49 +27,22 @@
+
+ static void release_task(struct task_struct * p)
+ {
+- if (p != current) {
++ if (p == current)
++ BUG();
+ #ifdef CONFIG_SMP
+- /*
+- * Wait to make sure the process isn't on the
+- * runqueue (active on some other CPU still)
+- */
+- for (;;) {
+- task_lock(p);
+- if (!task_has_cpu(p))
+- break;
+- task_unlock(p);
+- do {
+- cpu_relax();
+- barrier();
+- } while (task_has_cpu(p));
+- }
+- task_unlock(p);
++ wait_task_inactive(p);
+ #endif
+- atomic_dec(&p->user->processes);
+- free_uid(p->user);
+- unhash_process(p);
+-
+- release_thread(p);
+- current->cmin_flt += p->min_flt + p->cmin_flt;
+- current->cmaj_flt += p->maj_flt + p->cmaj_flt;
+- current->cnswap += p->nswap + p->cnswap;
+- /*
+- * Potentially available timeslices are retrieved
+- * here - this way the parent does not get penalized
+- * for creating too many processes.
+- *
+- * (this cannot be used to artificially 'generate'
+- * timeslices, because any timeslice recovered here
+- * was given away by the parent in the first place.)
+- */
+- current->counter += p->counter;
+- if (current->counter >= MAX_COUNTER)
+- current->counter = MAX_COUNTER;
+- p->pid = 0;
+- free_task_struct(p);
+- } else {
+- printk("task releasing itself\n");
+- }
++ atomic_dec(&p->user->processes);
++ free_uid(p->user);
++ unhash_process(p);
++
++ release_thread(p);
++ current->cmin_flt += p->min_flt + p->cmin_flt;
++ current->cmaj_flt += p->maj_flt + p->cmaj_flt;
++ current->cnswap += p->nswap + p->cnswap;
++ sched_exit(p);
++ p->pid = 0;
++ free_task_struct(p);
+ }
+
+ /*
+@@ -147,6 +120,79 @@
+ }
+ read_unlock(&tasklist_lock);
+ return retval;
++}
++
++/**
++ * reparent_to_init() - Reparent the calling kernel thread to the init task.
++ *
++ * If a kernel thread is launched as a result of a system call, or if
++ * it ever exits, it should generally reparent itself to init so that
++ * it is correctly cleaned up on exit.
++ *
++ * The various task state such as scheduling policy and priority may have
++ * been inherited from a user process, so we reset them to sane values here.
++ *
++ * NOTE that reparent_to_init() gives the caller full capabilities.
++ */
++void reparent_to_init(void)
++{
++ write_lock_irq(&tasklist_lock);
++
++ /* Reparent to init */
++ REMOVE_LINKS(current);
++ current->p_pptr = child_reaper;
++ current->p_opptr = child_reaper;
++ SET_LINKS(current);
++
++ /* Set the exit signal to SIGCHLD so we signal init on exit */
++ current->exit_signal = SIGCHLD;
++
++ current->ptrace = 0;
++ if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
++ set_user_nice(current, 0);
++ /* cpus_allowed? */
++ /* rt_priority? */
++ /* signals? */
++ current->cap_effective = CAP_INIT_EFF_SET;
++ current->cap_inheritable = CAP_INIT_INH_SET;
++ current->cap_permitted = CAP_FULL_SET;
++ current->keep_capabilities = 0;
++ memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
++ current->user = INIT_USER;
++
++ write_unlock_irq(&tasklist_lock);
++}
++
++/*
++ * Put all the gunge required to become a kernel thread without
++ * attached user resources in one place where it belongs.
++ */
++
++void daemonize(void)
++{
++ struct fs_struct *fs;
++
++
++ /*
++ * If we were started as result of loading a module, close all of the
++ * user space pages. We don't need them, and if we didn't close them
++ * they would be locked into memory.
++ */
++ exit_mm(current);
++
++ current->session = 1;
++ current->pgrp = 1;
++ current->tty = NULL;
++
++ /* Become as one with the init task */
++
++ exit_fs(current); /* current->fs->count--; */
++ fs = init_task.fs;
++ current->fs = fs;
++ atomic_inc(&fs->count);
++ exit_files(current);
++ current->files = init_task.files;
++ atomic_inc(&current->files->count);
+ }
+
+ /*
+--- linux/kernel/capability.c.orig Sat Jun 24 06:06:37 2000
++++ linux/kernel/capability.c Tue Feb 5 13:52:12 2002
+@@ -8,6 +8,8 @@
+ #include <linux/mm.h>
+ #include <asm/uaccess.h>
+
++unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
++
+ kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
+
+ /* Note: never hold tasklist_lock while spinning for this one */
+--- linux/kernel/timer.c.orig Tue Feb 5 13:51:43 2002
++++ linux/kernel/timer.c Tue Feb 5 13:52:12 2002
+@@ -25,6 +25,8 @@
+
+ #include <asm/uaccess.h>
+
++struct kernel_stat kstat;
++
+ /*
+ * Timekeeping variables
+ */
+@@ -582,18 +584,7 @@
+ int cpu = smp_processor_id(), system = user_tick ^ 1;
+
+ update_one_process(p, user_tick, system, cpu);
+- if (p->pid) {
+- if (--p->counter <= 0) {
+- p->counter = 0;
+- p->need_resched = 1;
+- }
+- if (p->nice > 0)
+- kstat.per_cpu_nice[cpu] += user_tick;
+- else
+- kstat.per_cpu_user[cpu] += user_tick;
+- kstat.per_cpu_system[cpu] += system;
+- } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+- kstat.per_cpu_system[cpu] += system;
++ scheduler_tick(user_tick, system);
+ }
+
+ /*
+@@ -794,6 +785,89 @@
+
+ #endif
+
++static void process_timeout(unsigned long __data)
++{
++ wake_up_process((task_t *)__data);
++}
++
++/**
++ * schedule_timeout - sleep until timeout
++ * @timeout: timeout value in jiffies
++ *
++ * Make the current task sleep until @timeout jiffies have
++ * elapsed. The routine will return immediately unless
++ * the current task state has been set (see set_current_state()).
++ *
++ * You can set the task state as follows -
++ *
++ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
++ * pass before the routine returns. The routine will return 0
++ *
++ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
++ * delivered to the current task. In this case the remaining time
++ * in jiffies will be returned, or 0 if the timer expired in time
++ *
++ * The current task state is guaranteed to be TASK_RUNNING when this
++ * routine returns.
++ *
++ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
++ * the CPU away without a bound on the timeout. In this case the return
++ * value will be %MAX_SCHEDULE_TIMEOUT.
++ *
++ * In all cases the return value is guaranteed to be non-negative.
++ */
++signed long schedule_timeout(signed long timeout)
++{
++ struct timer_list timer;
++ unsigned long expire;
++
++ switch (timeout)
++ {
++ case MAX_SCHEDULE_TIMEOUT:
++ /*
++ * These two special cases are useful to be comfortable
++ * in the caller. Nothing more. We could take
++ * MAX_SCHEDULE_TIMEOUT from one of the negative value
++ * but I' d like to return a valid offset (>=0) to allow
++ * the caller to do everything it want with the retval.
++ */
++ schedule();
++ goto out;
++ default:
++ /*
++ * Another bit of PARANOID. Note that the retval will be
++ * 0 since no piece of kernel is supposed to do a check
++ * for a negative retval of schedule_timeout() (since it
++ * should never happens anyway). You just have the printk()
++ * that will tell you if something is gone wrong and where.
++ */
++ if (timeout < 0)
++ {
++ printk(KERN_ERR "schedule_timeout: wrong timeout "
++ "value %lx from %p\n", timeout,
++ __builtin_return_address(0));
++ current->state = TASK_RUNNING;
++ goto out;
++ }
++ }
++
++ expire = timeout + jiffies;
++
++ init_timer(&timer);
++ timer.expires = expire;
++ timer.data = (unsigned long) current;
++ timer.function = process_timeout;
++
++ add_timer(&timer);
++ schedule();
++ del_timer_sync(&timer);
++
++ timeout = expire - jiffies;
++
++ out:
++ return timeout < 0 ? 0 : timeout;
++}
++
+ /* Thread ID - the internal kernel "pid" */
+ asmlinkage long sys_gettid(void)
+ {
+@@ -840,4 +914,3 @@
+ }
+ return 0;
+ }
+-
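
A minimal caller sketch for the schedule_timeout() kernel-doc above (illustrative only, not part of the patch; it assumes an ordinary 2.4 process context and the usual set_current_state() helpers):

/* Illustrative callers only -- not part of the patch. */
#include <linux/sched.h>
#include <linux/kernel.h>

static void example_sleeps(void)
{
        signed long remaining;

        /* sleep for at least two seconds, ignoring signals */
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule_timeout(2 * HZ);

        /* sleep for up to one second, waking early on a signal */
        set_current_state(TASK_INTERRUPTIBLE);
        remaining = schedule_timeout(HZ);
        if (remaining)
                printk(KERN_DEBUG "signal arrived %ld jiffies early\n", remaining);
}
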
+--- linux/kernel/fork.c.orig Tue Feb 5 13:51:53 2002
++++ linux/kernel/fork.c Tue Feb 5 13:52:12 2002
+@@ -28,7 +28,6 @@
+
+ /* The idle threads do not count.. */
+ int nr_threads;
+-int nr_running;
+
+ int max_threads;
+ unsigned long total_forks; /* Handle normal Linux uptimes. */
+@@ -36,6 +35,8 @@
+
+ struct task_struct *pidhash[PIDHASH_SZ];
+
++rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
++
+ void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
+ {
+ unsigned long flags;
+@@ -564,6 +565,7 @@
+ struct pt_regs *regs, unsigned long stack_size)
+ {
+ int retval;
++ unsigned long flags;
+ struct task_struct *p;
+ struct completion vfork;
+
+@@ -619,8 +621,7 @@
+ copy_flags(clone_flags, p);
+ p->pid = get_pid(clone_flags);
+
+- p->run_list.next = NULL;
+- p->run_list.prev = NULL;
++ INIT_LIST_HEAD(&p->run_list);
+
+ p->p_cptr = NULL;
+ init_waitqueue_head(&p->wait_chldexit);
+@@ -646,14 +647,15 @@
+ #ifdef CONFIG_SMP
+ {
+ int i;
+- p->cpus_runnable = ~0UL;
+- p->processor = current->processor;
++
+ /* ?? should we just memset this ?? */
+ for(i = 0; i < smp_num_cpus; i++)
+- p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
++ p->per_cpu_utime[cpu_logical_map(i)] =
++ p->per_cpu_stime[cpu_logical_map(i)] = 0;
+ spin_lock_init(&p->sigmask_lock);
+ }
+ #endif
++ p->array = NULL;
+ p->lock_depth = -1; /* -1 = no lock */
+ p->start_time = jiffies;
+
+@@ -685,15 +687,27 @@
+ p->pdeath_signal = 0;
+
+ /*
+- * "share" dynamic priority between parent and child, thus the
+- * total amount of dynamic priorities in the system doesnt change,
+- * more scheduling fairness. This is only important in the first
+- * timeslice, on the long run the scheduling behaviour is unchanged.
+- */
+- p->counter = (current->counter + 1) >> 1;
+- current->counter >>= 1;
+- if (!current->counter)
+- current->need_resched = 1;
++ * Share the timeslice between parent and child, thus the
++ * total amount of pending timeslices in the system doesn't change,
++ * resulting in more scheduling fairness.
++ */
++ __save_flags(flags);
++ __cli();
++ if (!current->time_slice)
++ BUG();
++ p->time_slice = (current->time_slice + 1) >> 1;
++ current->time_slice >>= 1;
++ if (!current->time_slice) {
++ /*
++ * This case is rare, it happens when the parent has only
++ * a single jiffy left from its timeslice. Taking the
++ * runqueue lock is not a problem.
++ */
++ current->time_slice = 1;
++ scheduler_tick(0,0);
++ }
++ p->sleep_timestamp = jiffies;
++ __restore_flags(flags);
+
+ /*
+ * Ok, add it to the run-queues and make it
+@@ -730,10 +744,23 @@
+ if (p->ptrace & PT_PTRACED)
+ send_sig(SIGSTOP, p, 1);
+
++#define RUN_CHILD_FIRST 1
++#if RUN_CHILD_FIRST
++ wake_up_forked_process(p); /* do this last */
++#else
+ wake_up_process(p); /* do this last */
++#endif
+ ++total_forks;
+ if (clone_flags & CLONE_VFORK)
+ wait_for_completion(&vfork);
++#if RUN_CHILD_FIRST
++ else
++ /*
++ * Let the child process run first, to avoid most of the
++ * COW overhead when the child exec()s afterwards.
++ */
++ current->need_resched = 1;
++#endif
+
+ fork_out:
+ return retval;
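
The timeslice split above can be checked with plain arithmetic; this throwaway user-space snippet (not part of the patch) reproduces the two shift expressions from the hunk with a concrete value:

/* Illustrative arithmetic only -- not part of the patch. */
#include <stdio.h>

int main(void)
{
        unsigned int parent = 7, child;         /* parent has 7 jiffies left */

        child = (parent + 1) >> 1;              /* rounded-up half: 4 */
        parent >>= 1;                           /* parent keeps the rest: 3 */

        printf("child %u + parent %u = 7: no timeslice is created by forking\n",
               child, parent);
        return 0;
}
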
+--- linux/kernel/softirq.c.orig Tue Feb 5 13:51:47 2002
++++ linux/kernel/softirq.c Tue Feb 5 13:52:12 2002
+@@ -259,10 +259,9 @@
+
+ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
+ current->state = TASK_RUNNING;
+- do {
+- current->policy |= SCHED_YIELD;
+- schedule();
+- } while (test_bit(TASKLET_STATE_SCHED, &t->state));
++ do
++ sys_sched_yield();
++ while (test_bit(TASKLET_STATE_SCHED, &t->state));
+ }
+ tasklet_unlock_wait(t);
+ clear_bit(TASKLET_STATE_SCHED, &t->state);
+@@ -365,13 +364,13 @@
+ int cpu = cpu_logical_map(bind_cpu);
+
+ daemonize();
+- current->nice = 19;
++ set_user_nice(current, 19);
+ sigfillset(&current->blocked);
+
+ /* Migrate to the right CPU */
+- current->cpus_allowed = 1UL << cpu;
+- while (smp_processor_id() != cpu)
+- schedule();
++ set_cpus_allowed(current, 1UL << cpu);
++ if (cpu() != cpu)
++ BUG();
+
+ sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
+
+@@ -396,7 +395,7 @@
+ }
+ }
+
+-static __init int spawn_ksoftirqd(void)
++__init int spawn_ksoftirqd(void)
+ {
+ int cpu;
+
+@@ -405,14 +404,12 @@
+ CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
+ printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
+ else {
+- while (!ksoftirqd_task(cpu_logical_map(cpu))) {
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
++ while (!ksoftirqd_task(cpu_logical_map(cpu)))
++ sys_sched_yield();
+ }
+ }
+
+ return 0;
+ }
+
+-__initcall(spawn_ksoftirqd);
++__initcall(spawn_ksoftirqd);
+--- linux/kernel/ptrace.c.orig Tue Feb 5 13:51:53 2002
++++ linux/kernel/ptrace.c Tue Feb 5 13:52:12 2002
+@@ -31,20 +31,7 @@
+ if (child->state != TASK_STOPPED)
+ return -ESRCH;
+ #ifdef CONFIG_SMP
+- /* Make sure the child gets off its CPU.. */
+- for (;;) {
+- task_lock(child);
+- if (!task_has_cpu(child))
+- break;
+- task_unlock(child);
+- do {
+- if (child->state != TASK_STOPPED)
+- return -ESRCH;
+- barrier();
+- cpu_relax();
+- } while (task_has_cpu(child));
+- }
+- task_unlock(child);
++ wait_task_inactive(child);
+ #endif
+ }
+
+--- linux/kernel/sys.c.orig Tue Feb 5 13:51:53 2002
++++ linux/kernel/sys.c Tue Feb 5 13:52:12 2002
+@@ -220,10 +220,10 @@
+ }
+ if (error == -ESRCH)
+ error = 0;
+- if (niceval < p->nice && !capable(CAP_SYS_NICE))
++ if (niceval < task_nice(p) && !capable(CAP_SYS_NICE))
+ error = -EACCES;
+ else
+- p->nice = niceval;
++ set_user_nice(p, niceval);
+ }
+ read_unlock(&tasklist_lock);
+
+@@ -249,7 +249,7 @@
+ long niceval;
+ if (!proc_sel(p, which, who))
+ continue;
+- niceval = 20 - p->nice;
++ niceval = 20 - task_nice(p);
+ if (niceval > retval)
+ retval = niceval;
+ }
+--- linux/kernel/signal.c.orig Tue Feb 5 13:51:49 2002
++++ linux/kernel/signal.c Tue Feb 5 13:52:12 2002
+@@ -478,12 +478,9 @@
+ * process of changing - but no harm is done by that
+ * other than doing an extra (lightweight) IPI interrupt.
+ */
+- spin_lock(&runqueue_lock);
+- if (task_has_cpu(t) && t->processor != smp_processor_id())
+- smp_send_reschedule(t->processor);
+- spin_unlock(&runqueue_lock);
+-#endif /* CONFIG_SMP */
+-
++ if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
++ kick_if_running(t);
++#endif
+ if (t->state & TASK_INTERRUPTIBLE) {
+ wake_up_process(t);
+ return;
+--- linux/kernel/printk.c.orig Tue Feb 5 13:51:53 2002
++++ linux/kernel/printk.c Tue Feb 5 13:52:12 2002
+@@ -26,6 +26,7 @@
+ #include <linux/module.h>
+ #include <linux/interrupt.h> /* For in_interrupt() */
+ #include <linux/config.h>
++#include <linux/delay.h>
+
+ #include <asm/uaccess.h>
+
+--- linux/kernel/ksyms.c.orig Tue Feb 5 13:51:53 2002
++++ linux/kernel/ksyms.c Tue Feb 5 13:52:12 2002
+@@ -437,6 +437,9 @@
+ EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+ EXPORT_SYMBOL(schedule);
+ EXPORT_SYMBOL(schedule_timeout);
++EXPORT_SYMBOL(sys_sched_yield);
++EXPORT_SYMBOL(set_user_nice);
++EXPORT_SYMBOL(set_cpus_allowed);
+ EXPORT_SYMBOL(jiffies);
+ EXPORT_SYMBOL(xtime);
+ EXPORT_SYMBOL(do_gettimeofday);
+@@ -448,6 +451,7 @@
+
+ EXPORT_SYMBOL(kstat);
+ EXPORT_SYMBOL(nr_running);
++EXPORT_SYMBOL(nr_context_switches);
+
+ /* misc */
+ EXPORT_SYMBOL(panic);
+--- linux/mm/oom_kill.c.orig Tue Feb 5 13:51:47 2002
++++ linux/mm/oom_kill.c Tue Feb 5 13:52:12 2002
+@@ -82,7 +82,7 @@
+ * Niced processes are most likely less important, so double
+ * their badness points.
+ */
+- if (p->nice > 0)
++ if (task_nice(p) > 0)
+ points *= 2;
+
+ /*
+@@ -149,7 +149,7 @@
+ * all the memory it needs. That way it should be able to
+ * exit() and clear out its resources quickly...
+ */
+- p->counter = 5 * HZ;
++ p->time_slice = HZ;
+ p->flags |= PF_MEMALLOC | PF_MEMDIE;
+
+ /* This process has hardware access, be more careful. */
+@@ -188,8 +188,7 @@
+ * killing itself before someone else gets the chance to ask
+ * for more memory.
+ */
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ return;
+ }
+
+--- linux/mm/page_alloc.c.orig Tue Feb 5 13:51:53 2002
++++ linux/mm/page_alloc.c Tue Feb 5 13:52:12 2002
+@@ -400,9 +400,8 @@
+ return NULL;
+
+ /* Yield for kswapd, and try again */
+- current->policy |= SCHED_YIELD;
+ __set_current_state(TASK_RUNNING);
+- schedule();
++ yield();
+ goto rebalance;
+ }
+
+--- linux/mm/highmem.c.orig Tue Feb 5 13:51:51 2002
++++ linux/mm/highmem.c Tue Feb 5 13:52:12 2002
+@@ -354,9 +354,8 @@
+ /* we need to wait I/O completion */
+ run_task_queue(&tq_disk);
+
+- current->policy |= SCHED_YIELD;
+ __set_current_state(TASK_RUNNING);
+- schedule();
++ yield();
+ goto repeat_alloc;
+ }
+
+@@ -392,9 +391,8 @@
+ /* we need to wait I/O completion */
+ run_task_queue(&tq_disk);
+
+- current->policy |= SCHED_YIELD;
+ __set_current_state(TASK_RUNNING);
+- schedule();
++ yield();
+ goto repeat_alloc;
+ }
+
+--- linux/include/linux/sched.h.orig Tue Feb 5 13:51:51 2002
++++ linux/include/linux/sched.h Tue Feb 5 13:52:12 2002
+@@ -6,6 +6,7 @@
+ extern unsigned long event;
+
+ #include <linux/config.h>
++#include <linux/compiler.h>
+ #include <linux/binfmts.h>
+ #include <linux/threads.h>
+ #include <linux/kernel.h>
+@@ -42,6 +43,7 @@
+ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
+ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
+ #define CLONE_THREAD 0x00010000 /* Same thread group? */
++#define CLONE_NEWNS 0x00020000 /* New namespace group? */
+
+ #define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD)
+
+@@ -72,8 +74,9 @@
+ #define CT_TO_SECS(x) ((x) / HZ)
+ #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
+
+-extern int nr_running, nr_threads;
++extern int nr_threads;
+ extern int last_pid;
++extern unsigned long nr_running(void);
+
+ #include <linux/fs.h>
+ #include <linux/time.h>
+@@ -116,12 +119,6 @@
+ #define SCHED_FIFO 1
+ #define SCHED_RR 2
+
+-/*
+- * This is an additional bit set when we want to
+- * yield the CPU for one re-schedule..
+- */
+-#define SCHED_YIELD 0x10
+-
+ struct sched_param {
+ int sched_priority;
+ };
+@@ -139,17 +136,22 @@
+ * a separate lock).
+ */
+ extern rwlock_t tasklist_lock;
+-extern spinlock_t runqueue_lock;
+ extern spinlock_t mmlist_lock;
+
++typedef struct task_struct task_t;
++
+ extern void sched_init(void);
+-extern void init_idle(void);
++extern void init_idle(task_t *idle, int cpu);
+ extern void show_state(void);
+ extern void cpu_init (void);
+ extern void trap_init(void);
+ extern void update_process_times(int user);
+-extern void update_one_process(struct task_struct *p, unsigned long user,
++extern void update_one_process(task_t *p, unsigned long user,
+ unsigned long system, int cpu);
++extern void scheduler_tick(int user_tick, int system);
++extern void sched_task_migrated(task_t *p);
++extern void smp_migrate_task(int cpu, task_t *task);
++extern unsigned long cache_decay_ticks;
+
+ #define MAX_SCHEDULE_TIMEOUT LONG_MAX
+ extern signed long FASTCALL(schedule_timeout(signed long timeout));
+@@ -166,6 +168,7 @@
+ */
+ #define NR_OPEN_DEFAULT BITS_PER_LONG
+
++struct namespace;
+ /*
+ * Open file table structure
+ */
+@@ -278,6 +281,8 @@
+ extern struct user_struct root_user;
+ #define INIT_USER (&root_user)
+
++typedef struct prio_array prio_array_t;
++
+ struct task_struct {
+ /*
+ * offsets of these are hardcoded elsewhere - touch with care
+@@ -295,35 +300,26 @@
+
+ int lock_depth; /* Lock depth */
+
+-/*
+- * offset 32 begins here on 32-bit platforms. We keep
+- * all fields in a single cacheline that are needed for
+- * the goodness() loop in schedule().
+- */
+- long counter;
+- long nice;
+- unsigned long policy;
+- struct mm_struct *mm;
+- int processor;
+ /*
+- * cpus_runnable is ~0 if the process is not running on any
+- * CPU. It's (1 << cpu) if it's running on a CPU. This mask
+- * is updated under the runqueue lock.
+- *
+- * To determine whether a process might run on a CPU, this
+- * mask is AND-ed with cpus_allowed.
++ * offset 32 begins here on 32-bit platforms.
+ */
+- unsigned long cpus_runnable, cpus_allowed;
+- /*
+- * (only the 'next' pointer fits into the cacheline, but
+- * that's just fine.)
+- */
+- struct list_head run_list;
+- unsigned long sleep_time;
++ unsigned int cpu;
++ int prio, static_prio;
++ list_t run_list;
++ prio_array_t *array;
++
++ unsigned long sleep_avg;
++ unsigned long sleep_timestamp;
++
++ unsigned long policy;
++ unsigned long cpus_allowed;
++ unsigned int time_slice;
++
++ task_t *next_task, *prev_task;
+
+- struct task_struct *next_task, *prev_task;
+- struct mm_struct *active_mm;
++ struct mm_struct *mm, *active_mm;
+ struct list_head local_pages;
++
+ unsigned int allocation_order, nr_local_pages;
+
+ /* task state */
+@@ -345,12 +341,12 @@
+ * older sibling, respectively. (p->father can be replaced with
+ * p->p_pptr->pid)
+ */
+- struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
++ task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
+ struct list_head thread_group;
+
+ /* PID hash table linkage. */
+- struct task_struct *pidhash_next;
+- struct task_struct **pidhash_pprev;
++ task_t *pidhash_next;
++ task_t **pidhash_pprev;
+
+ wait_queue_head_t wait_chldexit; /* for wait4() */
+ struct completion *vfork_done; /* for vfork() */
+@@ -389,6 +385,8 @@
+ struct fs_struct *fs;
+ /* open file information */
+ struct files_struct *files;
++/* namespace */
++ struct namespace *namespace;
+ /* signal handlers */
+ spinlock_t sigmask_lock; /* Protects signal and blocked */
+ struct signal_struct *sig;
+@@ -446,10 +444,13 @@
+ */
+ #define _STK_LIM (8*1024*1024)
+
+-#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */
+-#define MAX_COUNTER (20*HZ/100)
+-#define DEF_NICE (0)
++extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
++extern void set_user_nice(task_t *p, long nice);
++extern int task_prio(task_t *p);
++extern int task_nice(task_t *p);
+
++asmlinkage long sys_sched_yield(void);
++#define yield() sys_sched_yield()
+
+ /*
+ * The default (Linux) execution domain.
+@@ -468,14 +469,14 @@
+ addr_limit: KERNEL_DS, \
+ exec_domain: &default_exec_domain, \
+ lock_depth: -1, \
+- counter: DEF_COUNTER, \
+- nice: DEF_NICE, \
++ prio: 120, \
++ static_prio: 120, \
+ policy: SCHED_OTHER, \
++ cpus_allowed: -1, \
+ mm: NULL, \
+ active_mm: &init_mm, \
+- cpus_runnable: -1, \
+- cpus_allowed: -1, \
+ run_list: LIST_HEAD_INIT(tsk.run_list), \
++ time_slice: HZ, \
+ next_task: &tsk, \
+ prev_task: &tsk, \
+ p_opptr: &tsk, \
+@@ -509,24 +510,24 @@
+ #endif
+
+ union task_union {
+- struct task_struct task;
++ task_t task;
+ unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
+ };
+
+ extern union task_union init_task_union;
+
+ extern struct mm_struct init_mm;
+-extern struct task_struct *init_tasks[NR_CPUS];
++extern task_t *init_tasks[NR_CPUS];
+
+ /* PID hashing. (shouldnt this be dynamic?) */
+ #define PIDHASH_SZ (4096 >> 2)
+-extern struct task_struct *pidhash[PIDHASH_SZ];
++extern task_t *pidhash[PIDHASH_SZ];
+
+ #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
+
+-static inline void hash_pid(struct task_struct *p)
++static inline void hash_pid(task_t *p)
+ {
+- struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
++ task_t **htable = &pidhash[pid_hashfn(p->pid)];
+
+ if((p->pidhash_next = *htable) != NULL)
+ (*htable)->pidhash_pprev = &p->pidhash_next;
+@@ -534,16 +535,16 @@
+ p->pidhash_pprev = htable;
+ }
+
+-static inline void unhash_pid(struct task_struct *p)
++static inline void unhash_pid(task_t *p)
+ {
+ if(p->pidhash_next)
+ p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
+ *p->pidhash_pprev = p->pidhash_next;
+ }
+
+-static inline struct task_struct *find_task_by_pid(int pid)
++static inline task_t *find_task_by_pid(int pid)
+ {
+- struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
++ task_t *p, **htable = &pidhash[pid_hashfn(pid)];
+
+ for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
+ ;
+@@ -551,19 +552,6 @@
+ return p;
+ }
+
+-#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
+-
+-static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
+-{
+- tsk->processor = cpu;
+- tsk->cpus_runnable = 1UL << cpu;
+-}
+-
+-static inline void task_release_cpu(struct task_struct *tsk)
+-{
+- tsk->cpus_runnable = ~0UL;
+-}
+-
+ /* per-UID process charging. */
+ extern struct user_struct * alloc_uid(uid_t);
+ extern void free_uid(struct user_struct *);
+@@ -590,7 +578,9 @@
+ extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
+ extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
+ signed long timeout));
+-extern int FASTCALL(wake_up_process(struct task_struct * tsk));
++extern int FASTCALL(wake_up_process(task_t * tsk));
++extern void FASTCALL(wake_up_forked_process(task_t * tsk));
++extern void FASTCALL(sched_exit(task_t * p));
+
+ #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+ #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+@@ -608,28 +598,28 @@
+ extern int in_egroup_p(gid_t);
+
+ extern void proc_caches_init(void);
+-extern void flush_signals(struct task_struct *);
+-extern void flush_signal_handlers(struct task_struct *);
++extern void flush_signals(task_t *);
++extern void flush_signal_handlers(task_t *);
+ extern int dequeue_signal(sigset_t *, siginfo_t *);
+ extern void block_all_signals(int (*notifier)(void *priv), void *priv,
+ sigset_t *mask);
+ extern void unblock_all_signals(void);
+-extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+-extern int force_sig_info(int, struct siginfo *, struct task_struct *);
++extern int send_sig_info(int, struct siginfo *, task_t *);
++extern int force_sig_info(int, struct siginfo *, task_t *);
+ extern int kill_pg_info(int, struct siginfo *, pid_t);
+ extern int kill_sl_info(int, struct siginfo *, pid_t);
+ extern int kill_proc_info(int, struct siginfo *, pid_t);
+-extern void notify_parent(struct task_struct *, int);
+-extern void do_notify_parent(struct task_struct *, int);
+-extern void force_sig(int, struct task_struct *);
+-extern int send_sig(int, struct task_struct *, int);
++extern void notify_parent(task_t *, int);
++extern void do_notify_parent(task_t *, int);
++extern void force_sig(int, task_t *);
++extern int send_sig(int, task_t *, int);
+ extern int kill_pg(pid_t, int, int);
+ extern int kill_sl(pid_t, int, int);
+ extern int kill_proc(pid_t, int, int);
+ extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
+ extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
+
+-static inline int signal_pending(struct task_struct *p)
++static inline int signal_pending(task_t *p)
+ {
+ return (p->sigpending != 0);
+ }
+@@ -668,7 +658,7 @@
+ This is required every time the blocked sigset_t changes.
+ All callers should have t->sigmask_lock. */
+
+-static inline void recalc_sigpending(struct task_struct *t)
++static inline void recalc_sigpending(task_t *t)
+ {
+ t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
+ }
+@@ -775,16 +765,17 @@
+ extern int expand_fdset(struct files_struct *, int nr);
+ extern void free_fdset(fd_set *, int);
+
+-extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
++extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *);
+ extern void flush_thread(void);
+ extern void exit_thread(void);
+
+-extern void exit_mm(struct task_struct *);
+-extern void exit_files(struct task_struct *);
+-extern void exit_sighand(struct task_struct *);
++extern void exit_mm(task_t *);
++extern void exit_files(task_t *);
++extern void exit_sighand(task_t *);
+
+ extern void reparent_to_init(void);
+ extern void daemonize(void);
++extern task_t *child_reaper;
+
+ extern int do_execve(char *, char **, char **, struct pt_regs *);
+ extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
+@@ -793,6 +784,9 @@
+ extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
+ extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+
++extern void wait_task_inactive(task_t * p);
++extern void kick_if_running(task_t * p);
++
+ #define __wait_event(wq, condition) \
+ do { \
+ wait_queue_t __wait; \
+@@ -871,24 +865,10 @@
+ for (p = &init_task ; (p = p->next_task) != &init_task ; )
+
+ #define next_thread(p) \
+- list_entry((p)->thread_group.next, struct task_struct, thread_group)
+-
+-static inline void del_from_runqueue(struct task_struct * p)
+-{
+- nr_running--;
+- p->sleep_time = jiffies;
+- list_del(&p->run_list);
+- p->run_list.next = NULL;
+-}
+-
+-static inline int task_on_runqueue(struct task_struct *p)
+-{
+- return (p->run_list.next != NULL);
+-}
++ list_entry((p)->thread_group.next, task_t, thread_group)
+
+-static inline void unhash_process(struct task_struct *p)
++static inline void unhash_process(task_t *p)
+ {
+- if (task_on_runqueue(p)) BUG();
+ write_lock_irq(&tasklist_lock);
+ nr_threads--;
+ unhash_pid(p);
+@@ -898,12 +878,12 @@
+ }
+
+ /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
+-static inline void task_lock(struct task_struct *p)
++static inline void task_lock(task_t *p)
+ {
+ spin_lock(&p->alloc_lock);
+ }
+
+-static inline void task_unlock(struct task_struct *p)
++static inline void task_unlock(task_t *p)
+ {
+ spin_unlock(&p->alloc_lock);
+ }
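
Taken together, the helpers now declared in sched.h (set_cpus_allowed(), set_user_nice(), task_nice(), yield()) give kernel threads a small, uniform API; a hypothetical thread body using them might look like the sketch below. This is illustrative only: work_pending() and do_work() are placeholders, not real kernel functions.

/* Illustrative kernel-thread body -- not part of the patch. */
#include <linux/sched.h>

extern int work_pending(void);          /* placeholder, not a kernel API */
extern void do_work(void);              /* placeholder, not a kernel API */

static int my_worker(void *arg)
{
        daemonize();                            /* shed user-space resources */
        set_user_nice(current, 19);             /* lowest priority, as ksoftirqd does */
        set_cpus_allowed(current, 1UL << 0);    /* pin to CPU 0 */

        for (;;) {
                while (!work_pending())
                        yield();                /* expands to sys_sched_yield() */
                do_work();
        }
        return 0;
}
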
+--- linux/include/linux/list.h.orig Tue Feb 5 13:51:51 2002
++++ linux/include/linux/list.h Tue Feb 5 13:52:12 2002
+@@ -19,6 +19,8 @@
+ struct list_head *next, *prev;
+ };
+
++typedef struct list_head list_t;
++
+ #define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+ #define LIST_HEAD(name) \
+--- linux/include/linux/kernel_stat.h.orig Tue Aug 21 14:26:23 2001
++++ linux/include/linux/kernel_stat.h Tue Feb 5 13:52:12 2002
+@@ -32,10 +32,11 @@
+ unsigned int ipackets, opackets;
+ unsigned int ierrors, oerrors;
+ unsigned int collisions;
+- unsigned int context_swtch;
+ };
+
+ extern struct kernel_stat kstat;
++
++extern unsigned long nr_context_switches(void);
+
+ #if !defined(CONFIG_ARCH_S390)
+ /*
+--- linux/include/linux/smp.h.orig Sun Dec 31 20:10:17 2000
++++ linux/include/linux/smp.h Tue Feb 5 13:52:12 2002
+@@ -86,6 +86,14 @@
+ #define cpu_number_map(cpu) 0
+ #define smp_call_function(func,info,retry,wait) ({ 0; })
+ #define cpu_online_map 1
++static inline void smp_send_reschedule(int cpu) { }
++static inline void smp_send_reschedule_all(void) { }
+
+ #endif
++
++/*
++ * Common definitions:
++ */
++#define cpu() smp_processor_id()
++
+ #endif
+--- linux/include/asm-i386/smp.h.orig Tue Feb 5 13:51:51 2002
++++ linux/include/asm-i386/smp.h Tue Feb 5 13:52:12 2002
+@@ -63,6 +63,7 @@
+ extern void smp_flush_tlb(void);
+ extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
+ extern void smp_send_reschedule(int cpu);
++extern void smp_send_reschedule_all(void);
+ extern void smp_invalidate_rcv(void); /* Process an NMI */
+ extern void (*mtrr_hook) (void);
+ extern void zap_low_mappings (void);
+@@ -104,7 +105,7 @@
+ * so this is correct in the x86 case.
+ */
+
+-#define smp_processor_id() (current->processor)
++#define smp_processor_id() (current->cpu)
+
+ static __inline int hard_smp_processor_id(void)
+ {
+@@ -121,18 +122,6 @@
+ #endif /* !__ASSEMBLY__ */
+
+ #define NO_PROC_ID 0xFF /* No processor magic marker */
+-
+-/*
+- * This magic constant controls our willingness to transfer
+- * a process across CPUs. Such a transfer incurs misses on the L1
+- * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
+- * gut feeling is this will vary by board in value. For a board
+- * with separate L2 cache it probably depends also on the RSS, and
+- * for a board with shared L2 cache it ought to decay fast as other
+- * processes are run.
+- */
+-
+-#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
+
+ #endif
+ #endif
+--- linux/include/asm-i386/bitops.h.orig Tue Aug 21 14:26:16 2001
++++ linux/include/asm-i386/bitops.h Tue Feb 5 13:52:12 2002
+@@ -75,6 +75,14 @@
+ :"=m" (ADDR)
+ :"Ir" (nr));
+ }
++
++static __inline__ void __clear_bit(int nr, volatile void * addr)
++{
++ __asm__ __volatile__(
++ "btrl %1,%0"
++ :"=m" (ADDR)
++ :"Ir" (nr));
++}
+ #define smp_mb__before_clear_bit() barrier()
+ #define smp_mb__after_clear_bit() barrier()
+
+@@ -284,6 +292,34 @@
+ }
+
+ /**
++ * find_first_bit - find the first set bit in a memory region
++ * @addr: The address to start the search at
++ * @size: The maximum size to search
++ *
++ * Returns the bit-number of the first set bit, not the number of the byte
++ * containing a bit.
++ */
++static __inline__ int find_first_bit(void * addr, unsigned size)
++{
++ int d0, d1;
++ int res;
++
++ /* This looks at memory. Mark it volatile to tell gcc not to move it around */
++ __asm__ __volatile__(
++ "xorl %%eax,%%eax\n\t"
++ "repe; scasl\n\t"
++ "jz 1f\n\t"
++ "leal -4(%%edi),%%edi\n\t"
++ "bsfl (%%edi),%%eax\n"
++ "1:\tsubl %%ebx,%%edi\n\t"
++ "shll $3,%%edi\n\t"
++ "addl %%edi,%%eax"
++ :"=a" (res), "=&c" (d0), "=&D" (d1)
++ :"1" ((size + 31) >> 5), "2" (addr), "b" (addr));
++ return res;
++}
++
++/**
+ * find_next_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+@@ -296,7 +332,7 @@
+
+ if (bit) {
+ /*
+- * Look for zero in first byte
++ * Look for zero in the first 32 bits.
+ */
+ __asm__("bsfl %1,%0\n\t"
+ "jne 1f\n\t"
+@@ -317,6 +353,39 @@
+ }
+
+ /**
++ * find_next_bit - find the first set bit in a memory region
++ * @addr: The address to base the search on
++ * @offset: The bitnumber to start searching at
++ * @size: The maximum size to search
++ */
++static __inline__ int find_next_bit (void * addr, int size, int offset)
++{
++ unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
++ int set = 0, bit = offset & 31, res;
++
++ if (bit) {
++ /*
++ * Look for nonzero in the first 32 bits:
++ */
++ __asm__("bsfl %1,%0\n\t"
++ "jne 1f\n\t"
++ "movl $32, %0\n"
++ "1:"
++ : "=r" (set)
++ : "r" (*p >> bit));
++ if (set < (32 - bit))
++ return set + offset;
++ set = 32 - bit;
++ p++;
++ }
++ /*
++ * No set bit yet, search remaining full words for a bit
++ */
++ res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
++ return (offset + set + res);
++}
++
++/**
+ * ffz - find first zero in word.
+ * @word: The word to search
+ *
+@@ -327,6 +396,20 @@
+ __asm__("bsfl %1,%0"
+ :"=r" (word)
+ :"r" (~word));
++ return word;
++}
++
++/**
++ * __ffs - find first bit in word.
++ * @word: The word to search
++ *
++ * Undefined if no bit exists, so code should check against 0 first.
++ */
++static __inline__ unsigned long __ffs(unsigned long word)
++{
++ __asm__("bsfl %1,%0"
++ :"=r" (word)
++ :"rm" (word));
+ return word;
+ }
+
+--- linux/include/asm-i386/pgalloc.h.orig Tue Feb 5 13:51:51 2002
++++ linux/include/asm-i386/pgalloc.h Tue Feb 5 13:52:12 2002
+@@ -224,6 +224,7 @@
+ {
+ struct mm_struct *active_mm;
+ int state;
++ char __cacheline_padding[24];
+ };
+ extern struct tlb_state cpu_tlbstate[NR_CPUS];
+
+--- linux/include/asm-i386/mmu_context.h.orig Tue Aug 21 14:26:23 2001
++++ linux/include/asm-i386/mmu_context.h Tue Feb 5 13:52:12 2002
+@@ -7,6 +7,25 @@
+ #include <asm/pgalloc.h>
+
+ /*
++ * Every architecture must define this function. It's the fastest
++ * way of searching a 140-bit bitmap where the first 100 bits are
++ * unlikely to be set. It's guaranteed that at least one of the 140
++ * bits is cleared.
++ */
++static inline int sched_find_first_bit(unsigned long *b)
++{
++ if (unlikely(b[0]))
++ return __ffs(b[0]);
++ if (unlikely(b[1]))
++ return __ffs(b[1]) + 32;
++ if (unlikely(b[2]))
++ return __ffs(b[2]) + 64;
++ if (b[3])
++ return __ffs(b[3]) + 96;
++ return __ffs(b[4]) + 128;
++}
++
++/*
+ * possibly do the LDT unload here?
+ */
+ #define destroy_context(mm) do { } while(0)
+@@ -27,13 +46,13 @@
+
+ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
+ {
+- if (prev != next) {
++ if (likely(prev != next)) {
+ /* stop flush ipis for the previous mm */
+ clear_bit(cpu, &prev->cpu_vm_mask);
+ /*
+ * Re-load LDT if necessary
+ */
+- if (prev->context.segments != next->context.segments)
++ if (unlikely(prev->context.segments != next->context.segments))
+ load_LDT(next);
+ #ifdef CONFIG_SMP
+ cpu_tlbstate[cpu].state = TLBSTATE_OK;
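
A user-space rendering of the search function above (illustrative only, not part of the patch): unlikely() is dropped, __ffs() is replaced by a portable loop, and 32-bit longs are assumed as on i386.

/* Stand-alone sketch of sched_find_first_bit() above -- illustrative only. */
#include <stdio.h>

static inline int my_ffs(unsigned long word)    /* stand-in for __ffs(); word must be nonzero */
{
        int bit = 0;

        while (!(word & 1)) {
                word >>= 1;
                bit++;
        }
        return bit;
}

static inline int sched_find_first_bit(unsigned long *b)
{
        if (b[0])
                return my_ffs(b[0]);
        if (b[1])
                return my_ffs(b[1]) + 32;
        if (b[2])
                return my_ffs(b[2]) + 64;
        if (b[3])
                return my_ffs(b[3]) + 96;
        return my_ffs(b[4]) + 128;
}

int main(void)
{
        /* one runnable task at priority 121, plus the delimiter bit at 140 */
        unsigned long bitmap[5] = { 0, 0, 0, 1UL << (121 - 96), 1UL << (140 - 128) };

        printf("%d\n", sched_find_first_bit(bitmap));   /* prints 121 */
        return 0;
}
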
+--- linux/include/asm-i386/hw_irq.h.orig Tue Feb 5 13:51:40 2002
++++ linux/include/asm-i386/hw_irq.h Tue Feb 5 13:52:12 2002
+@@ -41,7 +41,8 @@
+ #define ERROR_APIC_VECTOR 0xfe
+ #define INVALIDATE_TLB_VECTOR 0xfd
+ #define RESCHEDULE_VECTOR 0xfc
+-#define CALL_FUNCTION_VECTOR 0xfb
++#define TASK_MIGRATION_VECTOR 0xfb
++#define CALL_FUNCTION_VECTOR 0xfa
+
+ /*
+ * Local APIC timer IRQ vector is on a different priority level,
+--- linux/include/asm-i386/apic.h.orig Tue Feb 5 13:51:43 2002
++++ linux/include/asm-i386/apic.h Tue Feb 5 13:52:12 2002
+@@ -79,6 +79,8 @@
+ extern void setup_apic_nmi_watchdog (void);
+ extern inline void nmi_watchdog_tick (struct pt_regs * regs);
+ extern int APIC_init_uniprocessor (void);
++extern void disable_APIC_timer(void);
++extern void enable_APIC_timer(void);
+
+ extern struct pm_dev *apic_pm_register(pm_dev_t, unsigned long, pm_callback);
+ extern void apic_pm_unregister(struct pm_dev*);
+--- linux/net/unix/af_unix.c.orig Tue Feb 5 13:51:53 2002
++++ linux/net/unix/af_unix.c Tue Feb 5 13:52:12 2002
+@@ -565,10 +565,8 @@
+ addr->hash)) {
+ write_unlock(&unix_table_lock);
+ /* Sanity yield. It is unusual case, but yet... */
+- if (!(ordernum&0xFF)) {
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
++ if (!(ordernum&0xFF))
++ yield();
+ goto retry;
+ }
+ addr->hash ^= sk->type;
+--- linux/net/ipv4/tcp_output.c.orig Tue Feb 5 13:51:51 2002
++++ linux/net/ipv4/tcp_output.c Tue Feb 5 13:52:12 2002
+@@ -1009,8 +1009,7 @@
+ skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+ if (skb)
+ break;
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+--- linux/net/sunrpc/sched.c.orig Tue Feb 5 13:51:53 2002
++++ linux/net/sunrpc/sched.c Tue Feb 5 13:52:12 2002
+@@ -773,8 +773,7 @@
+ }
+ if (flags & RPC_TASK_ASYNC)
+ return NULL;
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ } while (!signalled());
+
+ return NULL;
+@@ -1115,8 +1114,7 @@
+ __rpc_schedule();
+ if (all_tasks) {
+ dprintk("rpciod_killall: waiting for tasks to exit\n");
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ }
+ }
+
+@@ -1186,8 +1184,7 @@
+ * wait briefly before checking the process id.
+ */
+ current->sigpending = 0;
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ /*
+ * Display a message if we're going to wait longer.
+ */
+--- linux/net/sched/sch_generic.c.orig Fri Aug 18 19:26:25 2000
++++ linux/net/sched/sch_generic.c Tue Feb 5 13:52:12 2002
+@@ -475,10 +475,8 @@
+
+ dev_watchdog_down(dev);
+
+- while (test_bit(__LINK_STATE_SCHED, &dev->state)) {
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
++ while (test_bit(__LINK_STATE_SCHED, &dev->state))
++ yield();
+
+ spin_unlock_wait(&dev->xmit_lock);
+ }
+--- linux/net/socket.c.orig Tue Feb 5 13:51:51 2002
++++ linux/net/socket.c Tue Feb 5 13:52:12 2002
+@@ -148,8 +148,7 @@
+ while (atomic_read(&net_family_lockct) != 0) {
+ spin_unlock(&net_family_lock);
+
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+
+ spin_lock(&net_family_lock);
+ }
+--- linux/drivers/net/slip.c.orig Tue Feb 5 13:51:52 2002
++++ linux/drivers/net/slip.c Tue Feb 5 13:52:12 2002
+@@ -1393,10 +1393,8 @@
+ /* First of all: check for active disciplines and hangup them.
+ */
+ do {
+- if (busy) {
+- current->counter = 0;
+- schedule();
+- }
++ if (busy)
++ sys_sched_yield();
+
+ busy = 0;
+ local_bh_disable();
+--- linux/drivers/block/loop.c.orig Tue Feb 5 13:51:50 2002
++++ linux/drivers/block/loop.c Tue Feb 5 13:52:12 2002
+@@ -570,9 +570,6 @@
+ flush_signals(current);
+ spin_unlock_irq(&current->sigmask_lock);
+
+- current->policy = SCHED_OTHER;
+- current->nice = -20;
+-
+ spin_lock_irq(&lo->lo_lock);
+ lo->lo_state = Lo_bound;
+ atomic_inc(&lo->lo_pending);
+--- linux/drivers/char/mwave/mwavedd.c.orig Tue Feb 5 13:51:44 2002
++++ linux/drivers/char/mwave/mwavedd.c Tue Feb 5 13:52:12 2002
+@@ -279,7 +279,6 @@
+ pDrvData->IPCs[ipcnum].bIsHere = FALSE;
+ pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
+ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
+- current->nice = -20; /* boost to provide priority timing */
+ #else
+ current->priority = 0x28; /* boost to provide priority timing */
+ #endif
+--- linux/drivers/char/drm-4.0/ffb_drv.c.orig Tue Feb 5 13:51:51 2002
++++ linux/drivers/char/drm-4.0/ffb_drv.c Tue Feb 5 13:52:12 2002
+@@ -710,8 +710,7 @@
+ /* Contention */
+ atomic_inc(&dev->total_sleeps);
+ current->state = TASK_INTERRUPTIBLE;
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+--- linux/drivers/char/drm-4.0/tdfx_drv.c.orig Tue Feb 5 13:51:52 2002
++++ linux/drivers/char/drm-4.0/tdfx_drv.c Tue Feb 5 13:52:12 2002
+@@ -554,7 +554,6 @@
+ lock.context, current->pid, j,
+ dev->lock.lock_time, jiffies);
+ current->state = TASK_INTERRUPTIBLE;
+- current->policy |= SCHED_YIELD;
+ schedule_timeout(DRM_LOCK_SLICE-j);
+ DRM_DEBUG("jiffies=%d\n", jiffies);
+ }
+@@ -578,10 +577,7 @@
+
+ /* Contention */
+ atomic_inc(&dev->total_sleeps);
+-#if 1
+- current->policy |= SCHED_YIELD;
+-#endif
+- schedule();
++ yield();
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+@@ -604,8 +600,7 @@
+ when dev->last_context == lock.context
+ NOTE WE HOLD THE LOCK THROUGHOUT THIS
+ TIME! */
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&dev->context_wait, &entry);
+ if (signal_pending(current)) {
+--- linux/drivers/ide/ataraid.c.orig Tue Feb 5 13:51:46 2002
++++ linux/drivers/ide/ataraid.c Tue Feb 5 13:52:12 2002
+@@ -123,8 +123,7 @@
+ ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO);
+ if (!ptr) {
+ __set_current_state(TASK_RUNNING);
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ }
+ }
+ return ptr;
+@@ -139,8 +138,7 @@
+ ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO);
+ if (!ptr) {
+ __set_current_state(TASK_RUNNING);
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ }
+ }
+ return ptr;
+--- linux/drivers/md/md.c.orig Tue Feb 5 13:51:52 2002
++++ linux/drivers/md/md.c Tue Feb 5 13:52:12 2002
+@@ -2936,8 +2936,6 @@
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+- current->policy = SCHED_OTHER;
+- current->nice = -20;
+ md_unlock_kernel();
+
+ complete(thread->event);
+@@ -3387,11 +3385,6 @@
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+- /*
+- * Resync has low priority.
+- */
+- current->nice = 19;
+-
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+@@ -3469,16 +3462,13 @@
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+- current->nice = 19;
+-
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ md_schedule_timeout(HZ/4);
+ goto repeat;
+ }
+- } else
+- current->nice = -20;
++ }
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ err = 0;
+--- linux/arch/i386/mm/fault.c.orig Tue Feb 5 13:51:51 2002
++++ linux/arch/i386/mm/fault.c Tue Feb 5 13:52:12 2002
+@@ -86,8 +86,7 @@
+
+ out_of_memory:
+ if (current->pid == 1) {
+- current->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ goto survive;
+ }
+ goto bad_area;
+@@ -342,8 +341,7 @@
+ out_of_memory:
+ up_read(&mm->mmap_sem);
+ if (tsk->pid == 1) {
+- tsk->policy |= SCHED_YIELD;
+- schedule();
++ yield();
+ down_read(&mm->mmap_sem);
+ goto survive;
+ }
+--- linux/arch/i386/kernel/smpboot.c.orig Tue Feb 5 13:51:49 2002
++++ linux/arch/i386/kernel/smpboot.c Tue Feb 5 13:52:12 2002
+@@ -308,14 +308,14 @@
+ if (tsc_values[i] < avg)
+ realdelta = -realdelta;
+
+- printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
+- i, realdelta);
++ printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
+ }
+
+ sum += delta;
+ }
+ if (!buggy)
+ printk("passed.\n");
++ ;
+ }
+
+ static void __init synchronize_tsc_ap (void)
+@@ -365,7 +365,7 @@
+ * (This works even if the APIC is not enabled.)
+ */
+ phys_id = GET_APIC_ID(apic_read(APIC_ID));
+- cpuid = current->processor;
++ cpuid = cpu();
+ if (test_and_set_bit(cpuid, &cpu_online_map)) {
+ printk("huh, phys CPU#%d, CPU#%d already present??\n",
+ phys_id, cpuid);
+@@ -435,6 +435,7 @@
+ */
+ smp_store_cpu_info(cpuid);
+
++ disable_APIC_timer();
+ /*
+ * Allow the master to continue.
+ */
+@@ -465,6 +466,7 @@
+ smp_callin();
+ while (!atomic_read(&smp_commenced))
+ rep_nop();
++ enable_APIC_timer();
+ /*
+ * low-memory mappings have been cleared, flush them from
+ * the local TLBs too.
+@@ -803,16 +805,13 @@
+ if (!idle)
+ panic("No idle process for CPU %d", cpu);
+
+- idle->processor = cpu;
+- idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
++ init_idle(idle, cpu);
+
+ map_cpu_to_boot_apicid(cpu, apicid);
+
+ idle->thread.eip = (unsigned long) start_secondary;
+
+- del_from_runqueue(idle);
+ unhash_process(idle);
+- init_tasks[cpu] = idle;
+
+ /* start_eip had better be page-aligned! */
+ start_eip = setup_trampoline();
+@@ -925,6 +924,7 @@
+ }
+
+ cycles_t cacheflush_time;
++unsigned long cache_decay_ticks;
+
+ static void smp_tune_scheduling (void)
+ {
+@@ -958,9 +958,13 @@
+ cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
+ }
+
++ cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
++
+ printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
+ (long)cacheflush_time/(cpu_khz/1000),
+ ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
++ printk("task migration cache decay timeout: %ld msecs.\n",
++ (cache_decay_ticks + 1) * 1000 / HZ);
+ }
+
+ /*
+@@ -1020,8 +1024,7 @@
+ map_cpu_to_boot_apicid(0, boot_cpu_apicid);
+
+ global_irq_holder = 0;
+- current->processor = 0;
+- init_idle();
++ current->cpu = 0;
+ smp_tune_scheduling();
+
+ /*
+--- linux/arch/i386/kernel/process.c.orig Tue Feb 5 13:51:51 2002
++++ linux/arch/i386/kernel/process.c Tue Feb 5 13:52:12 2002
+@@ -123,15 +123,12 @@
+ void cpu_idle (void)
+ {
+ /* endless idle loop with no priority at all */
+- init_idle();
+- current->nice = 20;
+- current->counter = -100;
+
+ while (1) {
+ void (*idle)(void) = pm_idle;
+ if (!idle)
+ idle = default_idle;
+- while (!current->need_resched)
++ if (!current->need_resched)
+ idle();
+ schedule();
+ check_pgt_cache();
+@@ -694,15 +691,17 @@
+ asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
+
+ /*
+- * Restore %fs and %gs.
++ * Restore %fs and %gs if needed.
+ */
+- loadsegment(fs, next->fs);
+- loadsegment(gs, next->gs);
++ if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
++ loadsegment(fs, next->fs);
++ loadsegment(gs, next->gs);
++ }
+
+ /*
+ * Now maybe reload the debug registers
+ */
+- if (next->debugreg[7]){
++ if (unlikely(next->debugreg[7])) {
+ loaddebug(next, 0);
+ loaddebug(next, 1);
+ loaddebug(next, 2);
+@@ -712,7 +711,7 @@
+ loaddebug(next, 7);
+ }
+
+- if (prev->ioperm || next->ioperm) {
++ if (unlikely(prev->ioperm || next->ioperm)) {
+ if (next->ioperm) {
+ /*
+ * 4 cachelines copy ... not good, but not that
+--- linux/arch/i386/kernel/apic.c.orig Tue Feb 5 13:51:51 2002
++++ linux/arch/i386/kernel/apic.c Tue Feb 5 13:52:12 2002
+@@ -796,8 +796,7 @@
+ */
+
+ slice = clocks / (smp_num_cpus+1);
+- printk("cpu: %d, clocks: %d, slice: %d\n",
+- smp_processor_id(), clocks, slice);
++ printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice);
+
+ /*
+ * Wait for IRQ0's slice:
+@@ -820,8 +819,7 @@
+
+ __setup_APIC_LVTT(clocks);
+
+- printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n",
+- smp_processor_id(), t0, t1, delta, slice, clocks);
++ printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", smp_processor_id(), t0, t1, delta, slice, clocks);
+
+ __restore_flags(flags);
+ }
+@@ -922,6 +920,26 @@
+
+ /* and update all other cpus */
+ smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1);
++}
++
++void __init disable_APIC_timer(void)
++{
++ if (using_apic_timer) {
++ unsigned long v;
++
++ v = apic_read(APIC_LVTT);
++ apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
++ }
++}
++
++void enable_APIC_timer(void)
++{
++ if (using_apic_timer) {
++ unsigned long v;
++
++ v = apic_read(APIC_LVTT);
++ apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
++ }
+ }
+
+ /*
+--- linux/arch/i386/kernel/nmi.c.orig Tue Feb 5 13:51:36 2002
++++ linux/arch/i386/kernel/nmi.c Tue Feb 5 13:52:12 2002
+@@ -283,7 +283,7 @@
+ * to get a message out.
+ */
+ bust_spinlocks(1);
+- printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
++ printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
+ show_registers(regs);
+ printk("console shuts up ...\n");
+ console_silent();
+--- linux/arch/i386/kernel/smp.c.orig Tue Feb 5 13:51:49 2002
++++ linux/arch/i386/kernel/smp.c Tue Feb 5 13:52:12 2002
+@@ -105,7 +105,7 @@
+ /* The 'big kernel lock' */
+ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+
+-struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }};
++struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }};
+
+ /*
+ * the following functions deal with sending IPIs between CPUs.
+@@ -485,15 +485,54 @@
+ do_flush_tlb_all_local();
+ }
+
++static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED;
++static task_t *new_task;
++
++/*
++ * This function sends a 'task migration' IPI to another CPU.
++ * Must be called from syscall contexts, with interrupts *enabled*.
++ */
++void smp_migrate_task(int cpu, task_t *p)
++{
++ /*
++ * The target CPU will unlock the migration spinlock:
++ */
++ spin_lock(&migration_lock);
++ new_task = p;
++ send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR);
++}
++
++/*
++ * Task migration callback.
++ */
++asmlinkage void smp_task_migration_interrupt(void)
++{
++ task_t *p;
++
++ ack_APIC_irq();
++ p = new_task;
++ spin_unlock(&migration_lock);
++ sched_task_migrated(p);
++}
+ /*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+-
+ void smp_send_reschedule(int cpu)
+ {
+ send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
++}
++
++/*
++ * this function sends a reschedule IPI to all (other) CPUs.
++ * This should only be used if some 'global' task became runnable,
++ * such as an RT task that must be handled now. The first CPU
++ * that manages to grab the task will run it.
++ */
++void smp_send_reschedule_all(void)
++{
++ send_IPI_allbutself(RESCHEDULE_VECTOR);
+ }
+
+ /*
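
The new smp_migrate_task()/smp_task_migration_interrupt() pair is a single-slot handoff: the sender
takes migration_lock, publishes the task pointer in new_task, sends the IPI, and the target CPU
releases the lock only after copying the pointer out, which is what serializes back-to-back
migrations. The stand-alone sketch below models the same protocol with two threads standing in for
the CPUs and an atomic flag standing in for both the spinlock and the IPI; apart from the names
migration_lock and new_task, everything here is invented for illustration.

/* User-space model of the migration handoff above (build with -pthread). */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag migration_lock = ATOMIC_FLAG_INIT;
static int new_task;			/* the single shared slot */
static atomic_int ipi_pending;		/* stand-in for the IPI */

static void *target_cpu(void *arg)
{
	(void)arg;
	while (!atomic_load(&ipi_pending))
		;				/* "wait for the interrupt" */
	int task = new_task;			/* copy the payload out... */
	atomic_flag_clear(&migration_lock);	/* ...then unlock for the next sender */
	printf("target: pulled task %d\n", task);
	return NULL;
}

int main(void)
{
	pthread_t target;

	pthread_create(&target, NULL, target_cpu, NULL);

	while (atomic_flag_test_and_set(&migration_lock))
		;				/* spin_lock(&migration_lock) */
	new_task = 42;				/* publish the "task pointer" */
	atomic_store(&ipi_pending, 1);		/* send_IPI_mask(...) */

	/* A second migration would spin here until the target unlocked the slot. */
	while (atomic_flag_test_and_set(&migration_lock))
		;
	atomic_flag_clear(&migration_lock);
	printf("sender: slot is free again\n");

	pthread_join(target, NULL);
	return 0;
}
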
+--- linux/arch/i386/kernel/i8259.c.orig Tue Feb 5 13:51:36 2002
++++ linux/arch/i386/kernel/i8259.c Tue Feb 5 13:52:12 2002
+@@ -79,6 +79,7 @@
+ * through the ICC by us (IPIs)
+ */
+ #ifdef CONFIG_SMP
++BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR)
+ BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
+ BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
+ BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
+@@ -472,6 +473,9 @@
+ * IPI, driven by wakeup.
+ */
+ set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
++
++ /* IPI for task migration */
++ set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt);
+
+ /* IPI for invalidation */
+ set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+--- linux/arch/i386/kernel/entry.S.orig Tue Feb 5 13:51:51 2002
++++ linux/arch/i386/kernel/entry.S Tue Feb 5 13:52:12 2002
+@@ -77,7 +77,7 @@
+ exec_domain = 16
+ need_resched = 20
+ tsk_ptrace = 24
+-processor = 52
++cpu = 32
+
+ ENOSYS = 38
+
+@@ -176,9 +176,11 @@
+
+
+ ENTRY(ret_from_fork)
++#if CONFIG_SMP
+ pushl %ebx
+ call SYMBOL_NAME(schedule_tail)
+ addl $4, %esp
++#endif
+ GET_CURRENT(%ebx)
+ testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
+ jne tracesys_exit
+--- linux/arch/i386/kernel/setup.c.orig Tue Feb 5 13:51:51 2002
++++ linux/arch/i386/kernel/setup.c Tue Feb 5 13:52:12 2002
+@@ -2924,9 +2924,10 @@
+ load_TR(nr);
+ load_LDT(&init_mm);
+
+- /*
+- * Clear all 6 debug registers:
+- */
++ /* Clear %fs and %gs. */
++ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
++
++ /* Clear all 6 debug registers: */
+
+ #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
+