sched-O1-2.4.17-J9.patch

   1 --- linux/fs/proc/proc_misc.c.orig      Sun Jan  6 13:55:55 2002
   2 +++ linux/fs/proc/proc_misc.c   Sun Jan  6 13:56:25 2002
   3 @@ -85,11 +85,11 @@
   4         a = avenrun[0] + (FIXED_1/200);
   5         b = avenrun[1] + (FIXED_1/200);
   6         c = avenrun[2] + (FIXED_1/200);
   7 -       len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
   8 +       len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
   9                 LOAD_INT(a), LOAD_FRAC(a),
  10                 LOAD_INT(b), LOAD_FRAC(b),
  11                 LOAD_INT(c), LOAD_FRAC(c),
  12 -               nr_running, nr_threads, last_pid);
  13 +               nr_running(), nr_threads, last_pid);
  14         return proc_calc_metrics(page, start, off, count, eof, len);
  15  }
  16
  17 @@ -101,7 +101,7 @@
  18         int len;
  19
  20         uptime = jiffies;
  21 -       idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
  22 +       idle = init_task.times.tms_utime + init_task.times.tms_stime;
  23
  24         /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
  25            that would overflow about every five days at HZ == 100.
  26 @@ -303,10 +303,10 @@
  27         }
  28
  29         len += sprintf(page + len,
  30 -               "\nctxt %u\n"
  31 +               "\nctxt %lu\n"
  32                 "btime %lu\n"
  33                 "processes %lu\n",
  34 -               kstat.context_swtch,
  35 +               nr_context_switches(),
  36                 xtime.tv_sec - jif / HZ,
  37                 total_forks);
  38
  39 --- linux/fs/proc/array.c.orig  Sun Jan  6 13:55:51 2002
  40 +++ linux/fs/proc/array.c       Mon Jan  7 20:01:05 2002
  41 @@ -335,9 +335,12 @@
  42
  43         /* scale priority and nice values from timeslices to -20..20 */
  44         /* to make it look like a "normal" Unix priority/nice value  */
  45 -       priority = task->counter;
  46 -       priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
  47 -       nice = task->nice;
  48 +       priority = task->prio;
  49 +       if (priority >= MAX_RT_PRIO)
  50 +               priority -= MAX_RT_PRIO;
  51 +       else
  52 +               priority = priority-100;
  53 +       nice = task->__nice;
  54
  55         read_lock(&tasklist_lock);
  56         ppid = task->pid ? task->p_opptr->pid : 0;
  57 @@ -387,7 +390,7 @@
  58                 task->nswap,
  59                 task->cnswap,
  60                 task->exit_signal,
  61 -               task->processor);
  62 +               task->cpu);
  63         if(mm)
  64                 mmput(mm);
  65         return res;
  66 --- linux/fs/nfs/pagelist.c.orig        Sun Jan  6 13:55:57 2002
  67 +++ linux/fs/nfs/pagelist.c     Sun Jan  6 13:56:25 2002
  68 @@ -96,8 +96,7 @@
  69                         continue;
  70                 if (signalled() && (server->flags & NFS_MOUNT_INTR))
  71                         return ERR_PTR(-ERESTARTSYS);
  72 -               current->policy = SCHED_YIELD;
  73 -               schedule();
  74 +               yield();
  75         }
  76
  77         /* Initialize the request struct. Initially, we assume a
  78 --- linux/fs/ufs/truncate.c.orig        Sun Jan  6 13:55:55 2002
  79 +++ linux/fs/ufs/truncate.c     Sun Jan  6 13:56:25 2002
  80 @@ -448,10 +448,7 @@
  81                 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
  82                         ufs_sync_inode (inode);
  83                 run_task_queue(&tq_disk);
  84 -               current->policy |= SCHED_YIELD;
  85 -               schedule ();
  86 -
  87 -
  88 +               yield();
  89         }
  90         offset = inode->i_size & uspi->s_fshift;
  91         if (offset) {
  92 --- linux/fs/reiserfs/buffer2.c.orig    Sun Jan  6 13:55:57 2002
  93 +++ linux/fs/reiserfs/buffer2.c Sun Jan  6 13:56:25 2002
  94 @@ -33,8 +33,7 @@
  95                         buffer_journal_dirty(bh) ? ' ' : '!');
  96      }
  97      run_task_queue(&tq_disk);
  98 -    current->policy |= SCHED_YIELD;
  99 -    schedule();
 100 +    yield();
 101    }
 102    if (repeat_counter > 30000000) {
 103      reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ;
 104 @@ -52,11 +51,11 @@
 105  struct buffer_head  * reiserfs_bread (struct super_block *super, int n_block, int n_size)
 106  {
 107      struct buffer_head  *result;
 108 -    PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
 109 +    PROC_EXP( unsigned int ctx_switches = nr_context_switches(); );
 110
 111      result = bread (super -> s_dev, n_block, n_size);
 112      PROC_INFO_INC( super, breads );
 113 -    PROC_EXP( if( kstat.context_swtch != ctx_switches )
 114 +    PROC_EXP( if( nr_context_switches() != ctx_switches )
 115               PROC_INFO_INC( super, bread_miss ) );
 116      return result;
 117  }
 118 --- linux/fs/reiserfs/journal.c.orig    Sun Jan  6 13:55:57 2002
 119 +++ linux/fs/reiserfs/journal.c Sun Jan  6 13:56:25 2002
 120 @@ -149,8 +149,7 @@
 121    }
 122    bn = allocate_bitmap_node(p_s_sb) ;
 123    if (!bn) {
 124 -    current->policy |= SCHED_YIELD ;
 125 -    schedule() ;
 126 +    yield();
 127      goto repeat ;
 128    }
 129    return bn ;
 130 --- linux/fs/jffs2/background.c.orig    Sun Jan  6 13:55:53 2002
 131 +++ linux/fs/jffs2/background.c Sun Jan  6 13:56:25 2002
 132 @@ -106,9 +106,6 @@
 133
 134          sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
 135
 136 -       /* FIXME in the 2.2 backport */
 137 -       current->nice = 10;
 138 -
 139         for (;;) {
 140                 spin_lock_irq(&current->sigmask_lock);
 141                 siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
 142 --- linux/fs/jbd/journal.c.orig Sun Jan  6 13:55:57 2002
 143 +++ linux/fs/jbd/journal.c      Sun Jan  6 13:56:25 2002
 144 @@ -460,8 +460,7 @@
 145                         printk (KERN_NOTICE __FUNCTION__
 146                                 ": ENOMEM at get_unused_buffer_head, "
 147                                 "trying again.\n");
 148 -                       current->policy |= SCHED_YIELD;
 149 -                       schedule();
 150 +                       yield();
 151                 }
 152         } while (!new_bh);
 153         /* keep subsequent assertions sane */
 154 @@ -1539,8 +1538,7 @@
 155                         last_warning = jiffies;
 156                 }
 157
 158 -               current->policy |= SCHED_YIELD;
 159 -               schedule();
 160 +               yield();
 161         }
 162  }
 163
 164 @@ -1598,8 +1596,7 @@
 165                         last_warning = jiffies;
 166                 }
 167                 while (ret == 0) {
 168 -                       current->policy |= SCHED_YIELD;
 169 -                       schedule();
 170 +                       yield();
 171                         ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
 172                 }
 173         }
 174 --- linux/fs/jbd/revoke.c.orig  Sun Jan  6 13:55:57 2002
 175 +++ linux/fs/jbd/revoke.c       Sun Jan  6 13:56:25 2002
 176 @@ -137,8 +137,7 @@
 177         if (!journal_oom_retry)
 178                 return -ENOMEM;
 179         jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
 180 -       current->policy |= SCHED_YIELD;
 181 -       schedule();
 182 +       yield();
 183         goto repeat;
 184  }
 185
 186 --- linux/fs/jbd/transaction.c.orig     Sun Jan  6 13:55:57 2002
 187 +++ linux/fs/jbd/transaction.c  Sun Jan  6 13:56:25 2002
 188 @@ -1377,8 +1377,7 @@
 189                 do {
 190                         old_handle_count = transaction->t_handle_count;
 191                         set_current_state(TASK_RUNNING);
 192 -                       current->policy |= SCHED_YIELD;
 193 -                       schedule();
 194 +                       yield();
 195                 } while (old_handle_count != transaction->t_handle_count);
 196         }
 197
 198 --- linux/fs/binfmt_elf.c.orig  Sun Jan  6 13:55:57 2002
 199 +++ linux/fs/binfmt_elf.c       Sun Jan  6 13:56:25 2002
 200 @@ -1143,7 +1143,7 @@
 201         psinfo.pr_state = i;
 202         psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
 203         psinfo.pr_zomb = psinfo.pr_sname == 'Z';
 204 -       psinfo.pr_nice = current->nice;
 205 +       psinfo.pr_nice = current->__nice;
 206         psinfo.pr_flag = current->flags;
 207         psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
 208         psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
 209 --- linux/fs/buffer.c.orig      Sun Jan  6 13:55:57 2002
 210 +++ linux/fs/buffer.c   Fri Jan 25 14:25:56 2002
 211 @@ -725,9 +725,8 @@
 212         wakeup_bdflush();
 213         try_to_free_pages(zone, GFP_NOFS, 0);
 214         run_task_queue(&tq_disk);
 215 -       current->policy |= SCHED_YIELD;
 216         __set_current_state(TASK_RUNNING);
 217 -       schedule();
 218 +       sys_sched_yield();
 219  }
 220
 221  void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 222 --- linux/fs/locks.c.orig       Sun Jan  6 13:55:51 2002
 223 +++ linux/fs/locks.c    Sun Jan  6 13:56:25 2002
 224 @@ -445,8 +445,7 @@
 225                         /* Let the blocked process remove waiter from the
 226                          * block list when it gets scheduled.
 227                          */
 228 -                       current->policy |= SCHED_YIELD;
 229 -                       schedule();
 230 +                       yield();
 231                 } else {
 232                         /* Remove waiter from the block list, because by the
 233                          * time it wakes up blocker won't exist any more.
 234 --- linux/init/main.c.orig      Sun Jan  6 13:55:57 2002
 235 +++ linux/init/main.c   Mon Jan 28 18:12:51 2002
 236 @@ -482,8 +482,6 @@
 237  extern void setup_arch(char **);
 238  extern void cpu_idle(void);
 239
 240 -unsigned long wait_init_idle;
 241 -
 242  #ifndef CONFIG_SMP
 243
 244  #ifdef CONFIG_X86_LOCAL_APIC
 245 @@ -492,34 +490,24 @@
 246         APIC_init_uniprocessor();
 247  }
 248  #else
 249 -#define smp_init()     do { } while (0)
 250 +#define smp_init()      do { } while (0)
 251  #endif
 252
 253  #else
 254
 255 -
 256  /* Called by boot processor to activate the rest. */
 257  static void __init smp_init(void)
 258  {
 259         /* Get other processors into their bootup holding patterns. */
 260         smp_boot_cpus();
 261 -       wait_init_idle = cpu_online_map;
 262 -       clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
 263
 264         smp_threads_ready=1;
 265         smp_commence();
 266 -
 267 -       /* Wait for the other cpus to set up their idle processes */
 268 -       printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
 269 -       while (wait_init_idle) {
 270 -               cpu_relax();
 271 -               barrier();
 272 -       }
 273 -       printk("All processors have done init_idle\n");
 274  }
 275
 276  #endif
 277
 278 +
 279  /*
 280   * We need to finalize in a non-__init function or else race conditions
 281   * between the root thread and the init thread may cause start_kernel to
 282 @@ -531,9 +519,8 @@
 283  {
 284         kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 285         unlock_kernel();
 286 -       current->need_resched = 1;
 287 -       cpu_idle();
 288 -}
 289 +       cpu_idle();
 290 +}
 291
 292  /*
 293   *     Activate the first processor.
 294 @@ -611,14 +598,18 @@
 295         ipc_init();
 296  #endif
 297         check_bugs();
 298 +
 299         printk("POSIX conformance testing by UNIFIX\n");
 300
 301 -       /*
 302 -        *      We count on the initial thread going ok
 303 -        *      Like idlers init is an unlocked kernel thread, which will
 304 -        *      make syscalls (and thus be locked).
 305 +       init_idle(current, smp_processor_id());
 306 +       /*
 307 +        *      We count on the initial thread going ok
 308 +        *      Like idlers init is an unlocked kernel thread, which will
 309 +        *      make syscalls (and thus be locked).
 310          */
 311         smp_init();
 312 +
 313 +       /* Do the rest non-__init'ed, we're now alive */
 314         rest_init();
 315  }
 316
 317 @@ -779,12 +770,9 @@
 318                 int i, pid;
 319
 320                 pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
 321 -               if (pid > 0) {
 322 -                       while (pid != wait(&i)) {
 323 -                               current->policy |= SCHED_YIELD;
 324 -                               schedule();
 325 -                       }
 326 -               }
 327 +               if (pid > 0)
 328 +                       while (pid != wait(&i))
 329 +                               yield();
 330                 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
 331                      || MINOR(real_root_dev) != 0) {
 332                         error = change_root(real_root_dev,"/initrd");
 333 --- linux/kernel/sched.c.orig   Sun Jan  6 13:55:57 2002
 334 +++ linux/kernel/sched.c        Mon Jan 28 18:41:54 2002
 335 @@ -12,333 +12,249 @@
 336   *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
 337   */
 338
 339 -/*
 340 - * 'sched.c' is the main kernel file. It contains scheduling primitives
 341 - * (sleep_on, wakeup, schedule etc) as well as a number of simple system
 342 - * call functions (type getpid()), which just extract a field from
 343 - * current-task
 344 - */
 345 -
 346 -#include <linux/config.h>
 347  #include <linux/mm.h>
 348 +#include <linux/nmi.h>
 349  #include <linux/init.h>
 350 +#include <asm/uaccess.h>
 351  #include <linux/smp_lock.h>
 352 -#include <linux/nmi.h>
 353  #include <linux/interrupt.h>
 354 -#include <linux/kernel_stat.h>
 355 -#include <linux/completion.h>
 356 -#include <linux/prefetch.h>
 357 -#include <linux/compiler.h>
 358 -
 359 -#include <asm/uaccess.h>
 360  #include <asm/mmu_context.h>
 361
 362 -extern void timer_bh(void);
 363 -extern void tqueue_bh(void);
 364 -extern void immediate_bh(void);
 365 -
 366 -/*
 367 - * scheduler variables
 368 - */
 369 +#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))
 370
 371 -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
 372 +typedef struct runqueue runqueue_t;
 373
 374 -extern void mem_use(void);
 375 +struct prio_array {
 376 +       int nr_active;
 377 +       spinlock_t *lock;
 378 +       runqueue_t *rq;
 379 +       unsigned long bitmap[BITMAP_SIZE];
 380 +       list_t queue[MAX_PRIO];
 381 +};
 382
 383  /*
 384 - * Scheduling quanta.
 385 + * This is the main, per-CPU runqueue data structure.
 386   *
 387 - * NOTE! The unix "nice" value influences how long a process
 388 - * gets. The nice value ranges from -20 to +19, where a -20
 389 - * is a "high-priority" task, and a "+10" is a low-priority
 390 - * task.
 391 - *
 392 - * We want the time-slice to be around 50ms or so, so this
 393 - * calculation depends on the value of HZ.
  394 + * Locking rule: in those places that want to lock multiple runqueues
 395 + * (such as the load balancing or the process migration code), lock
 396 + * acquire operations must be ordered by ascending &runqueue.
 397   */
 398 -#if HZ < 200
 399 -#define TICK_SCALE(x)  ((x) >> 2)
 400 -#elif HZ < 400
 401 -#define TICK_SCALE(x)  ((x) >> 1)
 402 -#elif HZ < 800
 403 -#define TICK_SCALE(x)  (x)
 404 -#elif HZ < 1600
 405 -#define TICK_SCALE(x)  ((x) << 1)
 406 -#else
 407 -#define TICK_SCALE(x)  ((x) << 2)
 408 -#endif
 409 +struct runqueue {
 410 +       spinlock_t lock;
 411 +       unsigned long nr_running, nr_switches, expired_timestamp;
 412 +       task_t *curr, *idle;
 413 +       prio_array_t *active, *expired, arrays[2];
 414 +       int prev_nr_running[NR_CPUS];
 415 +} ____cacheline_aligned;
 416
 417 -#define NICE_TO_TICKS(nice)    (TICK_SCALE(20-(nice))+1)
 418 +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
 419
 420 +#define cpu_rq(cpu)            (runqueues + (cpu))
 421 +#define this_rq()              cpu_rq(smp_processor_id())
 422 +#define task_rq(p)             cpu_rq((p)->cpu)
 423 +#define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 424 +#define rt_task(p)             ((p)->policy != SCHED_OTHER)
 425 +
 426 +
 427 +static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags)
 428 +{
 429 +       struct runqueue *__rq;
 430 +
 431 +repeat_lock_task:
 432 +       __rq = task_rq(p);
 433 +       spin_lock_irqsave(&__rq->lock, *flags);
 434 +       if (unlikely(__rq != task_rq(p))) {
 435 +               spin_unlock_irqrestore(&__rq->lock, *flags);
 436 +               goto repeat_lock_task;
 437 +       }
 438 +       return __rq;
 439 +}
 440
 441 +static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags)
 442 +{
 443 +       spin_unlock_irqrestore(&rq->lock, *flags);
 444 +}
 445  /*
 446 - *     Init task must be ok at boot for the ix86 as we will check its signals
 447 - *     via the SMP irq return path.
 448 + * Adding/removing a task to/from a priority array:
 449   */
 450 -
 451 -struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
 452 +static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
 453 +{
 454 +       array->nr_active--;
 455 +       list_del_init(&p->run_list);
 456 +       if (list_empty(array->queue + p->prio))
 457 +               __clear_bit(p->prio, array->bitmap);
 458 +}
 459 +
 460 +static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
 461 +{
 462 +       list_add_tail(&p->run_list, array->queue + p->prio);
 463 +       __set_bit(p->prio, array->bitmap);
 464 +       array->nr_active++;
 465 +       p->array = array;
 466 +}
 467
 468  /*
 469 - * The tasklist_lock protects the linked list of processes.
 470 + * A task is 'heavily interactive' if it either has reached the
 471 + * bottom 25% of the SCHED_OTHER priority range, or if it is below
 472 + * its default priority by at least 3 priority levels. In this
 473 + * case we favor it by reinserting it on the active array,
 474 + * even after it expired its current timeslice.
 475   *
 476 - * The runqueue_lock locks the parts that actually access
 477 - * and change the run-queues, and have to be interrupt-safe.
 478 + * A task is a 'CPU hog' if it's either in the upper 25% of the
  479 + * SCHED_OTHER priority range, or if it's not an interactive task.
 480   *
 481 - * If both locks are to be concurrently held, the runqueue_lock
 482 - * nests inside the tasklist_lock.
 483 + * A task can get a priority bonus by being 'somewhat
 484 + * interactive' - and it will get a priority penalty for
 485 + * being a CPU hog.
 486   *
 487 - * task->alloc_lock nests inside tasklist_lock.
 488 - */
 489 -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 490 -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
 491 -
 492 -static LIST_HEAD(runqueue_head);
 493 -
 494 -/*
 495 - * We align per-CPU scheduling data on cacheline boundaries,
 496 - * to prevent cacheline ping-pong.
 497   */
 498 -static union {
 499 -       struct schedule_data {
 500 -               struct task_struct * curr;
 501 -               cycles_t last_schedule;
 502 -       } schedule_data;
 503 -       char __pad [SMP_CACHE_BYTES];
 504 -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
 505 -
 506 -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
 507 -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
 508 -
 509 -struct kernel_stat kstat;
 510 -extern struct task_struct *child_reaper;
 511 -
 512 -#ifdef CONFIG_SMP
 513 -
 514 -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
 515 -#define can_schedule(p,cpu) \
 516 -       ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
 517 -
 518 -#else
 519 -
 520 -#define idle_task(cpu) (&init_task)
 521 -#define can_schedule(p,cpu) (1)
 522 -
 523 -#endif
 524
 525 -void scheduling_functions_start_here(void) { }
 526 +#define PRIO_INTERACTIVE \
 527 +               (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_INTERACTIVE_RATIO/100)
 528 +#define PRIO_CPU_HOG \
 529 +               (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_CPU_HOG_RATIO/100)
 530 +
 531 +#define TASK_INTERACTIVE(p) \
 532 +       (((p)->prio <= PRIO_INTERACTIVE) || \
 533 +       (((p)->prio < PRIO_CPU_HOG) && \
 534 +               ((p)->prio <= NICE_TO_PRIO((p)->__nice) - INTERACTIVE_DELTA)))
 535
 536  /*
 537 - * This is the function that decides how desirable a process is..
 538 - * You can weigh different processes against each other depending
 539 - * on what CPU they've run on lately etc to try to handle cache
 540 - * and TLB miss penalties.
 541 + * We place interactive tasks back into the active array, if possible.
 542   *
 543 - * Return values:
 544 - *      -1000: never select this
 545 - *          0: out of time, recalculate counters (but it might still be
 546 - *             selected)
 547 - *        +ve: "goodness" value (the larger, the better)
 548 - *      +1000: realtime process, select this.
 549 + * To guarantee that this does not starve expired tasks we ignore the
 550 + * interactivity of a task if the first expired task had to wait more
 551 + * than a 'reasonable' amount of time. This deadline timeout is
  552 + * load-dependent, as the frequency of array switches decreases with
 553 + * increasing number of running tasks:
 554   */
 555 +#define EXPIRED_STARVING(rq) \
 556 +               ((rq)->expired_timestamp && \
 557 +               (jiffies - (rq)->expired_timestamp >= \
 558 +                       STARVATION_LIMIT * ((rq)->nr_running) + 1))
 559
 560 -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
 561 +static inline int effective_prio(task_t *p)
 562  {
 563 -       int weight;
 564 +       int bonus, prio;
 565
 566         /*
 567 -        * select the current process after every other
 568 -        * runnable process, but before the idle thread.
 569 -        * Also, dont trigger a counter recalculation.
 570 +        * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
 571 +        * into the -14 ... +14 bonus/penalty range.
 572 +        *
 573 +        * We use 70% of the full 0...39 priority range so that:
 574 +        *
 575 +        * 1) nice +19 CPU hogs do not preempt nice 0 CPU hogs.
 576 +        * 2) nice -20 interactive tasks do not get preempted by
 577 +        *    nice 0 interactive tasks.
 578 +        *
 579 +        * Both properties are important to certain workloads.
 580          */
 581 -       weight = -1;
 582 -       if (p->policy & SCHED_YIELD)
 583 -               goto out;
 584 +       bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
 585 +                       MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
 586
 587 -       /*
 588 -        * Non-RT process - normal case first.
 589 -        */
 590 -       if (p->policy == SCHED_OTHER) {
 591 +       prio = NICE_TO_PRIO(p->__nice) - bonus;
 592 +       if (prio < MAX_RT_PRIO)
 593 +               prio = MAX_RT_PRIO;
 594 +       if (prio > MAX_PRIO-1)
 595 +               prio = MAX_PRIO-1;
 596 +       return prio;
 597 +}
 598 +
 599 +static inline void activate_task(task_t *p, runqueue_t *rq)
 600 +{
 601 +       unsigned long sleep_time = jiffies - p->sleep_timestamp;
 602 +       prio_array_t *array = rq->active;
 603 +
 604 +       if (!rt_task(p) && sleep_time) {
 605                 /*
 606 -                * Give the process a first-approximation goodness value
 607 -                * according to the number of clock-ticks it has left.
 608 -                *
 609 -                * Don't do any other calculations if the time slice is
 610 -                * over..
 611 +                * This code gives a bonus to interactive tasks. We update
 612 +                * an 'average sleep time' value here, based on
 613 +                * sleep_timestamp. The more time a task spends sleeping,
 614 +                * the higher the average gets - and the higher the priority
 615 +                * boost gets as well.
 616                  */
 617 -               weight = p->counter;
 618 -               if (!weight)
 619 -                       goto out;
 620 -
 621 -#ifdef CONFIG_SMP
 622 -               /* Give a largish advantage to the same processor...   */
 623 -               /* (this is equivalent to penalizing other processors) */
 624 -               if (p->processor == this_cpu)
 625 -                       weight += PROC_CHANGE_PENALTY;
 626 -#endif
 627 -
 628 -               /* .. and a slight advantage to the current MM */
 629 -               if (p->mm == this_mm || !p->mm)
 630 -                       weight += 1;
 631 -               weight += 20 - p->nice;
 632 -               goto out;
 633 +               p->sleep_avg += sleep_time;
 634 +               if (p->sleep_avg > MAX_SLEEP_AVG)
 635 +                       p->sleep_avg = MAX_SLEEP_AVG;
 636 +               p->prio = effective_prio(p);
 637         }
 638 +       enqueue_task(p, array);
 639 +       rq->nr_running++;
 640 +}
 641
 642 -       /*
 643 -        * Realtime process, select the first one on the
 644 -        * runqueue (taking priorities within processes
 645 -        * into account).
 646 -        */
 647 -       weight = 1000 + p->rt_priority;
 648 -out:
 649 -       return weight;
 650 +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 651 +{
 652 +       rq->nr_running--;
 653 +       dequeue_task(p, p->array);
 654 +       p->array = NULL;
 655 +       p->sleep_timestamp = jiffies;
 656  }
 657
 658 -/*
 659 - * the 'goodness value' of replacing a process on a given CPU.
 660 - * positive value means 'replace', zero or negative means 'dont'.
 661 - */
 662 -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
 663 +static inline void resched_task(task_t *p)
 664  {
 665 -       return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
 666 +       int need_resched;
 667 +
 668 +       need_resched = p->need_resched;
 669 +       wmb();
 670 +       p->need_resched = 1;
 671 +       if (!need_resched && (p->cpu != smp_processor_id()))
 672 +               smp_send_reschedule(p->cpu);
 673  }
 674
 675 +#ifdef CONFIG_SMP
 676 +
 677  /*
 678 - * This is ugly, but reschedule_idle() is very timing-critical.
 679 - * We are called with the runqueue spinlock held and we must
 680 - * not claim the tasklist_lock.
 681 + * Wait for a process to unschedule. This is used by the exit() and
 682 + * ptrace() code.
 683   */
 684 -static FASTCALL(void reschedule_idle(struct task_struct * p));
 685 -
 686 -static void reschedule_idle(struct task_struct * p)
 687 +void wait_task_inactive(task_t * p)
 688  {
 689 -#ifdef CONFIG_SMP
 690 -       int this_cpu = smp_processor_id();
 691 -       struct task_struct *tsk, *target_tsk;
 692 -       int cpu, best_cpu, i, max_prio;
 693 -       cycles_t oldest_idle;
 694 +       unsigned long flags;
 695 +       runqueue_t *rq;
 696
 697 -       /*
 698 -        * shortcut if the woken up task's last CPU is
 699 -        * idle now.
 700 -        */
 701 -       best_cpu = p->processor;
 702 -       if (can_schedule(p, best_cpu)) {
 703 -               tsk = idle_task(best_cpu);
 704 -               if (cpu_curr(best_cpu) == tsk) {
 705 -                       int need_resched;
 706 -send_now_idle:
 707 -                       /*
 708 -                        * If need_resched == -1 then we can skip sending
 709 -                        * the IPI altogether, tsk->need_resched is
 710 -                        * actively watched by the idle thread.
 711 -                        */
 712 -                       need_resched = tsk->need_resched;
 713 -                       tsk->need_resched = 1;
 714 -                       if ((best_cpu != this_cpu) && !need_resched)
 715 -                               smp_send_reschedule(best_cpu);
 716 -                       return;
 717 -               }
 718 +repeat:
 719 +       rq = task_rq(p);
 720 +       while (unlikely(rq->curr == p)) {
 721 +               cpu_relax();
 722 +               barrier();
 723 +       }
 724 +       rq = lock_task_rq(p, &flags);
 725 +       if (unlikely(rq->curr == p)) {
 726 +               unlock_task_rq(rq, &flags);
 727 +               goto repeat;
 728         }
 729 -
 730 -       /*
 731 -        * We know that the preferred CPU has a cache-affine current
 732 -        * process, lets try to find a new idle CPU for the woken-up
 733 -        * process. Select the least recently active idle CPU. (that
 734 -        * one will have the least active cache context.) Also find
 735 -        * the executing process which has the least priority.
 736 -        */
 737 -       oldest_idle = (cycles_t) -1;
 738 -       target_tsk = NULL;
 739 -       max_prio = 0;
 740 -
 741 -       for (i = 0; i < smp_num_cpus; i++) {
 742 -               cpu = cpu_logical_map(i);
 743 -               if (!can_schedule(p, cpu))
 744 -                       continue;
 745 -               tsk = cpu_curr(cpu);
 746 -               /*
 747 -                * We use the first available idle CPU. This creates
 748 -                * a priority list between idle CPUs, but this is not
 749 -                * a problem.
 750 -                */
 751 -               if (tsk == idle_task(cpu)) {
 752 -#if defined(__i386__) && defined(CONFIG_SMP)
 753 -                        /*
 754 -                        * Check if two siblings are idle in the same
 755 -                        * physical package. Use them if found.
 756 -                        */
 757 -                       if (smp_num_siblings == 2) {
 758 -                               if (cpu_curr(cpu_sibling_map[cpu]) ==
 759 -                                   idle_task(cpu_sibling_map[cpu])) {
 760 -                                       oldest_idle = last_schedule(cpu);
 761 -                                       target_tsk = tsk;
 762 -                                       break;
 763 -                               }
 764 -
 765 -                        }
 766 -#endif
 767 -                       if (last_schedule(cpu) < oldest_idle) {
 768 -                               oldest_idle = last_schedule(cpu);
 769 -                               target_tsk = tsk;
 770 -                       }
 771 -               } else {
 772 -                       if (oldest_idle == -1ULL) {
 773 -                               int prio = preemption_goodness(tsk, p, cpu);
 774 -
 775 -                               if (prio > max_prio) {
 776 -                                       max_prio = prio;
 777 -                                       target_tsk = tsk;
 778 -                               }
 779 -                       }
 780 -               }
 781 -       }
 782 -       tsk = target_tsk;
 783 -       if (tsk) {
 784 -               if (oldest_idle != -1ULL) {
 785 -                       best_cpu = tsk->processor;
 786 -                       goto send_now_idle;
 787 -               }
 788 -               tsk->need_resched = 1;
 789 -               if (tsk->processor != this_cpu)
 790 -                       smp_send_reschedule(tsk->processor);
 791 -       }
 792 -       return;
 793 -
 794 -
 795 -#else /* UP */
 796 -       int this_cpu = smp_processor_id();
 797 -       struct task_struct *tsk;
 798 -
 799 -       tsk = cpu_curr(this_cpu);
 800 -       if (preemption_goodness(tsk, p, this_cpu) > 0)
 801 -               tsk->need_resched = 1;
 802 -#endif
 803 +       unlock_task_rq(rq, &flags);
 804  }
 805
 806  /*
 807 - * Careful!
 808 + * The SMP message passing code calls this function whenever
 809 + * the new task has arrived at the target CPU. We move the
 810 + * new task into the local runqueue.
 811   *
 812 - * This has to add the process to the _beginning_ of the
 813 - * run-queue, not the end. See the comment about "This is
 814 - * subtle" in the scheduler proper..
 815 + * This function must be called with interrupts disabled.
 816   */
 817 -static inline void add_to_runqueue(struct task_struct * p)
 818 +void sched_task_migrated(task_t *new_task)
 819  {
 820 -       list_add(&p->run_list, &runqueue_head);
 821 -       nr_running++;
 822 +       wait_task_inactive(new_task);
 823 +       new_task->cpu = smp_processor_id();
 824 +       wake_up_process(new_task);
 825  }
 826
 827 -static inline void move_last_runqueue(struct task_struct * p)
 828 -{
 829 -       list_del(&p->run_list);
 830 -       list_add_tail(&p->run_list, &runqueue_head);
 831 -}
 832 -
 833 -static inline void move_first_runqueue(struct task_struct * p)
 834 +/*
 835 + * Kick the remote CPU if the task is running currently,
 836 + * this code is used by the signal code to signal tasks
 837 + * which are in user-mode as quickly as possible.
 838 + *
 839 + * (Note that we do this lockless - if the task does anything
 840 + * while the message is in flight then it will notice the
 841 + * sigpending condition anyway.)
 842 + */
 843 +void kick_if_running(task_t * p)
 844  {
 845 -       list_del(&p->run_list);
 846 -       list_add(&p->run_list, &runqueue_head);
 847 +       if (p == task_rq(p)->curr)
 848 +               resched_task(p);
 849  }
 850 +#endif
 851
 852  /*
 853   * Wake up a process. Put it on the run-queue if it's not
 854 @@ -348,392 +264,472 @@
 855   * "current->state = TASK_RUNNING" to mark yourself runnable
 856   * without the overhead of this.
 857   */
 858 -static inline int try_to_wake_up(struct task_struct * p, int synchronous)
 859 +static int try_to_wake_up(task_t * p, int synchronous)
 860  {
 861         unsigned long flags;
 862         int success = 0;
 863 +       runqueue_t *rq;
 864
 865 -       /*
 866 -        * We want the common case fall through straight, thus the goto.
 867 -        */
 868 -       spin_lock_irqsave(&runqueue_lock, flags);
 869 +       rq = lock_task_rq(p, &flags);
 870         p->state = TASK_RUNNING;
 871 -       if (task_on_runqueue(p))
 872 -               goto out;
 873 -       add_to_runqueue(p);
 874 -       if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
 875 -               reschedule_idle(p);
 876 -       success = 1;
 877 -out:
 878 -       spin_unlock_irqrestore(&runqueue_lock, flags);
 879 +       if (!p->array) {
 880 +               activate_task(p, rq);
 881 +               if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio))
 882 +                       resched_task(rq->curr);
 883 +               success = 1;
 884 +       }
 885 +       unlock_task_rq(rq, &flags);
 886         return success;
 887  }
 888
 889 -inline int wake_up_process(struct task_struct * p)
 890 +int wake_up_process(task_t * p)
 891  {
 892         return try_to_wake_up(p, 0);
 893  }
 894
 895 -static void process_timeout(unsigned long __data)
 896 +void wake_up_forked_process(task_t * p)
 897  {
 898 -       struct task_struct * p = (struct task_struct *) __data;
 899 +       runqueue_t *rq = this_rq();
 900
 901 -       wake_up_process(p);
 902 +       p->state = TASK_RUNNING;
 903 +       if (!rt_task(p)) {
 904 +               current->sleep_avg = current->sleep_avg * PARENT_FORK_PENALTY / 100;
 905 +               p->sleep_avg = p->sleep_avg * CHILD_FORK_PENALTY / 100;
 906 +               p->prio = effective_prio(p);
 907 +       }
 908 +       spin_lock_irq(&rq->lock);
 909 +       p->cpu = smp_processor_id();
 910 +       activate_task(p, rq);
 911 +       spin_unlock_irq(&rq->lock);
 912  }
 913
 914 -/**
 915 - * schedule_timeout - sleep until timeout
 916 - * @timeout: timeout value in jiffies
 917 - *
 918 - * Make the current task sleep until @timeout jiffies have
 919 - * elapsed. The routine will return immediately unless
 920 - * the current task state has been set (see set_current_state()).
 921 - *
 922 - * You can set the task state as follows -
 923 - *
 924 - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 925 - * pass before the routine returns. The routine will return 0
 926 - *
 927 - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 928 - * delivered to the current task. In this case the remaining time
 929 - * in jiffies will be returned, or 0 if the timer expired in time
 930 - *
 931 - * The current task state is guaranteed to be TASK_RUNNING when this
 932 - * routine returns.
 933 - *
 934 - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 935 - * the CPU away without a bound on the timeout. In this case the return
 936 - * value will be %MAX_SCHEDULE_TIMEOUT.
 937 - *
 938 - * In all cases the return value is guaranteed to be non-negative.
 939 - */
 940 -signed long schedule_timeout(signed long timeout)
 941 +asmlinkage void schedule_tail(task_t *prev)
 942  {
 943 -       struct timer_list timer;
 944 -       unsigned long expire;
 945 -
 946 -       switch (timeout)
 947 -       {
 948 -       case MAX_SCHEDULE_TIMEOUT:
 949 -               /*
 950 -                * These two special cases are useful to be comfortable
 951 -                * in the caller. Nothing more. We could take
 952 -                * MAX_SCHEDULE_TIMEOUT from one of the negative value
 953 -                * but I' d like to return a valid offset (>=0) to allow
 954 -                * the caller to do everything it want with the retval.
 955 -                */
 956 -               schedule();
 957 -               goto out;
 958 -       default:
 959 -               /*
 960 -                * Another bit of PARANOID. Note that the retval will be
 961 -                * 0 since no piece of kernel is supposed to do a check
 962 -                * for a negative retval of schedule_timeout() (since it
 963 -                * should never happens anyway). You just have the printk()
 964 -                * that will tell you if something is gone wrong and where.
 965 -                */
 966 -               if (timeout < 0)
 967 -               {
 968 -                       printk(KERN_ERR "schedule_timeout: wrong timeout "
 969 -                              "value %lx from %p\n", timeout,
 970 -                              __builtin_return_address(0));
 971 -                       current->state = TASK_RUNNING;
 972 -                       goto out;
 973 -               }
 974 -       }
 975 +       spin_unlock_irq(&this_rq()->lock);
 976 +}
 977
 978 -       expire = timeout + jiffies;
 979 +static inline void context_switch(task_t *prev, task_t *next)
 980 +{
 981 +       struct mm_struct *mm = next->mm;
 982 +       struct mm_struct *oldmm = prev->active_mm;
 983
 984 -       init_timer(&timer);
 985 -       timer.expires = expire;
 986 -       timer.data = (unsigned long) current;
 987 -       timer.function = process_timeout;
 988 +       prepare_to_switch();
 989
 990 -       add_timer(&timer);
 991 -       schedule();
 992 -       del_timer_sync(&timer);
 993 +       if (unlikely(!mm)) {
 994 +               next->active_mm = oldmm;
 995 +               atomic_inc(&oldmm->mm_count);
 996 +               enter_lazy_tlb(oldmm, next, smp_processor_id());
 997 +       } else
 998 +               switch_mm(oldmm, mm, next, smp_processor_id());
 999
1000 -       timeout = expire - jiffies;
1001 +       if (unlikely(!prev->mm)) {
1002 +               prev->active_mm = NULL;
1003 +               mmdrop(oldmm);
1004 +       }
1005
1006 - out:
1007 -       return timeout < 0 ? 0 : timeout;
1008 +       /*
1009 +        * Here we just switch the register state and the stack. There are
1010 +        * 3 processes affected by a context switch:
1011 +        *
1012 +        * prev ==> .... ==> (last => next)
1013 +        *
1014 +        * It's the 'much more previous' 'prev' that is on next's stack,
1015 +        * but prev is set to (the just run) 'last' process by switch_to().
1016 +        * This might sound slightly confusing but makes tons of sense.
1017 +        */
1018 +       switch_to(prev, next, prev);
1019  }
1020
1021 -/*
1022 - * schedule_tail() is getting called from the fork return path. This
1023 - * cleans up all remaining scheduler things, without impacting the
1024 - * common case.
1025 - */
1026 -static inline void __schedule_tail(struct task_struct *prev)
1027 +unsigned long nr_running(void)
1028  {
1029 -#ifdef CONFIG_SMP
1030 -       int policy;
1031 -
1032 -       /*
1033 -        * prev->policy can be written from here only before `prev'
1034 -        * can be scheduled (before setting prev->cpus_runnable to ~0UL).
1035 -        * Of course it must also be read before allowing prev
1036 -        * to be rescheduled, but since the write depends on the read
1037 -        * to complete, wmb() is enough. (the spin_lock() acquired
1038 -        * before setting cpus_runnable is not enough because the spin_lock()
1039 -        * common code semantics allows code outside the critical section
1040 -        * to enter inside the critical section)
1041 -        */
1042 -       policy = prev->policy;
1043 -       prev->policy = policy & ~SCHED_YIELD;
1044 -       wmb();
1045 +       unsigned long i, sum = 0;
1046
1047 -       /*
1048 -        * fast path falls through. We have to clear cpus_runnable before
1049 -        * checking prev->state to avoid a wakeup race. Protect against
1050 -        * the task exiting early.
1051 -        */
1052 -       task_lock(prev);
1053 -       task_release_cpu(prev);
1054 -       mb();
1055 -       if (prev->state == TASK_RUNNING)
1056 -               goto needs_resched;
1057 +       for (i = 0; i < smp_num_cpus; i++)
1058 +               sum += cpu_rq(cpu_logical_map(i))->nr_running;
1059
1060 -out_unlock:
1061 -       task_unlock(prev);      /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
1062 -       return;
1063 +       return sum;
1064 +}
1065
1066 -       /*
1067 -        * Slow path - we 'push' the previous process and
1068 -        * reschedule_idle() will attempt to find a new
1069 -        * processor for it. (but it might preempt the
1070 -        * current process as well.) We must take the runqueue
1071 -        * lock and re-check prev->state to be correct. It might
1072 -        * still happen that this process has a preemption
1073 -        * 'in progress' already - but this is not a problem and
1074 -        * might happen in other circumstances as well.
1075 -        */
1076 -needs_resched:
1077 -       {
1078 -               unsigned long flags;
1079 +unsigned long nr_context_switches(void)
1080 +{
1081 +       unsigned long i, sum = 0;
1082
1083 -               /*
1084 -                * Avoid taking the runqueue lock in cases where
1085 -                * no preemption-check is necessery:
1086 -                */
1087 -               if ((prev == idle_task(smp_processor_id())) ||
1088 -                                               (policy & SCHED_YIELD))
1089 -                       goto out_unlock;
1090 +       for (i = 0; i < smp_num_cpus; i++)
1091 +               sum += cpu_rq(cpu_logical_map(i))->nr_switches;
1092
1093 -               spin_lock_irqsave(&runqueue_lock, flags);
1094 -               if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
1095 -                       reschedule_idle(prev);
1096 -               spin_unlock_irqrestore(&runqueue_lock, flags);
1097 -               goto out_unlock;
1098 -       }
1099 -#else
1100 -       prev->policy &= ~SCHED_YIELD;
1101 -#endif /* CONFIG_SMP */
1102 +       return sum;
1103  }
1104
1105 -asmlinkage void schedule_tail(struct task_struct *prev)
1106 +#if CONFIG_SMP
1107 +/*
1108 + * Lock the busiest runqueue as well, this_rq is locked already.
1109 + * Recalculate nr_running if we have to drop the runqueue lock.
1110 + */
1111 +static inline unsigned int double_lock_balance(runqueue_t *this_rq,
1112 +       runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
1113  {
1114 -       __schedule_tail(prev);
1115 +       if (unlikely(!spin_trylock(&busiest->lock))) {
1116 +               if (busiest < this_rq) {
1117 +                       spin_unlock(&this_rq->lock);
1118 +                       spin_lock(&busiest->lock);
1119 +                       spin_lock(&this_rq->lock);
1120 +                       /* Need to recalculate nr_running */
1121 +                       if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1122 +                               nr_running = this_rq->nr_running;
1123 +                       else
1124 +                               nr_running = this_rq->prev_nr_running[this_cpu];
1125 +               } else
1126 +                       spin_lock(&busiest->lock);
1127 +       }
1128 +       return nr_running;
1129  }
1130
1131  /*
1132 - *  'schedule()' is the scheduler function. It's a very simple and nice
1133 - * scheduler: it's not perfect, but certainly works for most things.
1134 - *
1135 - * The goto is "interesting".
1136 + * Current runqueue is empty, or rebalance tick: if there is an
1137 + * imbalance (current runqueue is too short) then pull from
1138 + * busiest runqueue(s).
1139   *
1140 - *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
1141 - * tasks can run. It can not be killed, and it cannot sleep. The 'state'
1142 - * information in task[0] is never used.
1143 + * We call this with the current runqueue locked,
1144 + * irqs disabled.
1145   */
1146 -asmlinkage void schedule(void)
1147 +static void load_balance(runqueue_t *this_rq, int idle)
1148  {
1149 -       struct schedule_data * sched_data;
1150 -       struct task_struct *prev, *next, *p;
1151 -       struct list_head *tmp;
1152 -       int this_cpu, c;
1153 +       int imbalance, nr_running, load, max_load,
1154 +               idx, i, this_cpu = smp_processor_id();
1155 +       task_t *next = this_rq->idle, *tmp;
1156 +       runqueue_t *busiest, *rq_src;
1157 +       prio_array_t *array;
1158 +       list_t *head, *curr;
1159
1160 +       /*
1161 +        * We search all runqueues to find the most busy one.
1162 +        * We do this lockless to reduce cache-bouncing overhead,
1163 +        * we re-check the 'best' source CPU later on again, with
1164 +        * the lock held.
1165 +        *
1166 +        * We fend off statistical fluctuations in runqueue lengths by
1167 +        * saving the runqueue length during the previous load-balancing
1168 +        * operation and using the smaller one of the current and saved lengths.
1169 +        * If a runqueue is long enough for a longer amount of time then
1170 +        * we recognize it and pull tasks from it.
1171 +        *
1172 +        * The 'current runqueue length' is a statistical maximum variable,
1173 +        * for that one we take the longer one - to avoid fluctuations in
1174 +        * the other direction. So for a load-balance to happen it needs
1175 +        * stable long runqueue on the target CPU and stable short runqueue
1176 +        * on the local runqueue.
1177 +        *
1178 +        * We make an exception if this CPU is about to become idle - in
1179 +        * that case we are less picky about moving a task across CPUs and
1180 +        * take what can be taken.
1181 +        */
1182 +       if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1183 +               nr_running = this_rq->nr_running;
1184 +       else
1185 +               nr_running = this_rq->prev_nr_running[this_cpu];
1186
1187 -       spin_lock_prefetch(&runqueue_lock);
1188 +       busiest = NULL;
1189 +       max_load = 1;
1190 +       for (i = 0; i < smp_num_cpus; i++) {
1191 +               rq_src = cpu_rq(cpu_logical_map(i));
1192 +               if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
1193 +                       load = rq_src->nr_running;
1194 +               else
1195 +                       load = this_rq->prev_nr_running[i];
1196 +               this_rq->prev_nr_running[i] = rq_src->nr_running;
1197 +
1198 +               if ((load > max_load) && (rq_src != this_rq)) {
1199 +                       busiest = rq_src;
1200 +                       max_load = load;
1201 +               }
1202 +       }
1203
1204 -       if (!current->active_mm) BUG();
1205 -need_resched_back:
1206 -       prev = current;
1207 -       this_cpu = prev->processor;
1208 +       if (likely(!busiest))
1209 +               return;
1210
1211 -       if (unlikely(in_interrupt())) {
1212 -               printk("Scheduling in interrupt\n");
1213 -               BUG();
1214 -       }
1215 +       imbalance = (max_load - nr_running) / 2;
1216
1217 -       release_kernel_lock(prev, this_cpu);
1218 +       /* It needs an at least ~25% imbalance to trigger balancing. */
1219 +       if (!idle && (imbalance < (max_load + 3)/4))
1220 +               return;
1221
1222 +       nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
1223         /*
1224 -        * 'sched_data' is protected by the fact that we can run
1225 -        * only one process per CPU.
1226 +        * Make sure nothing changed since we checked the
1227 +        * runqueue length.
1228          */
1229 -       sched_data = & aligned_data[this_cpu].schedule_data;
1230 -
1231 -       spin_lock_irq(&runqueue_lock);
1232 -
1233 -       /* move an exhausted RR process to be last.. */
1234 -       if (unlikely(prev->policy == SCHED_RR))
1235 -               if (!prev->counter) {
1236 -                       prev->counter = NICE_TO_TICKS(prev->nice);
1237 -                       move_last_runqueue(prev);
1238 -               }
1239 -
1240 -       switch (prev->state) {
1241 -               case TASK_INTERRUPTIBLE:
1242 -                       if (signal_pending(prev)) {
1243 -                               prev->state = TASK_RUNNING;
1244 -                               break;
1245 -                       }
1246 -               default:
1247 -                       del_from_runqueue(prev);
1248 -               case TASK_RUNNING:;
1249 -       }
1250 -       prev->need_resched = 0;
1251 +       if (busiest->nr_running <= this_rq->nr_running + 1)
1252 +               goto out_unlock;
1253
1254         /*
1255 -        * this is the scheduler proper:
1256 +        * We first consider expired tasks. Those will likely not be
1257 +        * executed in the near future, and they are most likely to
1258 +        * be cache-cold, thus switching CPUs has the least effect
1259 +        * on them.
1260          */
1261 +       if (busiest->expired->nr_active)
1262 +               array = busiest->expired;
1263 +       else
1264 +               array = busiest->active;
1265
1266 -repeat_schedule:
1267 +new_array:
1268         /*
1269 -        * Default process to select..
1270 +        * Load-balancing does not affect RT tasks, so we start the
1271 +        * searching at priority 128.
1272          */
1273 -       next = idle_task(this_cpu);
1274 -       c = -1000;
1275 -       list_for_each(tmp, &runqueue_head) {
1276 -               p = list_entry(tmp, struct task_struct, run_list);
1277 -               if (can_schedule(p, this_cpu)) {
1278 -                       int weight = goodness(p, this_cpu, prev->active_mm);
1279 -                       if (weight > c)
1280 -                               c = weight, next = p;
1281 +       idx = MAX_RT_PRIO;
1282 +skip_bitmap:
1283 +       idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1284 +       if (idx == MAX_PRIO) {
1285 +               if (array == busiest->expired) {
1286 +                       array = busiest->active;
1287 +                       goto new_array;
1288                 }
1289 +               goto out_unlock;
1290         }
1291
1292 -       /* Do we need to re-calculate counters? */
1293 -       if (unlikely(!c)) {
1294 -               struct task_struct *p;
1295 -
1296 -               spin_unlock_irq(&runqueue_lock);
1297 -               read_lock(&tasklist_lock);
1298 -               for_each_task(p)
1299 -                       p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
1300 -               read_unlock(&tasklist_lock);
1301 -               spin_lock_irq(&runqueue_lock);
1302 -               goto repeat_schedule;
1303 +       head = array->queue + idx;
1304 +       curr = head->prev;
1305 +skip_queue:
1306 +       tmp = list_entry(curr, task_t, run_list);
1307 +
1308 +       /*
1309 +        * We do not migrate tasks that are:
1310 +        * 1) running (obviously), or
1311 +        * 2) cannot be migrated to this CPU due to cpus_allowed, or
1312 +        * 3) are cache-hot on their current CPU.
1313 +        */
1314 +
1315 +#define CAN_MIGRATE_TASK(p,rq,this_cpu)                                        \
1316 +       ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) &&        \
1317 +               ((p) != (rq)->curr) &&                                  \
1318 +                       (tmp->cpus_allowed & (1 << (this_cpu))))
1319 +
1320 +       if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
1321 +               curr = curr->next;
1322 +               if (curr != head)
1323 +                       goto skip_queue;
1324 +               idx++;
1325 +               goto skip_bitmap;
1326 +       }
1327 +       next = tmp;
1328 +       /*
1329 +        * take the task out of the other runqueue and
1330 +        * put it into this one:
1331 +        */
1332 +       dequeue_task(next, array);
1333 +       busiest->nr_running--;
1334 +       next->cpu = this_cpu;
1335 +       this_rq->nr_running++;
1336 +       enqueue_task(next, this_rq->active);
1337 +       if (next->prio < current->prio)
1338 +               current->need_resched = 1;
1339 +       if (!idle && --imbalance) {
1340 +               if (array == busiest->expired) {
1341 +                       array = busiest->active;
1342 +                       goto new_array;
1343 +               }
1344         }
1345 +out_unlock:
1346 +       spin_unlock(&busiest->lock);
1347 +}
1348 +
1349 +/*
1350 + * One of the idle_cpu_tick() or the busy_cpu_tick() functions
1351 + * gets called every timer tick, on every CPU. Our balancing action
1352 + * frequency and balancing aggressiveness depend on whether the CPU is
1353 + * idle or not.
1354 + *
1355 + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
1356 + * systems with HZ=100, every 10 msecs.)
1357 + */
1358 +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
1359 +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1360 +
1361 +static inline void idle_tick(void)
1362 +{
1363 +       if (jiffies % IDLE_REBALANCE_TICK)
1364 +               return;
1365 +       spin_lock(&this_rq()->lock);
1366 +       load_balance(this_rq(), 1);
1367 +       spin_unlock(&this_rq()->lock);
1368 +}
1369 +
1370 +#endif
1371 +
1372 +/*
1373 + * This function gets called by the timer code, with HZ frequency.
1374 + * We call it with interrupts disabled.
1375 + */
1376 +void scheduler_tick(task_t *p)
1377 +{
1378 +       runqueue_t *rq = this_rq();
1379 +#if CONFIG_SMP
1380 +       unsigned long now = jiffies;
1381
1382 +       if (p == rq->idle)
1383 +               return idle_tick();
1384 +#endif
1385 +       /* Task might have expired already, but not scheduled off yet */
1386 +       if (p->array != rq->active) {
1387 +               p->need_resched = 1;
1388 +               return;
1389 +       }
1390 +       spin_lock(&rq->lock);
1391 +       if (unlikely(rt_task(p))) {
1392 +               /*
1393 +                * RR tasks need a special form of timeslice management.
1394 +                * FIFO tasks have no timeslices.
1395 +                */
1396 +               if ((p->policy == SCHED_RR) && !--p->time_slice) {
1397 +                       p->time_slice = NICE_TO_TIMESLICE(p->__nice);
1398 +                       p->need_resched = 1;
1399 +
1400 +                       /* put it at the end of the queue: */
1401 +                       dequeue_task(p, rq->active);
1402 +                       enqueue_task(p, rq->active);
1403 +               }
1404 +               goto out;
1405 +       }
1406         /*
1407 -        * from this point on nothing can prevent us from
1408 -        * switching to the next task, save this fact in
1409 -        * sched_data.
1410 -        */
1411 -       sched_data->curr = next;
1412 -       task_set_cpu(next, this_cpu);
1413 -       spin_unlock_irq(&runqueue_lock);
1414 -
1415 -       if (unlikely(prev == next)) {
1416 -               /* We won't go through the normal tail, so do this by hand */
1417 -               prev->policy &= ~SCHED_YIELD;
1418 -               goto same_process;
1419 +        * The task was running during this tick - update the
1420 +        * time slice counter and the sleep average. Note: we
1421 +        * do not update a process's priority until it either
1422 +        * goes to sleep or uses up its timeslice. This makes
1423 +        * it possible for interactive tasks to use up their
1424 +        * timeslices at their highest priority levels.
1425 +        */
1426 +       if (p->sleep_avg)
1427 +               p->sleep_avg--;
1428 +       if (!--p->time_slice) {
1429 +               dequeue_task(p, rq->active);
1430 +               p->need_resched = 1;
1431 +               p->prio = effective_prio(p);
1432 +               p->time_slice = NICE_TO_TIMESLICE(p->__nice);
1433 +
1434 +               if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1435 +                       if (!rq->expired_timestamp)
1436 +                               rq->expired_timestamp = jiffies;
1437 +                       enqueue_task(p, rq->expired);
1438 +               } else
1439 +                       enqueue_task(p, rq->active);
1440         }
1441 +out:
1442 +#if CONFIG_SMP
1443 +       if (!(now % BUSY_REBALANCE_TICK))
1444 +               load_balance(rq, 0);
1445 +#endif
1446 +       spin_unlock(&rq->lock);
1447 +}
1448
1449 -#ifdef CONFIG_SMP
1450 -       /*
1451 -        * maintain the per-process 'last schedule' value.
1452 -        * (this has to be recalculated even if we reschedule to
1453 -        * the same process) Currently this is only used on SMP,
1454 -        * and it's approximate, so we do not have to maintain
1455 -        * it while holding the runqueue spinlock.
1456 -        */
1457 -       sched_data->last_schedule = get_cycles();
1458 +void scheduling_functions_start_here(void) { }
1459
1460 -       /*
1461 -        * We drop the scheduler lock early (it's a global spinlock),
1462 -        * thus we have to lock the previous process from getting
1463 -        * rescheduled during switch_to().
1464 -        */
1465 +/*
1466 + * 'schedule()' is the main scheduler function.
1467 + */
1468 +asmlinkage void schedule(void)
1469 +{
1470 +       task_t *prev = current, *next;
1471 +       runqueue_t *rq = this_rq();
1472 +       prio_array_t *array;
1473 +       list_t *queue;
1474 +       int idx;
1475
1476 -#endif /* CONFIG_SMP */
1477 +       if (unlikely(in_interrupt()))
1478 +               BUG();
1479 +       release_kernel_lock(prev, smp_processor_id());
1480 +       spin_lock_irq(&rq->lock);
1481
1482 -       kstat.context_swtch++;
1483 -       /*
1484 -        * there are 3 processes which are affected by a context switch:
1485 -        *
1486 -        * prev == .... ==> (last => next)
1487 -        *
1488 -        * It's the 'much more previous' 'prev' that is on next's stack,
1489 -        * but prev is set to (the just run) 'last' process by switch_to().
1490 -        * This might sound slightly confusing but makes tons of sense.
1491 -        */
1492 -       prepare_to_switch();
1493 -       {
1494 -               struct mm_struct *mm = next->mm;
1495 -               struct mm_struct *oldmm = prev->active_mm;
1496 -               if (!mm) {
1497 -                       if (next->active_mm) BUG();
1498 -                       next->active_mm = oldmm;
1499 -                       atomic_inc(&oldmm->mm_count);
1500 -                       enter_lazy_tlb(oldmm, next, this_cpu);
1501 -               } else {
1502 -                       if (next->active_mm != mm) BUG();
1503 -                       switch_mm(oldmm, mm, next, this_cpu);
1504 +       switch (prev->state) {
1505 +       case TASK_RUNNING:
1506 +               prev->sleep_timestamp = jiffies;
1507 +               break;
1508 +       case TASK_INTERRUPTIBLE:
1509 +               if (unlikely(signal_pending(prev))) {
1510 +                       prev->state = TASK_RUNNING;
1511 +                       prev->sleep_timestamp = jiffies;
1512 +                       break;
1513                 }
1514 +       default:
1515 +               deactivate_task(prev, rq);
1516 +       }
1517 +#if CONFIG_SMP
1518 +pick_next_task:
1519 +#endif
1520 +       if (unlikely(!rq->nr_running)) {
1521 +#if CONFIG_SMP
1522 +               load_balance(rq, 1);
1523 +               if (rq->nr_running)
1524 +                       goto pick_next_task;
1525 +#endif
1526 +               next = rq->idle;
1527 +               rq->expired_timestamp = 0;
1528 +               goto switch_tasks;
1529 +       }
1530
1531 -               if (!prev->mm) {
1532 -                       prev->active_mm = NULL;
1533 -                       mmdrop(oldmm);
1534 -               }
1535 +       array = rq->active;
1536 +       if (unlikely(!array->nr_active)) {
1537 +               /*
1538 +                * Switch the active and expired arrays.
1539 +                */
1540 +               rq->active = rq->expired;
1541 +               rq->expired = array;
1542 +               array = rq->active;
1543 +               rq->expired_timestamp = 0;
1544         }
1545
1546 -       /*
1547 -        * This just switches the register state and the
1548 -        * stack.
1549 -        */
1550 -       switch_to(prev, next, prev);
1551 -       __schedule_tail(prev);
1552 +       idx = sched_find_first_bit(array->bitmap);
1553 +       queue = array->queue + idx;
1554 +       next = list_entry(queue->next, task_t, run_list);
1555 +
1556 +switch_tasks:
1557 +       prefetch(next);
1558 +       prev->need_resched = 0;
1559 +
1560 +       if (likely(prev != next)) {
1561 +               rq->nr_switches++;
1562 +               rq->curr = next;
1563 +               context_switch(prev, next);
1564 +               /*
1565 +                * The runqueue pointer might be from another CPU
1566 +                * if the new task was last running on a different
1567 +                * CPU - thus re-load it.
1568 +                */
1569 +               barrier();
1570 +               rq = this_rq();
1571 +       }
1572 +       spin_unlock_irq(&rq->lock);
1573
1574 -same_process:
1575         reacquire_kernel_lock(current);
1576 -       if (current->need_resched)
1577 -               goto need_resched_back;
1578         return;
1579  }
1580
1581  /*
1582 - * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just wake everything
1583 - * up.  If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
1584 - * non-exclusive tasks and one exclusive task.
1585 + * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
1586 + * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
1587 + * number) then we wake all the non-exclusive tasks and one exclusive task.
1588   *
1589   * There are circumstances in which we can try to wake a task which has already
1590 - * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns zero
1591 - * in this (rare) case, and we handle it by contonuing to scan the queue.
1592 + * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
1593 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
1594   */
1595  static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
1596                                      int nr_exclusive, const int sync)
1597  {
1598         struct list_head *tmp;
1599 -       struct task_struct *p;
1600 +       task_t *p;
1601
1602 -       CHECK_MAGIC_WQHEAD(q);
1603 -       WQ_CHECK_LIST_HEAD(&q->task_list);
1604 -
1605         list_for_each(tmp,&q->task_list) {
1606                 unsigned int state;
1607 -                wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1608 +               wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1609
1610 -               CHECK_MAGIC(curr->__magic);
1611                 p = curr->task;
1612                 state = p->state;
1613 -               if (state & mode) {
1614 -                       WQ_NOTE_WAKER(curr);
1615 -                       if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
1616 -                               break;
1617 -               }
1618 +               if ((state & mode) &&
1619 +                               try_to_wake_up(p, sync) &&
1620 +                               ((curr->flags & WQ_FLAG_EXCLUSIVE) &&
1621 +                                       !--nr_exclusive))
1622 +                       break;
1623         }
1624  }
1625
1626 @@ -850,8 +846,70 @@
1627         return timeout;
1628  }
1629
1630 +/*
1631 + * Change the current task's CPU affinity. Migrate the process to a
1632 + * proper CPU and schedule away if the current CPU is removed from
1633 + * the allowed bitmask.
1634 + */
1635 +void set_cpus_allowed(task_t *p, unsigned long new_mask)
1636 +{
1637 +       new_mask &= cpu_online_map;
1638 +       if (!new_mask)
1639 +               BUG();
1640 +
1641 +       p->cpus_allowed = new_mask;
1642 +       /*
1643 +        * Can the task run on the current CPU? If not then
1644 +        * migrate the process off to a proper CPU.
1645 +        */
1646 +       if (new_mask & (1UL << smp_processor_id()))
1647 +               return;
1648 +#if CONFIG_SMP
1649 +       current->state = TASK_UNINTERRUPTIBLE;
1650 +       smp_migrate_task(__ffs(new_mask), current);
1651 +
1652 +       schedule();
1653 +#endif
1654 +}
1655 +
1656  void scheduling_functions_end_here(void) { }
1657
1658 +/* Set p's nice level, requeue it if queued, resched its CPU if needed. */
1659 +void set_user_nice(task_t *p, long nice)
1660 +{
1661 +       unsigned long flags;
1662 +       prio_array_t *array;
1663 +       runqueue_t *rq;
1664 +       long old_nice;
1665 +
1666 +       if (p->__nice == nice)
1667 +               return;
1668 +       /*
1669 +        * We have to be careful, if called from sys_setpriority(),
1670 +        * the task might be in the middle of scheduling on another CPU.
1671 +        */
1672 +       rq = lock_task_rq(p, &flags);
1673 +       old_nice = p->__nice;   /* p->__nice is overwritten below */
1674 +       if (rt_task(p)) {
1675 +               p->__nice = nice;
1676 +               goto out_unlock;
1677 +       }
1678 +       array = p->array;
1679 +       if (array)
1680 +               dequeue_task(p, array);
1681 +       p->__nice = nice;
1682 +       p->prio = NICE_TO_PRIO(nice);
1683 +       if (array) {
1684 +               enqueue_task(p, array);
1685 +               /* Resched if p gained priority, or lost it while running: */
1686 +               if ((nice < old_nice) ||
1687 +                               ((old_nice < nice) && (p == rq->curr)))
1688 +                       resched_task(rq->curr);
1689 +       }
1690 +out_unlock:
1691 +       unlock_task_rq(rq, &flags);
1692 +}
1693 +
1694  #ifndef __alpha__
1695
1696  /*
1697 @@ -862,7 +920,7 @@
1698
1699  asmlinkage long sys_nice(int increment)
1700  {
1701 -       long newprio;
1702 +       long nice;
1703
1704         /*
1705          *      Setpriority might change our priority at the same moment.
1706 @@ -878,32 +936,30 @@
1707         if (increment > 40)
1708                 increment = 40;
1709
1710 -       newprio = current->nice + increment;
1711 -       if (newprio < -20)
1712 -               newprio = -20;
1713 -       if (newprio > 19)
1714 -               newprio = 19;
1715 -       current->nice = newprio;
1716 +       nice = current->__nice + increment;
1717 +       if (nice < -20)
1718 +               nice = -20;
1719 +       if (nice > 19)
1720 +               nice = 19;
1721 +       set_user_nice(current, nice);
1722         return 0;
1723  }
1724
1725  #endif
1726
1727 -static inline struct task_struct *find_process_by_pid(pid_t pid)
1728 +static inline task_t *find_process_by_pid(pid_t pid)
1729  {
1730 -       struct task_struct *tsk = current;
1731 -
1732 -       if (pid)
1733 -               tsk = find_task_by_pid(pid);
1734 -       return tsk;
1735 +       return pid ? find_task_by_pid(pid) : current;
1736  }
1737
1738 -static int setscheduler(pid_t pid, int policy,
1739 -                       struct sched_param *param)
1740 +static int setscheduler(pid_t pid, int policy, struct sched_param *param)
1741  {
1742         struct sched_param lp;
1743 -       struct task_struct *p;
1744 +       prio_array_t *array;
1745 +       unsigned long flags;
1746 +       runqueue_t *rq;
1747         int retval;
1748 +       task_t *p;
1749
1750         retval = -EINVAL;
1751         if (!param || pid < 0)
1752 @@ -917,14 +973,19 @@
1753          * We play safe to avoid deadlocks.
1754          */
1755         read_lock_irq(&tasklist_lock);
1756 -       spin_lock(&runqueue_lock);
1757
1758         p = find_process_by_pid(pid);
1759
1760         retval = -ESRCH;
1761         if (!p)
1762 -               goto out_unlock;
1763 -
1764 +               goto out_unlock_tasklist;
1765 +
1766 +       /*
1767 +        * To be able to change p->policy safely, the appropriate
1768 +        * runqueue lock must be held.
1769 +        */
1770 +       rq = lock_task_rq(p, &flags);
1771 +
1772         if (policy < 0)
1773                 policy = p->policy;
1774         else {
1775 @@ -945,30 +1006,36 @@
1776                 goto out_unlock;
1777
1778         retval = -EPERM;
1779 -       if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1780 +       if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1781             !capable(CAP_SYS_NICE))
1782                 goto out_unlock;
1783         if ((current->euid != p->euid) && (current->euid != p->uid) &&
1784             !capable(CAP_SYS_NICE))
1785                 goto out_unlock;
1786
1787 +       array = p->array;
1788 +       if (array)
1789 +               deactivate_task(p, task_rq(p));
1790         retval = 0;
1791         p->policy = policy;
1792         p->rt_priority = lp.sched_priority;
1793 -       if (task_on_runqueue(p))
1794 -               move_first_runqueue(p);
1795 -
1796 -       current->need_resched = 1;
1797 +       if (rt_task(p))
1798 +               p->prio = 99-p->rt_priority;
1799 +       else
1800 +               p->prio = NICE_TO_PRIO(p->__nice);
1801 +       if (array)
1802 +               activate_task(p, task_rq(p));
1803
1804  out_unlock:
1805 -       spin_unlock(&runqueue_lock);
1806 +       unlock_task_rq(rq, &flags);
1807 +out_unlock_tasklist:
1808         read_unlock_irq(&tasklist_lock);
1809
1810  out_nounlock:
1811         return retval;
1812  }
1813
1814 -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1815 +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1816                                       struct sched_param *param)
1817  {
1818         return setscheduler(pid, policy, param);
1819 @@ -981,7 +1048,7 @@
1820
1821  asmlinkage long sys_sched_getscheduler(pid_t pid)
1822  {
1823 -       struct task_struct *p;
1824 +       task_t *p;
1825         int retval;
1826
1827         retval = -EINVAL;
1828 @@ -992,7 +1059,7 @@
1829         read_lock(&tasklist_lock);
1830         p = find_process_by_pid(pid);
1831         if (p)
1832 -               retval = p->policy & ~SCHED_YIELD;
1833 +               retval = p->policy;
1834         read_unlock(&tasklist_lock);
1835
1836  out_nounlock:
1837 @@ -1001,7 +1068,7 @@
1838
1839  asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
1840  {
1841 -       struct task_struct *p;
1842 +       task_t *p;
1843         struct sched_param lp;
1844         int retval;
1845
1846 @@ -1032,42 +1099,38 @@
1847
1848  asmlinkage long sys_sched_yield(void)
1849  {
1850 +       runqueue_t *rq = this_rq();
1851 +       prio_array_t *array;
1852 +
1853         /*
1854 -        * Trick. sched_yield() first counts the number of truly
1855 -        * 'pending' runnable processes, then returns if it's
1856 -        * only the current processes. (This test does not have
1857 -        * to be atomic.) In threaded applications this optimization
1858 -        * gets triggered quite often.
1859 +        * Decrease the yielding task's priority by one, to avoid
1860 +        * livelocks. This priority loss is temporary, it's recovered
1861 +        * once the current timeslice expires.
1862 +        *
1863 +        * If priority is already MAX_PRIO-1 then we still
1864 +        * roundrobin the task within the runlist.
1865          */
1866 -
1867 -       int nr_pending = nr_running;
1868 -
1869 -#if CONFIG_SMP
1870 -       int i;
1871 -
1872 -       // Subtract non-idle processes running on other CPUs.
1873 -       for (i = 0; i < smp_num_cpus; i++) {
1874 -               int cpu = cpu_logical_map(i);
1875 -               if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
1876 -                       nr_pending--;
1877 +       spin_lock_irq(&rq->lock);
1878 +       array = current->array;
1879 +       /*
1880 +        * If the task is already at the lowest priority (or is a RT task)
1881 +        * then just requeue the task to the end of the runqueue:
1882 +        */
1883 +       if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
1884 +               list_del(&current->run_list);
1885 +               list_add_tail(&current->run_list, array->queue + current->prio);
1886 +       } else {
1887 +               list_del(&current->run_list);
1888 +               if (list_empty(array->queue + current->prio))
1889 +                       __clear_bit(current->prio, array->bitmap);
1890 +               current->prio++;
1891 +               list_add_tail(&current->run_list, array->queue + current->prio);
1892 +               __set_bit(current->prio, array->bitmap);
1893         }
1894 -#else
1895 -       // on UP this process is on the runqueue as well
1896 -       nr_pending--;
1897 -#endif
1898 -       if (nr_pending) {
1899 -               /*
1900 -                * This process can only be rescheduled by us,
1901 -                * so this is safe without any locking.
1902 -                */
1903 -               if (current->policy == SCHED_OTHER)
1904 -                       current->policy |= SCHED_YIELD;
1905 -               current->need_resched = 1;
1906 +       spin_unlock(&rq->lock);
1907 +
1908 +       schedule();
1909
1910 -               spin_lock_irq(&runqueue_lock);
1911 -               move_last_runqueue(current);
1912 -               spin_unlock_irq(&runqueue_lock);
1913 -       }
1914         return 0;
1915  }
1916
1917 @@ -1105,7 +1168,7 @@
1918  asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1919  {
1920         struct timespec t;
1921 -       struct task_struct *p;
1922 +       task_t *p;
1923         int retval = -EINVAL;
1924
1925         if (pid < 0)
1926 @@ -1115,8 +1178,8 @@
1927         read_lock(&tasklist_lock);
1928         p = find_process_by_pid(pid);
1929         if (p)
1930 -               jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
1931 -                                   &t);
1932 +               jiffies_to_timespec(p->policy & SCHED_FIFO ?
1933 +                                        0 : NICE_TO_TIMESLICE(p->__nice), &t);
1934         read_unlock(&tasklist_lock);
1935         if (p)
1936                 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1937 @@ -1124,14 +1187,14 @@
1938         return retval;
1939  }
1940
1941 -static void show_task(struct task_struct * p)
1942 +static void show_task(task_t * p)
1943  {
1944         unsigned long free = 0;
1945         int state;
1946         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1947
1948         printk("%-13.13s ", p->comm);
1949 -       state = p->state ? ffz(~p->state) + 1 : 0;
1950 +       state = p->state ? __ffs(p->state) + 1 : 0;
1951         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
1952                 printk(stat_nam[state]);
1953         else
1954 @@ -1172,7 +1235,7 @@
1955                 printk(" (NOTLB)\n");
1956
1957         {
1958 -               extern void show_trace_task(struct task_struct *tsk);
1959 +               extern void show_trace_task(task_t *tsk);
1960                 show_trace_task(p);
1961         }
1962  }
1963 @@ -1194,7 +1257,7 @@
1964
1965  void show_state(void)
1966  {
1967 -       struct task_struct *p;
1968 +       task_t *p;
1969
1970  #if (BITS_PER_LONG == 32)
1971         printk("\n"
1972 @@ -1217,121 +1280,88 @@
1973         read_unlock(&tasklist_lock);
1974  }
1975
1976 -/**
1977 - * reparent_to_init() - Reparent the calling kernel thread to the init task.
1978 - *
1979 - * If a kernel thread is launched as a result of a system call, or if
1980 - * it ever exits, it should generally reparent itself to init so that
1981 - * it is correctly cleaned up on exit.
1982 - *
1983 - * The various task state such as scheduling policy and priority may have
1984 - * been inherited fro a user process, so we reset them to sane values here.
1985 - *
1986 - * NOTE that reparent_to_init() gives the caller full capabilities.
1987 - */
1988 -void reparent_to_init(void)
1989 +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1990  {
1991 -       struct task_struct *this_task = current;
1992 -
1993 -       write_lock_irq(&tasklist_lock);
1994 -
1995 -       /* Reparent to init */
1996 -       REMOVE_LINKS(this_task);
1997 -       this_task->p_pptr = child_reaper;
1998 -       this_task->p_opptr = child_reaper;
1999 -       SET_LINKS(this_task);
2000 -
2001 -       /* Set the exit signal to SIGCHLD so we signal init on exit */
2002 -       this_task->exit_signal = SIGCHLD;
2003 -
2004 -       /* We also take the runqueue_lock while altering task fields
2005 -        * which affect scheduling decisions */
2006 -       spin_lock(&runqueue_lock);
2007 -
2008 -       this_task->ptrace = 0;
2009 -       this_task->nice = DEF_NICE;
2010 -       this_task->policy = SCHED_OTHER;
2011 -       /* cpus_allowed? */
2012 -       /* rt_priority? */
2013 -       /* signals? */
2014 -       this_task->cap_effective = CAP_INIT_EFF_SET;
2015 -       this_task->cap_inheritable = CAP_INIT_INH_SET;
2016 -       this_task->cap_permitted = CAP_FULL_SET;
2017 -       this_task->keep_capabilities = 0;
2018 -       memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
2019 -       this_task->user = INIT_USER;
2020 -
2021 -       spin_unlock(&runqueue_lock);
2022 -       write_unlock_irq(&tasklist_lock);
2023 +       if (rq1 == rq2)
2024 +               spin_lock(&rq1->lock);
2025 +       else {
2026 +               if (rq1 < rq2) {
2027 +                       spin_lock(&rq1->lock);
2028 +                       spin_lock(&rq2->lock);
2029 +               } else {
2030 +                       spin_lock(&rq2->lock);
2031 +                       spin_lock(&rq1->lock);
2032 +               }
2033 +       }
2034  }
2035
2036 -/*
2037 - *     Put all the gunge required to become a kernel thread without
2038 - *     attached user resources in one place where it belongs.
2039 - */
2040 -
2041 -void daemonize(void)
2042 +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
2043  {
2044 -       struct fs_struct *fs;
2045 -
2046 -
2047 -       /*
2048 -        * If we were started as result of loading a module, close all of the
2049 -        * user space pages.  We don't need them, and if we didn't close them
2050 -        * they would be locked into memory.
2051 -        */
2052 -       exit_mm(current);
2053 -
2054 -       current->session = 1;
2055 -       current->pgrp = 1;
2056 -       current->tty = NULL;
2057 -
2058 -       /* Become as one with the init task */
2059 -
2060 -       exit_fs(current);       /* current->fs->count--; */
2061 -       fs = init_task.fs;
2062 -       current->fs = fs;
2063 -       atomic_inc(&fs->count);
2064 -       exit_files(current);
2065 -       current->files = init_task.files;
2066 -       atomic_inc(&current->files->count);
2067 +       spin_unlock(&rq1->lock);
2068 +       if (rq1 != rq2)
2069 +               spin_unlock(&rq2->lock);
2070  }
2071
2072 -extern unsigned long wait_init_idle;
2073 -
2074 -void __init init_idle(void)
2075 +void __init init_idle(task_t *idle, int cpu)
2076  {
2077 -       struct schedule_data * sched_data;
2078 -       sched_data = &aligned_data[smp_processor_id()].schedule_data;
2079 +       runqueue_t *idle_rq = cpu_rq(cpu), *rq = idle->array->rq;
2080 +       unsigned long flags;
2081
2082 -       if (current != &init_task && task_on_runqueue(current)) {
2083 -               printk("UGH! (%d:%d) was on the runqueue, removing.\n",
2084 -                       smp_processor_id(), current->pid);
2085 -               del_from_runqueue(current);
2086 -       }
2087 -       sched_data->curr = current;
2088 -       sched_data->last_schedule = get_cycles();
2089 -       clear_bit(current->processor, &wait_init_idle);
2090 +       __save_flags(flags);
2091 +       __cli();
2092 +       double_rq_lock(idle_rq, rq);
2093 +
2094 +       idle_rq->curr = idle_rq->idle = idle;
2095 +       deactivate_task(idle, rq);
2096 +       idle->array = NULL;
2097 +       idle->prio = MAX_PRIO;
2098 +       idle->state = TASK_RUNNING;
2099 +       idle->cpu = cpu;
2100 +       double_rq_unlock(idle_rq, rq);
2101 +       idle->need_resched = 1;
2102 +       __restore_flags(flags);
2103  }
2104
2105 -extern void init_timervecs (void);
2106 +extern void init_timervecs(void);
2107 +extern void timer_bh(void);
2108 +extern void tqueue_bh(void);
2109 +extern void immediate_bh(void);
2110
2111  void __init sched_init(void)
2112  {
2113 +       runqueue_t *rq;
2114 +       int i, j, k;
2115 +
2116 +       for (i = 0; i < NR_CPUS; i++) {
2117 +               runqueue_t *rq = cpu_rq(i);
2118 +               prio_array_t *array;
2119 +
2120 +               rq->active = rq->arrays + 0;
2121 +               rq->expired = rq->arrays + 1;
2122 +               spin_lock_init(&rq->lock);
2123 +
2124 +               for (j = 0; j < 2; j++) {
2125 +                       array = rq->arrays + j;
2126 +                       array->rq = rq;
2127 +                       array->lock = &rq->lock;
2128 +                       for (k = 0; k < MAX_PRIO; k++) {
2129 +                               INIT_LIST_HEAD(array->queue + k);
2130 +                               __clear_bit(k, array->bitmap);
2131 +                       }
2132 +                       // delimiter for bitsearch
2133 +                       __set_bit(MAX_PRIO, array->bitmap);
2134 +               }
2135 +       }
2136         /*
2137          * We have to do a little magic to get the first
2138          * process right in SMP mode.
2139          */
2140 -       int cpu = smp_processor_id();
2141 -       int nr;
2142 -
2143 -       init_task.processor = cpu;
2144 -
2145 -       for(nr = 0; nr < PIDHASH_SZ; nr++)
2146 -               pidhash[nr] = NULL;
2147 +       rq = this_rq();
2148 +       rq->curr = current;
2149 +       rq->idle = current;
2150 +       wake_up_process(current);
2151
2152         init_timervecs();
2153 -
2154         init_bh(TIMER_BH, timer_bh);
2155         init_bh(TQUEUE_BH, tqueue_bh);
2156         init_bh(IMMEDIATE_BH, immediate_bh);
2157 @@ -1340,5 +1370,5 @@
2158          * The boot idle thread does lazy MMU switching as well:
2159          */
2160         atomic_inc(&init_mm.mm_count);
2161 -       enter_lazy_tlb(&init_mm, current, cpu);
2162 +       enter_lazy_tlb(&init_mm, current, smp_processor_id());
2163  }
2164 --- linux/kernel/exit.c.orig    Sun Jan  6 13:55:56 2002
2165 +++ linux/kernel/exit.c Mon Jan 28 18:01:36 2002
2166 @@ -27,49 +27,42 @@
2167
2168  static void release_task(struct task_struct * p)
2169  {
2170 -       if (p != current) {
2171 +       unsigned long flags;
2172 +
2173 +       if (p == current)
2174 +               BUG();
2175  #ifdef CONFIG_SMP
2176 -               /*
2177 -                * Wait to make sure the process isn't on the
2178 -                * runqueue (active on some other CPU still)
2179 -                */
2180 -               for (;;) {
2181 -                       task_lock(p);
2182 -                       if (!task_has_cpu(p))
2183 -                               break;
2184 -                       task_unlock(p);
2185 -                       do {
2186 -                               cpu_relax();
2187 -                               barrier();
2188 -                       } while (task_has_cpu(p));
2189 -               }
2190 -               task_unlock(p);
2191 +       wait_task_inactive(p);
2192  #endif
2193 -               atomic_dec(&p->user->processes);
2194 -               free_uid(p->user);
2195 -               unhash_process(p);
2196 -
2197 -               release_thread(p);
2198 -               current->cmin_flt += p->min_flt + p->cmin_flt;
2199 -               current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2200 -               current->cnswap += p->nswap + p->cnswap;
2201 -               /*
2202 -                * Potentially available timeslices are retrieved
2203 -                * here - this way the parent does not get penalized
2204 -                * for creating too many processes.
2205 -                *
2206 -                * (this cannot be used to artificially 'generate'
2207 -                * timeslices, because any timeslice recovered here
2208 -                * was given away by the parent in the first place.)
2209 -                */
2210 -               current->counter += p->counter;
2211 -               if (current->counter >= MAX_COUNTER)
2212 -                       current->counter = MAX_COUNTER;
2213 -               p->pid = 0;
2214 -               free_task_struct(p);
2215 -       } else {
2216 -               printk("task releasing itself\n");
2217 -       }
2218 +       atomic_dec(&p->user->processes);
2219 +       free_uid(p->user);
2220 +       unhash_process(p);
2221 +
2222 +       release_thread(p);
2223 +       current->cmin_flt += p->min_flt + p->cmin_flt;
2224 +       current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2225 +       current->cnswap += p->nswap + p->cnswap;
2226 +       /*
2227 +        * Potentially available timeslices are retrieved
2228 +        * here - this way the parent does not get penalized
2229 +        * for creating too many processes.
2230 +        *
2231 +        * (this cannot be used to artificially 'generate'
2232 +        * timeslices, because any timeslice recovered here
2233 +        * was given away by the parent in the first place.)
2234 +        */
2235 +       __save_flags(flags);
2236 +       __cli();
2237 +       current->time_slice += p->time_slice;
2238 +       if (current->time_slice > MAX_TIMESLICE)
2239 +               current->time_slice = MAX_TIMESLICE;
2240 +       if (p->sleep_avg < current->sleep_avg)
2241 +               current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
2242 +                       p->sleep_avg) / (EXIT_WEIGHT + 1);
2243 +       __restore_flags(flags);
2244 +
2245 +       p->pid = 0;
2246 +       free_task_struct(p);
2247  }
2248
2249  /*
2250 @@ -147,6 +140,79 @@
2251         }
2252         read_unlock(&tasklist_lock);
2253         return retval;
2254 +}
2255 +
2256 +/**
2257 + * reparent_to_init() - Reparent the calling kernel thread to the init task.
2258 + *
2259 + * If a kernel thread is launched as a result of a system call, or if
2260 + * it ever exits, it should generally reparent itself to init so that
2261 + * it is correctly cleaned up on exit.
2262 + *
2263 + * The various task state such as scheduling policy and priority may have
2264 + * been inherited from a user process, so we reset them to sane values here.
2265 + *
2266 + * NOTE that reparent_to_init() gives the caller full capabilities.
2267 + */
2268 +void reparent_to_init(void)
2269 +{
2270 +       write_lock_irq(&tasklist_lock);
2271 +
2272 +       /* Reparent to init */
2273 +       REMOVE_LINKS(current);
2274 +       current->p_pptr = child_reaper;
2275 +       current->p_opptr = child_reaper;
2276 +       SET_LINKS(current);
2277 +
2278 +       /* Set the exit signal to SIGCHLD so we signal init on exit */
2279 +       current->exit_signal = SIGCHLD;
2280 +
2281 +       current->ptrace = 0;
2282 +       if ((current->policy == SCHED_OTHER) && (current->__nice < DEF_USER_NICE))
2283 +               set_user_nice(current, DEF_USER_NICE);
2284 +       /* cpus_allowed? */
2285 +       /* rt_priority? */
2286 +       /* signals? */
2287 +       current->cap_effective = CAP_INIT_EFF_SET;
2288 +       current->cap_inheritable = CAP_INIT_INH_SET;
2289 +       current->cap_permitted = CAP_FULL_SET;
2290 +       current->keep_capabilities = 0;
2291 +       memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
2292 +       current->user = INIT_USER;
2293 +
2294 +       write_unlock_irq(&tasklist_lock);
2295 +}
2296 +
2297 +/*
2298 + *     Put all the gunge required to become a kernel thread without
2299 + *     attached user resources in one place where it belongs.
2300 + */
2301 +
2302 +void daemonize(void)
2303 +{
2304 +       struct fs_struct *fs;
2305 +
2306 +
2307 +       /*
2308 +        * If we were started as result of loading a module, close all of the
2309 +        * user space pages.  We don't need them, and if we didn't close them
2310 +        * they would be locked into memory.
2311 +        */
2312 +       exit_mm(current);
2313 +
2314 +       current->session = 1;
2315 +       current->pgrp = 1;
2316 +       current->tty = NULL;
2317 +
2318 +       /* Become as one with the init task */
2319 +
2320 +       exit_fs(current);       /* current->fs->count--; */
2321 +       fs = init_task.fs;
2322 +       current->fs = fs;
2323 +       atomic_inc(&fs->count);
2324 +       exit_files(current);
2325 +       current->files = init_task.files;
2326 +       atomic_inc(&current->files->count);
2327  }
2328
2329  /*
2330 --- linux/kernel/capability.c.orig      Sat Jun 24 06:06:37 2000
2331 +++ linux/kernel/capability.c   Sun Jan  6 13:56:25 2002
2332 @@ -8,6 +8,8 @@
2333  #include <linux/mm.h>
2334  #include <asm/uaccess.h>
2335
2336 +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
2337 +
2338  kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
2339
2340  /* Note: never hold tasklist_lock while spinning for this one */
2341 --- linux/kernel/timer.c.orig   Sun Jan  6 13:55:49 2002
2342 +++ linux/kernel/timer.c        Mon Jan 21 12:53:05 2002
2343 @@ -25,6 +25,8 @@
2344
2345  #include <asm/uaccess.h>
2346
2347 +struct kernel_stat kstat;
2348 +
2349  /*
2350   * Timekeeping variables
2351   */
2352 @@ -583,17 +585,16 @@
2353
2354         update_one_process(p, user_tick, system, cpu);
2355         if (p->pid) {
2356 -               if (--p->counter <= 0) {
2357 -                       p->counter = 0;
2358 -                       p->need_resched = 1;
2359 -               }
2360 -               if (p->nice > 0)
2361 +               if (p->__nice > 0)
2362                         kstat.per_cpu_nice[cpu] += user_tick;
2363                 else
2364                         kstat.per_cpu_user[cpu] += user_tick;
2365                 kstat.per_cpu_system[cpu] += system;
2366 -       } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
2367 -               kstat.per_cpu_system[cpu] += system;
2368 +       } else {
2369 +               if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
2370 +                       kstat.per_cpu_system[cpu] += system;
2371 +       }
2372 +       scheduler_tick(p);
2373  }
2374
2375  /*
2376 @@ -794,6 +795,89 @@
2377
2378  #endif
2379
2380 +static void process_timeout(unsigned long __data)
2381 +{
2382 +       wake_up_process((task_t *)__data);
2383 +}
2384 +
2385 +/**
2386 + * schedule_timeout - sleep until timeout
2387 + * @timeout: timeout value in jiffies
2388 + *
2389 + * Make the current task sleep until @timeout jiffies have
2390 + * elapsed. The routine will return immediately unless
2391 + * the current task state has been set (see set_current_state()).
2392 + *
2393 + * You can set the task state as follows -
2394 + *
2395 + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
2396 + * pass before the routine returns. The routine will return 0
2397 + *
2398 + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2399 + * delivered to the current task. In this case the remaining time
2400 + * in jiffies will be returned, or 0 if the timer expired in time
2401 + *
2402 + * The current task state is guaranteed to be TASK_RUNNING when this
2403 + * routine returns.
2404 + *
2405 + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
2406 + * the CPU away without a bound on the timeout. In this case the return
2407 + * value will be %MAX_SCHEDULE_TIMEOUT.
2408 + *
2409 + * In all cases the return value is guaranteed to be non-negative.
2410 + */
2411 +signed long schedule_timeout(signed long timeout)
2412 +{
2413 +       struct timer_list timer;
2414 +       unsigned long expire;
2415 +
2416 +       switch (timeout)
2417 +       {
2418 +       case MAX_SCHEDULE_TIMEOUT:
2419 +               /*
2420 +                * These two special cases are useful to be comfortable
2421 +                * in the caller. Nothing more. We could take
2422 +                * MAX_SCHEDULE_TIMEOUT from one of the negative value
2423 +                * but I'd like to return a valid offset (>=0) to allow
2424 +                * the caller to do everything it wants with the retval.
2425 +                */
2426 +               schedule();
2427 +               goto out;
2428 +       default:
2429 +               /*
2430 +                * Another bit of PARANOID. Note that the retval will be
2431 +                * 0 since no piece of kernel is supposed to do a check
2432 +                * for a negative retval of schedule_timeout() (since it
2433 +                * should never happen anyway). You just have the printk()
2434 +                * that will tell you if something has gone wrong and where.
2435 +                */
2436 +               if (timeout < 0)
2437 +               {
2438 +                       printk(KERN_ERR "schedule_timeout: wrong timeout "
2439 +                              "value %lx from %p\n", timeout,
2440 +                              __builtin_return_address(0));
2441 +                       current->state = TASK_RUNNING;
2442 +                       goto out;
2443 +               }
2444 +       }
2445 +
2446 +       expire = timeout + jiffies;
2447 +
2448 +       init_timer(&timer);
2449 +       timer.expires = expire;
2450 +       timer.data = (unsigned long) current;
2451 +       timer.function = process_timeout;
2452 +
2453 +       add_timer(&timer);
2454 +       schedule();
2455 +       del_timer_sync(&timer);
2456 +
2457 +       timeout = expire - jiffies;
2458 +
2459 + out:
2460 +       return timeout < 0 ? 0 : timeout;
2461 +}
2462 +
2463  /* Thread ID - the internal kernel "pid" */
2464  asmlinkage long sys_gettid(void)
2465  {
2466 @@ -840,4 +924,3 @@
2467         }
2468         return 0;
2469  }
2470 -
2471 --- linux/kernel/fork.c.orig    Sun Jan  6 13:55:56 2002
2472 +++ linux/kernel/fork.c Thu Jan 24 13:45:09 2002
2473 @@ -28,7 +28,6 @@
2474
2475  /* The idle threads do not count.. */
2476  int nr_threads;
2477 -int nr_running;
2478
2479  int max_threads;
2480  unsigned long total_forks;     /* Handle normal Linux uptimes. */
2481 @@ -36,6 +35,8 @@
2482
2483  struct task_struct *pidhash[PIDHASH_SZ];
2484
2485 +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
2486 +
2487  void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
2488  {
2489         unsigned long flags;
2490 @@ -563,6 +564,7 @@
2491             struct pt_regs *regs, unsigned long stack_size)
2492  {
2493         int retval;
2494 +       unsigned long flags;
2495         struct task_struct *p;
2496         struct completion vfork;
2497
2498 @@ -611,8 +613,7 @@
2499         copy_flags(clone_flags, p);
2500         p->pid = get_pid(clone_flags);
2501
2502 -       p->run_list.next = NULL;
2503 -       p->run_list.prev = NULL;
2504 +       INIT_LIST_HEAD(&p->run_list);
2505
2506         p->p_cptr = NULL;
2507         init_waitqueue_head(&p->wait_chldexit);
2508 @@ -638,14 +639,15 @@
2509  #ifdef CONFIG_SMP
2510         {
2511                 int i;
2512 -               p->cpus_runnable = ~0UL;
2513 -               p->processor = current->processor;
2514 +
2515                 /* ?? should we just memset this ?? */
2516                 for(i = 0; i < smp_num_cpus; i++)
2517 -                       p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
2518 +                       p->per_cpu_utime[cpu_logical_map(i)] =
2519 +                               p->per_cpu_stime[cpu_logical_map(i)] = 0;
2520                 spin_lock_init(&p->sigmask_lock);
2521         }
2522  #endif
2523 +       p->array = NULL;
2524         p->lock_depth = -1;             /* -1 = no lock */
2525         p->start_time = jiffies;
2526
2527 @@ -677,15 +679,27 @@
2528         p->pdeath_signal = 0;
2529
2530         /*
2531 -        * "share" dynamic priority between parent and child, thus the
2532 -        * total amount of dynamic priorities in the system doesnt change,
2533 -        * more scheduling fairness. This is only important in the first
2534 -        * timeslice, on the long run the scheduling behaviour is unchanged.
2535 +        * Share the timeslice between parent and child, thus the
2536 +        * total amount of pending timeslices in the system doesn't change,
2537 +        * resulting in more scheduling fairness.
2538          */
2539 -       p->counter = (current->counter + 1) >> 1;
2540 -       current->counter >>= 1;
2541 -       if (!current->counter)
2542 -               current->need_resched = 1;
2543 +       __save_flags(flags);
2544 +       __cli();
2545 +       if (!current->time_slice)
2546 +               BUG();
2547 +       p->time_slice = (current->time_slice + 1) >> 1;
2548 +       current->time_slice >>= 1;
2549 +       if (!current->time_slice) {
2550 +               /*
2551 +                * This case is rare, it happens when the parent has only
2552 +                * a single jiffy left from its timeslice. Taking the
2553 +                * runqueue lock is not a problem.
2554 +                */
2555 +               current->time_slice = 1;
2556 +               scheduler_tick(current);
2557 +       }
2558 +       p->sleep_timestamp = jiffies;
2559 +       __restore_flags(flags);
2560
2561         /*
2562          * Ok, add it to the run-queues and make it
2563 @@ -722,10 +736,23 @@
2564         if (p->ptrace & PT_PTRACED)
2565                 send_sig(SIGSTOP, p, 1);
2566
2567 +#define RUN_CHILD_FIRST 1
2568 +#if RUN_CHILD_FIRST
2569 +       wake_up_forked_process(p);      /* do this last */
2570 +#else
2571         wake_up_process(p);             /* do this last */
2572 +#endif
2573         ++total_forks;
2574         if (clone_flags & CLONE_VFORK)
2575                 wait_for_completion(&vfork);
2576 +#if RUN_CHILD_FIRST
2577 +       else
2578 +               /*
2579 +                * Let the child process run first, to avoid most of the
2580 +                * COW overhead when the child exec()s afterwards.
2581 +                */
2582 +               current->need_resched = 1;
2583 +#endif
2584
2585  fork_out:
2586         return retval;
2587 --- linux/kernel/softirq.c.orig Sun Jan  6 13:55:53 2002
2588 +++ linux/kernel/softirq.c      Wed Jan 16 00:52:11 2002
2589 @@ -259,10 +259,9 @@
2590
2591         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
2592                 current->state = TASK_RUNNING;
2593 -               do {
2594 -                       current->policy |= SCHED_YIELD;
2595 -                       schedule();
2596 -               } while (test_bit(TASKLET_STATE_SCHED, &t->state));
2597 +               do
2598 +                       sys_sched_yield();
2599 +               while (test_bit(TASKLET_STATE_SCHED, &t->state));
2600         }
2601         tasklet_unlock_wait(t);
2602         clear_bit(TASKLET_STATE_SCHED, &t->state);
2603 @@ -365,13 +364,13 @@
2604         int cpu = cpu_logical_map(bind_cpu);
2605
2606         daemonize();
2607 -       current->nice = 19;
2608 +       set_user_nice(current, 19);
2609         sigfillset(&current->blocked);
2610
2611         /* Migrate to the right CPU */
2612 -       current->cpus_allowed = 1UL << cpu;
2613 -       while (smp_processor_id() != cpu)
2614 -               schedule();
2615 +       set_cpus_allowed(current, 1UL << cpu);
2616 +       if (cpu() != cpu)
2617 +               BUG();
2618
2619         sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
2620
2621 @@ -396,7 +395,7 @@
2622         }
2623  }
2624
2625 -static __init int spawn_ksoftirqd(void)
2626 +__init int spawn_ksoftirqd(void)
2627  {
2628         int cpu;
2629
2630 @@ -405,14 +404,12 @@
2631                                   CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
2632                         printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
2633                 else {
2634 -                       while (!ksoftirqd_task(cpu_logical_map(cpu))) {
2635 -                               current->policy |= SCHED_YIELD;
2636 -                               schedule();
2637 -                       }
2638 +                       while (!ksoftirqd_task(cpu_logical_map(cpu)))
2639 +                               sys_sched_yield();
2640                 }
2641         }
2642
2643         return 0;
2644  }
2645
2646 -__initcall(spawn_ksoftirqd);
2647 +__initcall(spawn_ksoftirqd);
2648 --- linux/kernel/ptrace.c.orig  Sun Jan  6 13:55:57 2002
2649 +++ linux/kernel/ptrace.c       Sun Jan  6 13:56:25 2002
2650 @@ -31,20 +31,7 @@
2651                 if (child->state != TASK_STOPPED)
2652                         return -ESRCH;
2653  #ifdef CONFIG_SMP
2654 -               /* Make sure the child gets off its CPU.. */
2655 -               for (;;) {
2656 -                       task_lock(child);
2657 -                       if (!task_has_cpu(child))
2658 -                               break;
2659 -                       task_unlock(child);
2660 -                       do {
2661 -                               if (child->state != TASK_STOPPED)
2662 -                                       return -ESRCH;
2663 -                               barrier();
2664 -                               cpu_relax();
2665 -                       } while (task_has_cpu(child));
2666 -               }
2667 -               task_unlock(child);
2668 +               wait_task_inactive(child);
2669  #endif
2670         }
2671
2672 --- linux/kernel/sys.c.orig     Sun Jan  6 13:55:47 2002
2673 +++ linux/kernel/sys.c  Sun Jan  6 13:56:25 2002
2674 @@ -220,10 +220,10 @@
2675                 }
2676                 if (error == -ESRCH)
2677                         error = 0;
2678 -               if (niceval < p->nice && !capable(CAP_SYS_NICE))
2679 +               if (niceval < p->__nice && !capable(CAP_SYS_NICE))
2680                         error = -EACCES;
2681                 else
2682 -                       p->nice = niceval;
2683 +                       set_user_nice(p, niceval);
2684         }
2685         read_unlock(&tasklist_lock);
2686
2687 @@ -249,7 +249,7 @@
2688                 long niceval;
2689                 if (!proc_sel(p, which, who))
2690                         continue;
2691 -               niceval = 20 - p->nice;
2692 +               niceval = 20 - p->__nice;
2693                 if (niceval > retval)
2694                         retval = niceval;
2695         }
2696 --- linux/kernel/signal.c.orig  Sun Jan  6 13:55:56 2002
2697 +++ linux/kernel/signal.c       Sun Jan  6 13:56:25 2002
2698 @@ -478,12 +478,9 @@
2699          * process of changing - but no harm is done by that
2700          * other than doing an extra (lightweight) IPI interrupt.
2701          */
2702 -       spin_lock(&runqueue_lock);
2703 -       if (task_has_cpu(t) && t->processor != smp_processor_id())
2704 -               smp_send_reschedule(t->processor);
2705 -       spin_unlock(&runqueue_lock);
2706 -#endif /* CONFIG_SMP */
2707 -
2708 +       if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
2709 +               kick_if_running(t);
2710 +#endif
2711         if (t->state & TASK_INTERRUPTIBLE) {
2712                 wake_up_process(t);
2713                 return;
2714 --- linux/kernel/printk.c.orig  Sun Jan  6 13:55:57 2002
2715 +++ linux/kernel/printk.c       Sun Jan  6 13:56:25 2002
2716 @@ -25,6 +25,7 @@
2717  #include <linux/module.h>
2718  #include <linux/interrupt.h>                   /* For in_interrupt() */
2719  #include <linux/config.h>
2720 +#include <linux/delay.h>
2721
2722  #include <asm/uaccess.h>
2723
2724 --- linux/kernel/ksyms.c.orig   Sun Jan  6 13:55:57 2002
2725 +++ linux/kernel/ksyms.c        Thu Jan 10 22:55:43 2002
2726 @@ -437,6 +437,9 @@
2727  EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2728  EXPORT_SYMBOL(schedule);
2729  EXPORT_SYMBOL(schedule_timeout);
2730 +EXPORT_SYMBOL(sys_sched_yield);
2731 +EXPORT_SYMBOL(set_user_nice);
2732 +EXPORT_SYMBOL(set_cpus_allowed);
2733  EXPORT_SYMBOL(jiffies);
2734  EXPORT_SYMBOL(xtime);
2735  EXPORT_SYMBOL(do_gettimeofday);
2736 @@ -448,6 +451,7 @@
2737
2738  EXPORT_SYMBOL(kstat);
2739  EXPORT_SYMBOL(nr_running);
2740 +EXPORT_SYMBOL(nr_context_switches);
2741
2742  /* misc */
2743  EXPORT_SYMBOL(panic);
2744 --- linux/mm/oom_kill.c.orig    Sun Jan  6 13:55:53 2002
2745 +++ linux/mm/oom_kill.c Sun Jan  6 13:56:25 2002
2746 @@ -82,7 +82,7 @@
2747          * Niced processes are most likely less important, so double
2748          * their badness points.
2749          */
2750 -       if (p->nice > 0)
2751 +       if (p->__nice > 0)
2752                 points *= 2;
2753
2754         /*
2755 @@ -149,7 +149,7 @@
2756          * all the memory it needs. That way it should be able to
2757          * exit() and clear out its resources quickly...
2758          */
2759 -       p->counter = 5 * HZ;
2760 +       p->time_slice = 2 * MAX_TIMESLICE;
2761         p->flags |= PF_MEMALLOC | PF_MEMDIE;
2762
2763         /* This process has hardware access, be more careful. */
2764 @@ -188,8 +188,7 @@
2765          * killing itself before someone else gets the chance to ask
2766          * for more memory.
2767          */
2768 -       current->policy |= SCHED_YIELD;
2769 -       schedule();
2770 +       yield();
2771         return;
2772  }
2773
2774 --- linux/mm/page_alloc.c.orig  Sun Jan  6 13:55:56 2002
2775 +++ linux/mm/page_alloc.c       Fri Jan 25 14:26:36 2002
2776 @@ -394,9 +394,8 @@
2777                 return NULL;
2778
2779         /* Yield for kswapd, and try again */
2780 -       current->policy |= SCHED_YIELD;
2781         __set_current_state(TASK_RUNNING);
2782 -       schedule();
2783 +       yield();
2784         goto rebalance;
2785  }
2786
2787 --- linux/mm/highmem.c.orig     Sun Jan  6 13:55:57 2002
2788 +++ linux/mm/highmem.c  Fri Jan 25 14:26:56 2002
2789 @@ -354,9 +354,8 @@
2790         /* we need to wait I/O completion */
2791         run_task_queue(&tq_disk);
2792
2793 -       current->policy |= SCHED_YIELD;
2794         __set_current_state(TASK_RUNNING);
2795 -       schedule();
2796 +       yield();
2797         goto repeat_alloc;
2798  }
2799
2800 @@ -392,9 +391,8 @@
2801         /* we need to wait I/O completion */
2802         run_task_queue(&tq_disk);
2803
2804 -       current->policy |= SCHED_YIELD;
2805         __set_current_state(TASK_RUNNING);
2806 -       schedule();
2807 +       yield();
2808         goto repeat_alloc;
2809  }
2810
2811 --- linux/include/linux/sched.h.orig    Sun Jan  6 13:55:57 2002
2812 +++ linux/include/linux/sched.h Mon Jan 28 18:48:01 2002
2813 @@ -6,6 +6,7 @@
2814  extern unsigned long event;
2815
2816  #include <linux/config.h>
2817 +#include <linux/compiler.h>
2818  #include <linux/binfmts.h>
2819  #include <linux/threads.h>
2820  #include <linux/kernel.h>
2821 @@ -42,6 +43,7 @@
2822  #define CLONE_VFORK    0x00004000      /* set if the parent wants the child to wake it up on mm_release */
2823  #define CLONE_PARENT   0x00008000      /* set if we want to have the same parent as the cloner */
2824  #define CLONE_THREAD   0x00010000      /* Same thread group? */
2825 +#define CLONE_NEWNS    0x00020000      /* New namespace group? */
2826
2827  #define CLONE_SIGNAL   (CLONE_SIGHAND | CLONE_THREAD)
2828
2829 @@ -72,8 +74,9 @@
2830  #define CT_TO_SECS(x)  ((x) / HZ)
2831  #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
2832
2833 -extern int nr_running, nr_threads;
2834 +extern int nr_threads;
2835  extern int last_pid;
2836 +extern unsigned long nr_running(void);
2837
2838  #include <linux/fs.h>
2839  #include <linux/time.h>
2840 @@ -116,12 +119,6 @@
2841  #define SCHED_FIFO             1
2842  #define SCHED_RR               2
2843
2844 -/*
2845 - * This is an additional bit set when we want to
2846 - * yield the CPU for one re-schedule..
2847 - */
2848 -#define SCHED_YIELD            0x10
2849 -
2850  struct sched_param {
2851         int sched_priority;
2852  };
2853 @@ -139,17 +136,22 @@
2854   * a separate lock).
2855   */
2856  extern rwlock_t tasklist_lock;
2857 -extern spinlock_t runqueue_lock;
2858  extern spinlock_t mmlist_lock;
2859
2860 +typedef struct task_struct task_t;
2861 +
2862  extern void sched_init(void);
2863 -extern void init_idle(void);
2864 +extern void init_idle(task_t *idle, int cpu);
2865  extern void show_state(void);
2866  extern void cpu_init (void);
2867  extern void trap_init(void);
2868  extern void update_process_times(int user);
2869 -extern void update_one_process(struct task_struct *p, unsigned long user,
2870 +extern void update_one_process(task_t *p, unsigned long user,
2871                                unsigned long system, int cpu);
2872 +extern void scheduler_tick(task_t *p);
2873 +extern void sched_task_migrated(task_t *p);
2874 +extern void smp_migrate_task(int cpu, task_t *task);
2875 +extern unsigned long cache_decay_ticks;
2876
2877  #define        MAX_SCHEDULE_TIMEOUT    LONG_MAX
2878  extern signed long FASTCALL(schedule_timeout(signed long timeout));
2879 @@ -166,6 +168,7 @@
2880   */
2881  #define NR_OPEN_DEFAULT BITS_PER_LONG
2882
2883 +struct namespace;
2884  /*
2885   * Open file table structure
2886   */
2887 @@ -278,6 +281,8 @@
2888  extern struct user_struct root_user;
2889  #define INIT_USER (&root_user)
2890
2891 +typedef struct prio_array prio_array_t;
2892 +
2893  struct task_struct {
2894         /*
2895          * offsets of these are hardcoded elsewhere - touch with care
2896 @@ -295,35 +300,28 @@
2897
2898         int lock_depth;         /* Lock depth */
2899
2900 -/*
2901 - * offset 32 begins here on 32-bit platforms. We keep
2902 - * all fields in a single cacheline that are needed for
2903 - * the goodness() loop in schedule().
2904 - */
2905 -       long counter;
2906 -       long nice;
2907 -       unsigned long policy;
2908 -       struct mm_struct *mm;
2909 -       int processor;
2910         /*
2911 -        * cpus_runnable is ~0 if the process is not running on any
2912 -        * CPU. It's (1 << cpu) if it's running on a CPU. This mask
2913 -        * is updated under the runqueue lock.
2914 -        *
2915 -        * To determine whether a process might run on a CPU, this
2916 -        * mask is AND-ed with cpus_allowed.
2917 +        * offset 32 begins here on 32-bit platforms.
2918          */
2919 -       unsigned long cpus_runnable, cpus_allowed;
2920 -       /*
2921 -        * (only the 'next' pointer fits into the cacheline, but
2922 -        * that's just fine.)
2923 -        */
2924 -       struct list_head run_list;
2925 -       unsigned long sleep_time;
2926 +       unsigned int cpu;
2927 +       int prio;
2928 +       long __nice;
2929 +       list_t run_list;
2930 +       prio_array_t *array;
2931 +
2932 +       unsigned int time_slice;
2933
2934 -       struct task_struct *next_task, *prev_task;
2935 -       struct mm_struct *active_mm;
2936 +       unsigned long sleep_avg;
2937 +       unsigned long sleep_timestamp;
2938 +
2939 +       unsigned long policy;
2940 +       unsigned long cpus_allowed;
2941 +
2942 +       task_t *next_task, *prev_task;
2943 +
2944 +       struct mm_struct *mm, *active_mm;
2945         struct list_head local_pages;
2946 +
2947         unsigned int allocation_order, nr_local_pages;
2948
2949  /* task state */
2950 @@ -345,12 +343,12 @@
2951          * older sibling, respectively.  (p->father can be replaced with
2952          * p->p_pptr->pid)
2953          */
2954 -       struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
2955 +       task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
2956         struct list_head thread_group;
2957
2958         /* PID hash table linkage. */
2959 -       struct task_struct *pidhash_next;
2960 -       struct task_struct **pidhash_pprev;
2961 +       task_t *pidhash_next;
2962 +       task_t **pidhash_pprev;
2963
2964         wait_queue_head_t wait_chldexit;        /* for wait4() */
2965         struct completion *vfork_done;          /* for vfork() */
2966 @@ -389,6 +387,8 @@
2967         struct fs_struct *fs;
2968  /* open file information */
2969         struct files_struct *files;
2970 +/* namespace */
2971 +       struct namespace *namespace;
2972  /* signal handlers */
2973         spinlock_t sigmask_lock;        /* Protects signal and blocked */
2974         struct signal_struct *sig;
2975 @@ -446,10 +446,66 @@
2976   */
2977  #define _STK_LIM       (8*1024*1024)
2978
2979 -#define DEF_COUNTER    (10*HZ/100)     /* 100 ms time slice */
2980 -#define MAX_COUNTER    (20*HZ/100)
2981 -#define DEF_NICE       (0)
2982 +/*
2983 + * RT priorities go from 0 to 99, but internally we max
2984 + * them out at 128 to make it easier to search the
2985 + * scheduler bitmap.
2986 + */
2987 +#define MAX_RT_PRIO            128
2988 +/*
2989 + * The lower the priority value of a process, the more likely
2990 + * it is to run. Priority values go from 0 to 167. The 0-99
2991 + * priority range is allocated to RT tasks, the 128-167 range
2992 + * is for SCHED_OTHER tasks.
2993 + */
2994 +#define MAX_PRIO               (MAX_RT_PRIO + 40)
2995 +
2996 +/*
2997 + * Scales user-nice values [ -20 ... 0 ... 19 ]
2998 + * to static priority [ 128 ... 167 (MAX_PRIO-1) ]
2999 + *
3000 + * User-nice value of -20 == static priority 128, and
3001 + * user-nice value 19 == static priority 167. The lower
3002 + * the priority value, the higher the task's priority.
3003 + */
3004 +#define NICE_TO_PRIO(n)                (MAX_RT_PRIO + (n) + 20)
3005 +#define DEF_USER_NICE          0
3006 +
3007 +/*
3008 + * Default (nice 0) timeslice is ~150 msecs, maximum is 300 msecs.
3009 + * Minimum timeslice is 10 msecs.
3010 + */
3011 +#define MIN_TIMESLICE          ( 10 * HZ / 1000)
3012 +#define MAX_TIMESLICE           (300 * HZ / 1000)
3013 +#define CHILD_FORK_PENALTY      95
3014 +#define PARENT_FORK_PENALTY     100
3015 +#define EXIT_WEIGHT             3
3016 +#define PRIO_INTERACTIVE_RATIO  20
3017 +#define PRIO_CPU_HOG_RATIO      60
3018 +#define PRIO_BONUS_RATIO        70
3019 +#define INTERACTIVE_DELTA       3
3020 +#define MAX_SLEEP_AVG           (2*HZ)
3021 +#define STARVATION_LIMIT        (2*HZ)
3022 +
3023 +#define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
3024 +#define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
3025 +
3026 +/*
3027 + * NICE_TO_TIMESLICE scales nice values [ -20 ... 19 ]
3028 + * to time slice values.
3029 + *
3030 + * The higher a process's priority, the bigger timeslices
3031 + * it gets during one round of execution. But even the lowest
3032 + * priority process gets MIN_TIMESLICE worth of execution time.
3033 + */
3034
3035 +#define NICE_TO_TIMESLICE(n) (MIN_TIMESLICE + \
3036 +       ((MAX_TIMESLICE - MIN_TIMESLICE) * (19-(n))) / 39)
3037 +
3038 +extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
3039 +extern void set_user_nice(task_t *p, long nice);
3040 +asmlinkage long sys_sched_yield(void);
3041 +#define yield() sys_sched_yield()
3042
3043  /*
3044   * The default (Linux) execution domain.
3045 @@ -468,14 +524,13 @@
3046      addr_limit:                KERNEL_DS,                                      \
3047      exec_domain:       &default_exec_domain,                           \
3048      lock_depth:                -1,                                             \
3049 -    counter:           DEF_COUNTER,                                    \
3050 -    nice:              DEF_NICE,                                       \
3051 +    __nice:            DEF_USER_NICE,                                  \
3052      policy:            SCHED_OTHER,                                    \
3053 +    cpus_allowed:      -1,                                             \
3054      mm:                        NULL,                                           \
3055      active_mm:         &init_mm,                                       \
3056 -    cpus_runnable:     -1,                                             \
3057 -    cpus_allowed:      -1,                                             \
3058      run_list:          LIST_HEAD_INIT(tsk.run_list),                   \
3059 +    time_slice:                HZ,                                             \
3060      next_task:         &tsk,                                           \
3061      prev_task:         &tsk,                                           \
3062      p_opptr:           &tsk,                                           \
3063 @@ -509,24 +564,24 @@
3064  #endif
3065
3066  union task_union {
3067 -       struct task_struct task;
3068 +       task_t task;
3069         unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
3070  };
3071
3072  extern union task_union init_task_union;
3073
3074  extern struct   mm_struct init_mm;
3075 -extern struct task_struct *init_tasks[NR_CPUS];
3076 +extern task_t *init_tasks[NR_CPUS];
3077
3078  /* PID hashing. (shouldnt this be dynamic?) */
3079  #define PIDHASH_SZ (4096 >> 2)
3080 -extern struct task_struct *pidhash[PIDHASH_SZ];
3081 +extern task_t *pidhash[PIDHASH_SZ];
3082
3083  #define pid_hashfn(x)  ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
3084
3085 -static inline void hash_pid(struct task_struct *p)
3086 +static inline void hash_pid(task_t *p)
3087  {
3088 -       struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
3089 +       task_t **htable = &pidhash[pid_hashfn(p->pid)];
3090
3091         if((p->pidhash_next = *htable) != NULL)
3092                 (*htable)->pidhash_pprev = &p->pidhash_next;
3093 @@ -534,16 +589,16 @@
3094         p->pidhash_pprev = htable;
3095  }
3096
3097 -static inline void unhash_pid(struct task_struct *p)
3098 +static inline void unhash_pid(task_t *p)
3099  {
3100         if(p->pidhash_next)
3101                 p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
3102         *p->pidhash_pprev = p->pidhash_next;
3103  }
3104
3105 -static inline struct task_struct *find_task_by_pid(int pid)
3106 +static inline task_t *find_task_by_pid(int pid)
3107  {
3108 -       struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
3109 +       task_t *p, **htable = &pidhash[pid_hashfn(pid)];
3110
3111         for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
3112                 ;
3113 @@ -551,19 +606,6 @@
3114         return p;
3115  }
3116
3117 -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
3118 -
3119 -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
3120 -{
3121 -       tsk->processor = cpu;
3122 -       tsk->cpus_runnable = 1UL << cpu;
3123 -}
3124 -
3125 -static inline void task_release_cpu(struct task_struct *tsk)
3126 -{
3127 -       tsk->cpus_runnable = ~0UL;
3128 -}
3129 -
3130  /* per-UID process charging. */
3131  extern struct user_struct * alloc_uid(uid_t);
3132  extern void free_uid(struct user_struct *);
3133 @@ -590,7 +632,8 @@
3134  extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
3135  extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
3136                                                     signed long timeout));
3137 -extern int FASTCALL(wake_up_process(struct task_struct * tsk));
3138 +extern int FASTCALL(wake_up_process(task_t * tsk));
3139 +extern void FASTCALL(wake_up_forked_process(task_t * tsk));
3140
3141  #define wake_up(x)                     __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
3142  #define wake_up_nr(x, nr)              __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
3143 @@ -608,28 +651,28 @@
3144  extern int in_egroup_p(gid_t);
3145
3146  extern void proc_caches_init(void);
3147 -extern void flush_signals(struct task_struct *);
3148 -extern void flush_signal_handlers(struct task_struct *);
3149 +extern void flush_signals(task_t *);
3150 +extern void flush_signal_handlers(task_t *);
3151  extern int dequeue_signal(sigset_t *, siginfo_t *);
3152  extern void block_all_signals(int (*notifier)(void *priv), void *priv,
3153                               sigset_t *mask);
3154  extern void unblock_all_signals(void);
3155 -extern int send_sig_info(int, struct siginfo *, struct task_struct *);
3156 -extern int force_sig_info(int, struct siginfo *, struct task_struct *);
3157 +extern int send_sig_info(int, struct siginfo *, task_t *);
3158 +extern int force_sig_info(int, struct siginfo *, task_t *);
3159  extern int kill_pg_info(int, struct siginfo *, pid_t);
3160  extern int kill_sl_info(int, struct siginfo *, pid_t);
3161  extern int kill_proc_info(int, struct siginfo *, pid_t);
3162 -extern void notify_parent(struct task_struct *, int);
3163 -extern void do_notify_parent(struct task_struct *, int);
3164 -extern void force_sig(int, struct task_struct *);
3165 -extern int send_sig(int, struct task_struct *, int);
3166 +extern void notify_parent(task_t *, int);
3167 +extern void do_notify_parent(task_t *, int);
3168 +extern void force_sig(int, task_t *);
3169 +extern int send_sig(int, task_t *, int);
3170  extern int kill_pg(pid_t, int, int);
3171  extern int kill_sl(pid_t, int, int);
3172  extern int kill_proc(pid_t, int, int);
3173  extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
3174  extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
3175
3176 -static inline int signal_pending(struct task_struct *p)
3177 +static inline int signal_pending(task_t *p)
3178  {
3179         return (p->sigpending != 0);
3180  }
3181 @@ -668,7 +711,7 @@
3182     This is required every time the blocked sigset_t changes.
3183     All callers should have t->sigmask_lock.  */
3184
3185 -static inline void recalc_sigpending(struct task_struct *t)
3186 +static inline void recalc_sigpending(task_t *t)
3187  {
3188         t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
3189  }
3190 @@ -775,16 +818,17 @@
3191  extern int expand_fdset(struct files_struct *, int nr);
3192  extern void free_fdset(fd_set *, int);
3193
3194 -extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
3195 +extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *);
3196  extern void flush_thread(void);
3197  extern void exit_thread(void);
3198
3199 -extern void exit_mm(struct task_struct *);
3200 -extern void exit_files(struct task_struct *);
3201 -extern void exit_sighand(struct task_struct *);
3202 +extern void exit_mm(task_t *);
3203 +extern void exit_files(task_t *);
3204 +extern void exit_sighand(task_t *);
3205
3206  extern void reparent_to_init(void);
3207  extern void daemonize(void);
3208 +extern task_t *child_reaper;
3209
3210  extern int do_execve(char *, char **, char **, struct pt_regs *);
3211  extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
3212 @@ -793,6 +837,9 @@
3213  extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
3214  extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
3215
3216 +extern void wait_task_inactive(task_t * p);
3217 +extern void kick_if_running(task_t * p);
3218 +
3219  #define __wait_event(wq, condition)                                    \
3220  do {                                                                   \
3221         wait_queue_t __wait;                                            \
3222 @@ -871,24 +918,10 @@
3223         for (p = &init_task ; (p = p->next_task) != &init_task ; )
3224
3225  #define next_thread(p) \
3226 -       list_entry((p)->thread_group.next, struct task_struct, thread_group)
3227 -
3228 -static inline void del_from_runqueue(struct task_struct * p)
3229 -{
3230 -       nr_running--;
3231 -       p->sleep_time = jiffies;
3232 -       list_del(&p->run_list);
3233 -       p->run_list.next = NULL;
3234 -}
3235 -
3236 -static inline int task_on_runqueue(struct task_struct *p)
3237 -{
3238 -       return (p->run_list.next != NULL);
3239 -}
3240 +       list_entry((p)->thread_group.next, task_t, thread_group)
3241
3242 -static inline void unhash_process(struct task_struct *p)
3243 +static inline void unhash_process(task_t *p)
3244  {
3245 -       if (task_on_runqueue(p)) BUG();
3246         write_lock_irq(&tasklist_lock);
3247         nr_threads--;
3248         unhash_pid(p);
3249 @@ -898,12 +931,12 @@
3250  }
3251
3252  /* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
3253 -static inline void task_lock(struct task_struct *p)
3254 +static inline void task_lock(task_t *p)
3255  {
3256         spin_lock(&p->alloc_lock);
3257  }
3258
3259 -static inline void task_unlock(struct task_struct *p)
3260 +static inline void task_unlock(task_t *p)
3261  {
3262         spin_unlock(&p->alloc_lock);
3263  }
3264 --- linux/include/linux/list.h.orig     Sun Jan  6 13:55:57 2002
3265 +++ linux/include/linux/list.h  Mon Jan 28 18:48:00 2002
3266 @@ -19,6 +19,8 @@
3267         struct list_head *next, *prev;
3268  };
3269
3270 +typedef struct list_head list_t;
3271 +
3272  #define LIST_HEAD_INIT(name) { &(name), &(name) }
3273
3274  #define LIST_HEAD(name) \
3275 --- linux/include/linux/kernel_stat.h.orig      Tue Aug 21 14:26:23 2001
3276 +++ linux/include/linux/kernel_stat.h   Mon Jan 28 18:48:00 2002
3277 @@ -32,10 +32,11 @@
3278         unsigned int ipackets, opackets;
3279         unsigned int ierrors, oerrors;
3280         unsigned int collisions;
3281 -       unsigned int context_swtch;
3282  };
3283
3284  extern struct kernel_stat kstat;
3285 +
3286 +extern unsigned long nr_context_switches(void);
3287
3288  #if !defined(CONFIG_ARCH_S390)
3289  /*
3290 --- linux/include/linux/smp.h.orig      Sun Dec 31 20:10:17 2000
3291 +++ linux/include/linux/smp.h   Mon Jan 28 18:48:00 2002
3292 @@ -86,6 +86,14 @@
3293  #define cpu_number_map(cpu)                    0
3294  #define smp_call_function(func,info,retry,wait)        ({ 0; })
3295  #define cpu_online_map                         1
3296 +static inline void smp_send_reschedule(int cpu) { }
3297 +static inline void smp_send_reschedule_all(void) { }
3298
3299  #endif
3300 +
3301 +/*
3302 + * Common definitions:
3303 + */
3304 +#define cpu()                                  smp_processor_id()
3305 +
3306  #endif
3307 --- linux/include/asm-i386/smp.h.orig   Sun Jan  6 13:55:57 2002
3308 +++ linux/include/asm-i386/smp.h        Mon Jan 28 18:48:00 2002
3309 @@ -63,6 +63,7 @@
3310  extern void smp_flush_tlb(void);
3311  extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
3312  extern void smp_send_reschedule(int cpu);
3313 +extern void smp_send_reschedule_all(void);
3314  extern void smp_invalidate_rcv(void);          /* Process an NMI */
3315  extern void (*mtrr_hook) (void);
3316  extern void zap_low_mappings (void);
3317 @@ -104,7 +105,7 @@
3318   * so this is correct in the x86 case.
3319   */
3320
3321 -#define smp_processor_id() (current->processor)
3322 +#define smp_processor_id() (current->cpu)
3323
3324  static __inline int hard_smp_processor_id(void)
3325  {
3326 @@ -121,18 +122,6 @@
3327  #endif /* !__ASSEMBLY__ */
3328
3329  #define NO_PROC_ID             0xFF            /* No processor magic marker */
3330 -
3331 -/*
3332 - *     This magic constant controls our willingness to transfer
3333 - *     a process across CPUs. Such a transfer incurs misses on the L1
3334 - *     cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
3335 - *     gut feeling is this will vary by board in value. For a board
3336 - *     with separate L2 cache it probably depends also on the RSS, and
3337 - *     for a board with shared L2 cache it ought to decay fast as other
3338 - *     processes are run.
3339 - */
3340 -
3341 -#define PROC_CHANGE_PENALTY    15              /* Schedule penalty */
3342
3343  #endif
3344  #endif
3345 --- linux/include/asm-i386/bitops.h.orig        Tue Aug 21 14:26:16 2001
3346 +++ linux/include/asm-i386/bitops.h     Mon Jan 28 18:48:00 2002
3347 @@ -75,6 +75,14 @@
3348                 :"=m" (ADDR)
3349                 :"Ir" (nr));
3350  }
3351 +
3352 +static __inline__ void __clear_bit(int nr, volatile void * addr)
3353 +{
3354 +       __asm__ __volatile__(
3355 +               "btrl %1,%0"
3356 +               :"=m" (ADDR)
3357 +               :"Ir" (nr));
3358 +}
3359  #define smp_mb__before_clear_bit()     barrier()
3360  #define smp_mb__after_clear_bit()      barrier()
3361
3362 @@ -284,6 +292,34 @@
3363  }
3364
3365  /**
3366 + * find_first_bit - find the first set bit in a memory region
3367 + * @addr: The address to start the search at
3368 + * @size: The maximum size to search
3369 + *
3370 + * Returns the bit-number of the first set bit, not the number of the byte
3371 + * containing a bit.
3372 + */
3373 +static __inline__ int find_first_bit(void * addr, unsigned size)
3374 +{
3375 +       int d0, d1;
3376 +       int res;
3377 +
3378 +       /* This looks at memory. Mark it volatile to tell gcc not to move it around */
3379 +       __asm__ __volatile__(
3380 +               "xorl %%eax,%%eax\n\t"
3381 +               "repe; scasl\n\t"
3382 +               "jz 1f\n\t"
3383 +               "leal -4(%%edi),%%edi\n\t"
3384 +               "bsfl (%%edi),%%eax\n"
3385 +               "1:\tsubl %%ebx,%%edi\n\t"
3386 +               "shll $3,%%edi\n\t"
3387 +               "addl %%edi,%%eax"
3388 +               :"=a" (res), "=&c" (d0), "=&D" (d1)
3389 +               :"1" ((size + 31) >> 5), "2" (addr), "b" (addr));
3390 +       return res;
3391 +}
3392 +
3393 +/**
3394   * find_next_zero_bit - find the first zero bit in a memory region
3395   * @addr: The address to base the search on
3396   * @offset: The bitnumber to start searching at
3397 @@ -296,7 +332,7 @@
3398
3399         if (bit) {
3400                 /*
3401 -                * Look for zero in first byte
3402 +                * Look for zero in the first 32 bits.
3403                  */
3404                 __asm__("bsfl %1,%0\n\t"
3405                         "jne 1f\n\t"
3406 @@ -317,6 +353,39 @@
3407  }
3408
3409  /**
3410 + * find_next_bit - find the next set bit in a memory region
3411 + * @addr: The address to base the search on
3412 + * @offset: The bitnumber to start searching at
3413 + * @size: The maximum size to search
3414 + */
3415 +static __inline__ int find_next_bit (void * addr, int size, int offset)
3416 +{
3417 +       unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
3418 +       int set = 0, bit = offset & 31, res;
3419 +
3420 +       if (bit) {
3421 +               /*
3422 +                * Look for nonzero in the first 32 bits:
3423 +                */
3424 +               __asm__("bsfl %1,%0\n\t"
3425 +                       "jne 1f\n\t"
3426 +                       "movl $32, %0\n"
3427 +                       "1:"
3428 +                       : "=r" (set)
3429 +                       : "r" (*p >> bit));
3430 +               if (set < (32 - bit))
3431 +                       return set + offset;
3432 +               set = 32 - bit;
3433 +               p++;
3434 +       }
3435 +       /*
3436 +        * No set bit yet, search remaining full words for a bit
3437 +        */
3438 +       res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
3439 +       return (offset + set + res);
3440 +}
3441 +
3442 +/**
3443   * ffz - find first zero in word.
3444   * @word: The word to search
3445   *
3446 @@ -327,6 +396,20 @@
3447         __asm__("bsfl %1,%0"
3448                 :"=r" (word)
3449                 :"r" (~word));
3450 +       return word;
3451 +}
3452 +
3453 +/**
3454 + * __ffs - find first set bit in word.
3455 + * @word: The word to search
3456 + *
3457 + * Undefined if no bit is set, so code should check against 0 first.
3458 + */
3459 +static __inline__ unsigned long __ffs(unsigned long word)
3460 +{
3461 +       __asm__("bsfl %1,%0"
3462 +               :"=r" (word)
3463 +               :"rm" (word));
3464         return word;
3465  }
3466
3467 --- linux/include/asm-i386/pgalloc.h.orig       Sun Jan  6 13:55:57 2002
3468 +++ linux/include/asm-i386/pgalloc.h    Mon Jan 28 18:48:00 2002
3469 @@ -224,6 +224,7 @@
3470  {
3471         struct mm_struct *active_mm;
3472         int state;
3473 +       char __cacheline_padding[24];
3474  };
3475  extern struct tlb_state cpu_tlbstate[NR_CPUS];
3476
3477 --- linux/include/asm-i386/mmu_context.h.orig   Tue Aug 21 14:26:23 2001
3478 +++ linux/include/asm-i386/mmu_context.h        Mon Jan 28 18:48:00 2002
3479 @@ -7,6 +7,31 @@
3480  #include <asm/pgalloc.h>
3481
3482  /*
3483 + * Every architecture must define this function. It's the fastest
3484 + * way of searching a 168-bit bitmap where the first 128 bits are
3485 + * unlikely to be set. It's guaranteed that at least one of the 168
3486 + * bits is set.
3487 + */
3488 +#if MAX_RT_PRIO != 128 || MAX_PRIO != 168
3489 +# error update this function.
3490 +#endif
3491 +
3492 +static inline int sched_find_first_bit(unsigned long *b)
3493 +{
3494 +       if (unlikely(b[0]))
3495 +               return __ffs(b[0]);
3496 +       if (unlikely(b[1]))
3497 +               return __ffs(b[1]) + 32;
3498 +       if (unlikely(b[2]))
3499 +               return __ffs(b[2]) + 64;
3500 +       if (unlikely(b[3]))
3501 +               return __ffs(b[3]) + 96;
3502 +       if (b[4])
3503 +               return __ffs(b[4]) + 128;
3504 +       return __ffs(b[5]) + 32 + 128;
3505 +}
3506 +
3507 +/*
3508   * possibly do the LDT unload here?
3509   */
3510  #define destroy_context(mm)            do { } while(0)
3511 @@ -27,13 +52,13 @@
3512
3513  static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
3514  {
3515 -       if (prev != next) {
3516 +       if (likely(prev != next)) {
3517                 /* stop flush ipis for the previous mm */
3518                 clear_bit(cpu, &prev->cpu_vm_mask);
3519                 /*
3520                  * Re-load LDT if necessary
3521                  */
3522 -               if (prev->context.segments != next->context.segments)
3523 +               if (unlikely(prev->context.segments != next->context.segments))
3524                         load_LDT(next);
3525  #ifdef CONFIG_SMP
3526                 cpu_tlbstate[cpu].state = TLBSTATE_OK;
3527 --- linux/include/asm-i386/hw_irq.h.orig        Wed Jan 16 21:44:01 2002
3528 +++ linux/include/asm-i386/hw_irq.h     Mon Jan 28 18:48:00 2002
3529 @@ -41,8 +41,9 @@
3530  #define ERROR_APIC_VECTOR      0xfe
3531  #define INVALIDATE_TLB_VECTOR  0xfd
3532  #define RESCHEDULE_VECTOR      0xfc
3533 -#define CALL_FUNCTION_VECTOR   0xfb
3534 -#define KDB_VECTOR             0xfa
3535 +#define TASK_MIGRATION_VECTOR  0xfb
3536 +#define CALL_FUNCTION_VECTOR   0xfa
3537 +#define KDB_VECTOR             0xf9
3538
3539  /*
3540   * Local APIC timer IRQ vector is on a different priority level,
3541 --- linux/include/asm-i386/apic.h.orig  Mon Jan 28 18:05:10 2002
3542 +++ linux/include/asm-i386/apic.h       Mon Jan 28 18:48:00 2002
3543 @@ -79,6 +79,8 @@
3544  extern void setup_apic_nmi_watchdog (void);
3545  extern inline void nmi_watchdog_tick (struct pt_regs * regs);
3546  extern int APIC_init_uniprocessor (void);
3547 +extern void disable_APIC_timer(void);
3548 +extern void enable_APIC_timer(void);
3549
3550  extern struct pm_dev *apic_pm_register(pm_dev_t, unsigned long, pm_callback);
3551  extern void apic_pm_unregister(struct pm_dev*);
3552 --- linux/net/unix/af_unix.c.orig       Sun Jan  6 13:55:58 2002
3553 +++ linux/net/unix/af_unix.c    Sun Jan  6 13:56:25 2002
3554 @@ -564,10 +564,8 @@
3555                                       addr->hash)) {
3556                 write_unlock(&unix_table_lock);
3557                 /* Sanity yield. It is unusual case, but yet... */
3558 -               if (!(ordernum&0xFF)) {
3559 -                       current->policy |= SCHED_YIELD;
3560 -                       schedule();
3561 -               }
3562 +               if (!(ordernum&0xFF))
3563 +                       yield();
3564                 goto retry;
3565         }
3566         addr->hash ^= sk->type;
3567 --- linux/net/ipv4/tcp_output.c.orig    Sun Jan  6 13:55:57 2002
3568 +++ linux/net/ipv4/tcp_output.c Sun Jan  6 13:56:25 2002
3569 @@ -1009,8 +1009,7 @@
3570                         skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
3571                         if (skb)
3572                                 break;
3573 -                       current->policy |= SCHED_YIELD;
3574 -                       schedule();
3575 +                       yield();
3576                 }
3577
3578                 /* Reserve space for headers and prepare control bits. */
3579 --- linux/net/sunrpc/sched.c.orig       Sun Jan  6 13:55:52 2002
3580 +++ linux/net/sunrpc/sched.c    Sun Jan  6 13:56:25 2002
3581 @@ -772,8 +772,7 @@
3582                 }
3583                 if (flags & RPC_TASK_ASYNC)
3584                         return NULL;
3585 -               current->policy |= SCHED_YIELD;
3586 -               schedule();
3587 +               yield();
3588         } while (!signalled());
3589
3590         return NULL;
3591 @@ -1114,8 +1113,7 @@
3592                 __rpc_schedule();
3593                 if (all_tasks) {
3594                         dprintk("rpciod_killall: waiting for tasks to exit\n");
3595 -                       current->policy |= SCHED_YIELD;
3596 -                       schedule();
3597 +                       yield();
3598                 }
3599         }
3600
3601 @@ -1185,8 +1183,7 @@
3602          * wait briefly before checking the process id.
3603          */
3604         current->sigpending = 0;
3605 -       current->policy |= SCHED_YIELD;
3606 -       schedule();
3607 +       yield();
3608         /*
3609          * Display a message if we're going to wait longer.
3610          */
3611 --- linux/net/sched/sch_generic.c.orig  Fri Aug 18 19:26:25 2000
3612 +++ linux/net/sched/sch_generic.c       Sun Jan  6 13:56:25 2002
3613 @@ -475,10 +475,8 @@
3614
3615         dev_watchdog_down(dev);
3616
3617 -       while (test_bit(__LINK_STATE_SCHED, &dev->state)) {
3618 -               current->policy |= SCHED_YIELD;
3619 -               schedule();
3620 -       }
3621 +       while (test_bit(__LINK_STATE_SCHED, &dev->state))
3622 +               yield();
3623
3624         spin_unlock_wait(&dev->xmit_lock);
3625  }
3626 --- linux/net/socket.c.orig     Sun Jan  6 13:55:58 2002
3627 +++ linux/net/socket.c  Sun Jan  6 13:56:25 2002
3628 @@ -148,8 +148,7 @@
3629         while (atomic_read(&net_family_lockct) != 0) {
3630                 spin_unlock(&net_family_lock);
3631
3632 -               current->policy |= SCHED_YIELD;
3633 -               schedule();
3634 +               yield();
3635
3636                 spin_lock(&net_family_lock);
3637         }
3638 --- linux/drivers/net/slip.c.orig       Sun Jan  6 13:55:48 2002
3639 +++ linux/drivers/net/slip.c    Sun Jan  6 13:56:25 2002
3640 @@ -1393,10 +1393,8 @@
3641                 /* First of all: check for active disciplines and hangup them.
3642                  */
3643                 do {
3644 -                       if (busy) {
3645 -                               current->counter = 0;
3646 -                               schedule();
3647 -                       }
3648 +                       if (busy)
3649 +                               yield();
3650
3651                         busy = 0;
3652                         local_bh_disable();
3653 --- linux/drivers/block/loop.c.orig     Sun Jan  6 13:55:56 2002
3654 +++ linux/drivers/block/loop.c  Sun Jan  6 13:56:25 2002
3655 @@ -570,9 +570,6 @@
3656         flush_signals(current);
3657         spin_unlock_irq(&current->sigmask_lock);
3658
3659 -       current->policy = SCHED_OTHER;
3660 -       current->nice = -20;
3661 -
3662         spin_lock_irq(&lo->lo_lock);
3663         lo->lo_state = Lo_bound;
3664         atomic_inc(&lo->lo_pending);
3665 --- linux/drivers/char/mwave/mwavedd.c.orig     Sun Jan 13 16:27:41 2002
3666 +++ linux/drivers/char/mwave/mwavedd.c  Sun Jan 13 16:28:05 2002
3667 @@ -279,7 +279,6 @@
3668                         pDrvData->IPCs[ipcnum].bIsHere = FALSE;
3669                         pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
3670         #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
3671 -                       current->nice = -20;    /* boost to provide priority timing */
3672         #else
3673                         current->priority = 0x28;       /* boost to provide priority timing */
3674         #endif
3675 --- linux/drivers/ide/ataraid.c.orig    Sun Jan  6 13:55:52 2002
3676 +++ linux/drivers/ide/ataraid.c Fri Jan 25 14:27:38 2002
3677 @@ -123,8 +123,7 @@
3678                 ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO);
3679                 if (!ptr) {
3680                         __set_current_state(TASK_RUNNING);
3681 -                       current->policy |= SCHED_YIELD;
3682 -                       schedule();
3683 +                       yield();
3684                 }
3685         }
3686         return ptr;
3687 @@ -139,8 +138,7 @@
3688                 ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO);
3689                 if (!ptr) {
3690                         __set_current_state(TASK_RUNNING);
3691 -                       current->policy |= SCHED_YIELD;
3692 -                       schedule();
3693 +                       yield();
3694                 }
3695         }
3696         return ptr;
3697 --- linux/drivers/md/md.c.orig  Sun Jan  6 13:55:56 2002
3698 +++ linux/drivers/md/md.c       Sun Jan  6 13:56:25 2002
3699 @@ -2930,8 +2930,6 @@
3700          * bdflush, otherwise bdflush will deadlock if there are too
3701          * many dirty RAID5 blocks.
3702          */
3703 -       current->policy = SCHED_OTHER;
3704 -       current->nice = -20;
3705         md_unlock_kernel();
3706
3707         complete(thread->event);
3708 @@ -3381,11 +3379,6 @@
3709                "(but not more than %d KB/sec) for reconstruction.\n",
3710                sysctl_speed_limit_max);
3711
3712 -       /*
3713 -        * Resync has low priority.
3714 -        */
3715 -       current->nice = 19;
3716 -
3717         is_mddev_idle(mddev); /* this also initializes IO event counters */
3718         for (m = 0; m < SYNC_MARKS; m++) {
3719                 mark[m] = jiffies;
3720 @@ -3463,16 +3456,13 @@
3721                 currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
3722
3723                 if (currspeed > sysctl_speed_limit_min) {
3724 -                       current->nice = 19;
3725 -
3726                         if ((currspeed > sysctl_speed_limit_max) ||
3727                                         !is_mddev_idle(mddev)) {
3728                                 current->state = TASK_INTERRUPTIBLE;
3729                                 md_schedule_timeout(HZ/4);
3730                                 goto repeat;
3731                         }
3732 -               } else
3733 -                       current->nice = -20;
3734 +               }
3735         }
3736         printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3737         err = 0;
3738 --- linux/arch/i386/mm/fault.c.orig     Sun Jan  6 13:55:47 2002
3739 +++ linux/arch/i386/mm/fault.c  Sun Jan  6 13:56:25 2002
3740 @@ -88,8 +88,7 @@
3741
3742  out_of_memory:
3743         if (current->pid == 1) {
3744 -               current->policy |= SCHED_YIELD;
3745 -               schedule();
3746 +               yield();
3747                 goto survive;
3748         }
3749         goto bad_area;
3750 @@ -344,8 +343,7 @@
3751  out_of_memory:
3752         up_read(&mm->mmap_sem);
3753         if (tsk->pid == 1) {
3754 -               tsk->policy |= SCHED_YIELD;
3755 -               schedule();
3756 +               yield();
3757                 down_read(&mm->mmap_sem);
3758                 goto survive;
3759         }
3760 --- linux/arch/i386/kernel/smpboot.c.orig       Sun Jan  6 13:55:56 2002
3761 +++ linux/arch/i386/kernel/smpboot.c    Mon Jan 28 18:12:21 2002
3762 @@ -308,14 +308,14 @@
3763                         if (tsc_values[i] < avg)
3764                                 realdelta = -realdelta;
3765
3766 -                       printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
3767 -                               i, realdelta);
3768 +                       printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
3769                 }
3770
3771                 sum += delta;
3772         }
3773         if (!buggy)
3774                 printk("passed.\n");
3775 +               ;
3776  }
3777
3778  static void __init synchronize_tsc_ap (void)
3779 @@ -365,7 +365,7 @@
3780          * (This works even if the APIC is not enabled.)
3781          */
3782         phys_id = GET_APIC_ID(apic_read(APIC_ID));
3783 -       cpuid = current->processor;
3784 +       cpuid = cpu();
3785         if (test_and_set_bit(cpuid, &cpu_online_map)) {
3786                 printk("huh, phys CPU#%d, CPU#%d already present??\n",
3787                                         phys_id, cpuid);
3788 @@ -435,6 +435,7 @@
3789          */
3790         smp_store_cpu_info(cpuid);
3791
3792 +       disable_APIC_timer();
3793         /*
3794          * Allow the master to continue.
3795          */
3796 @@ -465,6 +466,7 @@
3797         smp_callin();
3798         while (!atomic_read(&smp_commenced))
3799                 rep_nop();
3800 +       enable_APIC_timer();
3801         /*
3802          * low-memory mappings have been cleared, flush them from
3803          * the local TLBs too.
3804 @@ -803,16 +805,13 @@
3805         if (!idle)
3806                 panic("No idle process for CPU %d", cpu);
3807
3808 -       idle->processor = cpu;
3809 -       idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
3810 +       init_idle(idle, cpu);
3811
3812         map_cpu_to_boot_apicid(cpu, apicid);
3813
3814         idle->thread.eip = (unsigned long) start_secondary;
3815
3816 -       del_from_runqueue(idle);
3817         unhash_process(idle);
3818 -       init_tasks[cpu] = idle;
3819
3820         /* start_eip had better be page-aligned! */
3821         start_eip = setup_trampoline();
3822 @@ -925,6 +924,7 @@
3823  }
3824
3825  cycles_t cacheflush_time;
3826 +unsigned long cache_decay_ticks;
3827
3828  static void smp_tune_scheduling (void)
3829  {
3830 @@ -958,9 +958,13 @@
3831                 cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
3832         }
3833
3834 +       cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
3835 +
3836         printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
3837                 (long)cacheflush_time/(cpu_khz/1000),
3838                 ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
3839 +       printk("task migration cache decay timeout: %ld msecs.\n",
3840 +               (cache_decay_ticks + 1) * 1000 / HZ);
3841  }
3842
3843  /*
3844 @@ -1020,8 +1024,7 @@
3845         map_cpu_to_boot_apicid(0, boot_cpu_apicid);
3846
3847         global_irq_holder = 0;
3848 -       current->processor = 0;
3849 -       init_idle();
3850 +       current->cpu = 0;
3851         smp_tune_scheduling();
3852
3853         /*
3854 --- linux/arch/i386/kernel/process.c.orig       Mon Jan 28 18:09:58 2002
3855 +++ linux/arch/i386/kernel/process.c    Mon Jan 28 18:09:53 2002
3856 @@ -123,15 +123,12 @@
3857  void cpu_idle (void)
3858  {
3859         /* endless idle loop with no priority at all */
3860 -       init_idle();
3861 -       current->nice = 20;
3862 -       current->counter = -100;
3863
3864         while (1) {
3865                 void (*idle)(void) = pm_idle;
3866                 if (!idle)
3867                         idle = default_idle;
3868 -               while (!current->need_resched)
3869 +               if (!current->need_resched)
3870                         idle();
3871                 schedule();
3872                 check_pgt_cache();
3873 @@ -694,15 +691,17 @@
3874         asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
3875
3876         /*
3877 -        * Restore %fs and %gs.
3878 +        * Restore %fs and %gs if needed.
3879          */
3880 -       loadsegment(fs, next->fs);
3881 -       loadsegment(gs, next->gs);
3882 +       if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
3883 +               loadsegment(fs, next->fs);
3884 +               loadsegment(gs, next->gs);
3885 +       }
3886
3887         /*
3888          * Now maybe reload the debug registers
3889          */
3890 -       if (next->debugreg[7]){
3891 +       if (unlikely(next->debugreg[7])) {
3892                 loaddebug(next, 0);
3893                 loaddebug(next, 1);
3894                 loaddebug(next, 2);
3895 @@ -712,7 +711,7 @@
3896                 loaddebug(next, 7);
3897         }
3898
3899 -       if (prev->ioperm || next->ioperm) {
3900 +       if (unlikely(prev->ioperm || next->ioperm)) {
3901                 if (next->ioperm) {
3902                         /*
3903                          * 4 cachelines copy ... not good, but not that
3904 --- linux/arch/i386/kernel/apic.c.orig  Sun Jan  6 13:55:54 2002
3905 +++ linux/arch/i386/kernel/apic.c       Mon Jan 28 18:07:11 2002
3906 @@ -785,8 +785,7 @@
3907          */
3908
3909         slice = clocks / (smp_num_cpus+1);
3910 -       printk("cpu: %d, clocks: %d, slice: %d\n",
3911 -               smp_processor_id(), clocks, slice);
3912 +       printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice);
3913
3914         /*
3915          * Wait for IRQ0's slice:
3916 @@ -809,8 +808,7 @@
3917
3918         __setup_APIC_LVTT(clocks);
3919
3920 -       printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n",
3921 -                       smp_processor_id(), t0, t1, delta, slice, clocks);
3922 +       printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", smp_processor_id(), t0, t1, delta, slice, clocks);
3923
3924         __restore_flags(flags);
3925  }
3926 @@ -911,6 +909,26 @@
3927
3928         /* and update all other cpus */
3929         smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1);
3930 +}
3931 +
3932 +void __init disable_APIC_timer(void)
3933 +{
3934 +       if (using_apic_timer) {
3935 +               unsigned long v;
3936 +
3937 +               v = apic_read(APIC_LVTT);
3938 +               apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
3939 +       }
3940 +}
3941 +
3942 +void enable_APIC_timer(void)
3943 +{
3944 +       if (using_apic_timer) {
3945 +               unsigned long v;
3946 +
3947 +               v = apic_read(APIC_LVTT);
3948 +               apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
3949 +       }
3950  }
3951
3952  /*
3953 --- linux/arch/i386/kernel/nmi.c.orig   Sun Jan  6 13:55:43 2002
3954 +++ linux/arch/i386/kernel/nmi.c        Sun Jan  6 13:56:25 2002
3955 @@ -283,7 +283,7 @@
3956                          * to get a message out.
3957                          */
3958                         bust_spinlocks(1);
3959 -                       printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
3960 +                       printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
3961                         show_registers(regs);
3962                         printk("console shuts up ...\n");
3963                         console_silent();
3964 --- linux/arch/i386/kernel/smp.c.orig   Sun Jan  6 13:55:56 2002
3965 +++ linux/arch/i386/kernel/smp.c        Wed Jan 16 21:42:45 2002
3966 @@ -105,7 +105,7 @@
3967  /* The 'big kernel lock' */
3968  spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
3969
3970 -struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }};
3971 +struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }};
3972
3973  /*
3974   * the following functions deal with sending IPIs between CPUs.
3975 @@ -485,15 +485,54 @@
3976         do_flush_tlb_all_local();
3977  }
3978
3979 +static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED;
3980 +static task_t *new_task;
3981 +
3982 +/*
3983 + * This function sends a 'task migration' IPI to another CPU.
3984 + * Must be called from syscall contexts, with interrupts *enabled*.
3985 + */
3986 +void smp_migrate_task(int cpu, task_t *p)
3987 +{
3988 +       /*
3989 +        * The target CPU will unlock the migration spinlock:
3990 +        */
3991 +       spin_lock(&migration_lock);
3992 +       new_task = p;
3993 +       send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR);
3994 +}
3995 +
3996 +/*
3997 + * Task migration callback.
3998 + */
3999 +asmlinkage void smp_task_migration_interrupt(void)
4000 +{
4001 +       task_t *p;
4002 +
4003 +       ack_APIC_irq();
4004 +       p = new_task;
4005 +       spin_unlock(&migration_lock);
4006 +       sched_task_migrated(p);
4007 +}
4008  /*
4009   * this function sends a 'reschedule' IPI to another CPU.
4010   * it goes straight through and wastes no time serializing
4011   * anything. Worst case is that we lose a reschedule ...
4012   */
4013 -
4014  void smp_send_reschedule(int cpu)
4015  {
4016         send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
4017 +}
4018 +
4019 +/*
4020 + * this function sends a reschedule IPI to all (other) CPUs.
4021 + * This should only be used if some 'global' task became runnable,
4022 + * such as a RT task, that must be handled now. The first CPU
4023 + * that manages to grab the task will run it.
4024 + */
4025 +void smp_send_reschedule_all(void)
4026 +{
4027 +       send_IPI_allbutself(RESCHEDULE_VECTOR);
4028  }
4029
4030  /*
4031 --- linux/arch/i386/kernel/i8259.c.orig Wed Jan 16 21:43:09 2002
4032 +++ linux/arch/i386/kernel/i8259.c      Fri Jan 18 15:36:35 2002
4033 @@ -79,6 +79,7 @@
4034   * through the ICC by us (IPIs)
4035   */
4036  #ifdef CONFIG_SMP
4037 +BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR)
4038  BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
4039  BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
4040  BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
4041 @@ -472,6 +473,9 @@
4042          * IPI, driven by wakeup.
4043          */
4044         set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
4045 +
4046 +       /* IPI for task migration */
4047 +       set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt);
4048
4049         /* IPI for invalidation */
4050         set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
4051 --- linux/arch/i386/kernel/entry.S.orig Fri Jan 25 14:30:36 2002
4052 +++ linux/arch/i386/kernel/entry.S      Fri Jan 25 14:30:50 2002
4053 @@ -77,7 +77,7 @@
4054  exec_domain    = 16
4055  need_resched   = 20
4056  tsk_ptrace     = 24
4057 -processor      = 52
4058 +cpu            = 32
4059
4060  ENOSYS = 38
4061
4062 --- linux/arch/i386/kernel/setup.c.orig Mon Jan 28 18:10:23 2002
4063 +++ linux/arch/i386/kernel/setup.c      Mon Jan 28 18:10:48 2002
4064 @@ -2922,9 +2922,10 @@
4065         load_TR(nr);
4066         load_LDT(&init_mm);
4067
4068 -       /*
4069 -        * Clear all 6 debug registers:
4070 -        */
4071 +       /* Clear %fs and %gs. */
4072 +       asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
4073 +
4074 +       /* Clear all 6 debug registers: */
4075
4076  #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
4077