sched-O1-2.4.17-K2.patch

   1 --- linux/fs/proc/proc_misc.c.orig      Sun Jan  6 13:55:55 2002
   2 +++ linux/fs/proc/proc_misc.c   Mon Feb  4 04:09:18 2002
   3 @@ -85,11 +85,11 @@
   4         a = avenrun[0] + (FIXED_1/200);
   5         b = avenrun[1] + (FIXED_1/200);
   6         c = avenrun[2] + (FIXED_1/200);
   7 -       len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
   8 +       len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
   9                 LOAD_INT(a), LOAD_FRAC(a),
  10                 LOAD_INT(b), LOAD_FRAC(b),
  11                 LOAD_INT(c), LOAD_FRAC(c),
  12 -               nr_running, nr_threads, last_pid);
  13 +               nr_running(), nr_threads, last_pid);
  14         return proc_calc_metrics(page, start, off, count, eof, len);
  15  }
  16
  17 @@ -101,7 +101,7 @@
  18         int len;
  19
  20         uptime = jiffies;
  21 -       idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
  22 +       idle = init_task.times.tms_utime + init_task.times.tms_stime;
  23
  24         /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
  25            that would overflow about every five days at HZ == 100.
  26 @@ -303,10 +303,10 @@
  27         }
  28
  29         len += sprintf(page + len,
  30 -               "\nctxt %u\n"
  31 +               "\nctxt %lu\n"
  32                 "btime %lu\n"
  33                 "processes %lu\n",
  34 -               kstat.context_swtch,
  35 +               nr_context_switches(),
  36                 xtime.tv_sec - jif / HZ,
  37                 total_forks);
  38
  39 --- linux/fs/proc/array.c.orig  Sun Jan  6 13:55:51 2002
  40 +++ linux/fs/proc/array.c       Mon Feb  4 04:09:18 2002
  41 @@ -335,9 +335,8 @@
  42
  43         /* scale priority and nice values from timeslices to -20..20 */
  44         /* to make it look like a "normal" Unix priority/nice value  */
  45 -       priority = task->counter;
  46 -       priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
  47 -       nice = task->nice;
  48 +       priority = task_prio(task);
  49 +       nice = task_nice(task);
  50
  51         read_lock(&tasklist_lock);
  52         ppid = task->pid ? task->p_opptr->pid : 0;
  53 @@ -387,7 +386,7 @@
  54                 task->nswap,
  55                 task->cnswap,
  56                 task->exit_signal,
  57 -               task->processor);
  58 +               task->cpu);
  59         if(mm)
  60                 mmput(mm);
  61         return res;
  62 --- linux/fs/nfs/pagelist.c.orig        Sun Jan  6 13:55:57 2002
  63 +++ linux/fs/nfs/pagelist.c     Mon Feb  4 04:09:18 2002
  64 @@ -96,8 +96,7 @@
  65                         continue;
  66                 if (signalled() && (server->flags & NFS_MOUNT_INTR))
  67                         return ERR_PTR(-ERESTARTSYS);
  68 -               current->policy = SCHED_YIELD;
  69 -               schedule();
  70 +               yield();
  71         }
  72
  73         /* Initialize the request struct. Initially, we assume a
  74 --- linux/fs/ufs/truncate.c.orig        Sun Jan  6 13:55:55 2002
  75 +++ linux/fs/ufs/truncate.c     Mon Feb  4 04:09:18 2002
  76 @@ -448,10 +448,7 @@
  77                 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
  78                         ufs_sync_inode (inode);
  79                 run_task_queue(&tq_disk);
  80 -               current->policy |= SCHED_YIELD;
  81 -               schedule ();
  82 -
  83 -
  84 +               yield();
  85         }
  86         offset = inode->i_size & uspi->s_fshift;
  87         if (offset) {
  88 --- linux/fs/reiserfs/buffer2.c.orig    Sun Jan  6 13:55:57 2002
  89 +++ linux/fs/reiserfs/buffer2.c Mon Feb  4 04:09:18 2002
  90 @@ -33,8 +33,7 @@
  91                         buffer_journal_dirty(bh) ? ' ' : '!');
  92      }
  93      run_task_queue(&tq_disk);
  94 -    current->policy |= SCHED_YIELD;
  95 -    schedule();
  96 +    yield();
  97    }
  98    if (repeat_counter > 30000000) {
  99      reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ;
 100 @@ -52,11 +51,11 @@
 101  struct buffer_head  * reiserfs_bread (struct super_block *super, int n_block, int n_size)
 102  {
 103      struct buffer_head  *result;
 104 -    PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
 105 +    PROC_EXP( unsigned long ctx_switches = nr_context_switches() ); /* unsigned long: nr_context_switches() is printed with %lu elsewhere in this patch, so an unsigned int snapshot would truncate and make the != check below always true once the count passes 2^32 */
 106
 107      result = bread (super -> s_dev, n_block, n_size);
 108      PROC_INFO_INC( super, breads );
 109 -    PROC_EXP( if( kstat.context_swtch != ctx_switches )
 110 +    PROC_EXP( if( nr_context_switches() != ctx_switches )
 111               PROC_INFO_INC( super, bread_miss ) );
 112      return result;
 113  }
 114 --- linux/fs/reiserfs/journal.c.orig    Sun Jan  6 13:55:57 2002
 115 +++ linux/fs/reiserfs/journal.c Mon Feb  4 04:09:18 2002
 116 @@ -149,8 +149,7 @@
 117    }
 118    bn = allocate_bitmap_node(p_s_sb) ;
 119    if (!bn) {
 120 -    current->policy |= SCHED_YIELD ;
 121 -    schedule() ;
 122 +    yield();
 123      goto repeat ;
 124    }
 125    return bn ;
 126 --- linux/fs/jffs2/background.c.orig    Sun Jan  6 13:55:53 2002
 127 +++ linux/fs/jffs2/background.c Mon Feb  4 04:09:18 2002
 128 @@ -106,9 +106,6 @@
 129
 130          sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
 131
 132 -       /* FIXME in the 2.2 backport */
 133 -       current->nice = 10;
 134 -
 135         for (;;) {
 136                 spin_lock_irq(&current->sigmask_lock);
 137                 siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
 138 --- linux/fs/jbd/journal.c.orig Sun Jan  6 13:55:57 2002
 139 +++ linux/fs/jbd/journal.c      Mon Feb  4 04:09:18 2002
 140 @@ -460,8 +460,7 @@
 141                         printk (KERN_NOTICE __FUNCTION__
 142                                 ": ENOMEM at get_unused_buffer_head, "
 143                                 "trying again.\n");
 144 -                       current->policy |= SCHED_YIELD;
 145 -                       schedule();
 146 +                       yield();
 147                 }
 148         } while (!new_bh);
 149         /* keep subsequent assertions sane */
 150 @@ -1539,8 +1538,7 @@
 151                         last_warning = jiffies;
 152                 }
 153
 154 -               current->policy |= SCHED_YIELD;
 155 -               schedule();
 156 +               yield();
 157         }
 158  }
 159
 160 @@ -1598,8 +1596,7 @@
 161                         last_warning = jiffies;
 162                 }
 163                 while (ret == 0) {
 164 -                       current->policy |= SCHED_YIELD;
 165 -                       schedule();
 166 +                       yield();
 167                         ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
 168                 }
 169         }
 170 --- linux/fs/jbd/revoke.c.orig  Sun Jan  6 13:55:57 2002
 171 +++ linux/fs/jbd/revoke.c       Mon Feb  4 04:09:18 2002
 172 @@ -137,8 +137,7 @@
 173         if (!journal_oom_retry)
 174                 return -ENOMEM;
 175         jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
 176 -       current->policy |= SCHED_YIELD;
 177 -       schedule();
 178 +       yield();
 179         goto repeat;
 180  }
 181
 182 --- linux/fs/jbd/transaction.c.orig     Sun Jan  6 13:55:57 2002
 183 +++ linux/fs/jbd/transaction.c  Mon Feb  4 04:09:18 2002
 184 @@ -1377,8 +1377,7 @@
 185                 do {
 186                         old_handle_count = transaction->t_handle_count;
 187                         set_current_state(TASK_RUNNING);
 188 -                       current->policy |= SCHED_YIELD;
 189 -                       schedule();
 190 +                       yield();
 191                 } while (old_handle_count != transaction->t_handle_count);
 192         }
 193
 194 --- linux/fs/binfmt_elf.c.orig  Sun Jan  6 13:55:57 2002
 195 +++ linux/fs/binfmt_elf.c       Mon Feb  4 04:09:18 2002
 196 @@ -1143,7 +1143,7 @@
 197         psinfo.pr_state = i;
 198         psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
 199         psinfo.pr_zomb = psinfo.pr_sname == 'Z';
 200 -       psinfo.pr_nice = current->nice;
 201 +       psinfo.pr_nice = task_nice(current);
 202         psinfo.pr_flag = current->flags;
 203         psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
 204         psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
 205 --- linux/fs/buffer.c.orig      Sun Jan  6 13:55:57 2002
 206 +++ linux/fs/buffer.c   Mon Feb  4 04:09:18 2002
 207 @@ -725,9 +725,8 @@
 208         wakeup_bdflush();
 209         try_to_free_pages(zone, GFP_NOFS, 0);
 210         run_task_queue(&tq_disk);
 211 -       current->policy |= SCHED_YIELD;
 212         __set_current_state(TASK_RUNNING);
 213 -       schedule();
 214 +       sys_sched_yield();
 215  }
 216
 217  void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 218 --- linux/fs/locks.c.orig       Sun Jan  6 13:55:51 2002
 219 +++ linux/fs/locks.c    Mon Feb  4 04:09:19 2002
 220 @@ -445,8 +445,7 @@
 221                         /* Let the blocked process remove waiter from the
 222                          * block list when it gets scheduled.
 223                          */
 224 -                       current->policy |= SCHED_YIELD;
 225 -                       schedule();
 226 +                       yield();
 227                 } else {
 228                         /* Remove waiter from the block list, because by the
 229                          * time it wakes up blocker won't exist any more.
 230 --- linux/init/main.c.orig      Sun Jan  6 13:55:57 2002
 231 +++ linux/init/main.c   Mon Feb  4 04:09:19 2002
 232 @@ -482,8 +482,6 @@
 233  extern void setup_arch(char **);
 234  extern void cpu_idle(void);
 235
 236 -unsigned long wait_init_idle;
 237 -
 238  #ifndef CONFIG_SMP
 239
 240  #ifdef CONFIG_X86_LOCAL_APIC
 241 @@ -492,34 +490,24 @@
 242         APIC_init_uniprocessor();
 243  }
 244  #else
 245 -#define smp_init()     do { } while (0)
 246 +#define smp_init()      do { } while (0)
 247  #endif
 248
 249  #else
 250
 251 -
 252  /* Called by boot processor to activate the rest. */
 253  static void __init smp_init(void)
 254  {
 255         /* Get other processors into their bootup holding patterns. */
 256         smp_boot_cpus();
 257 -       wait_init_idle = cpu_online_map;
 258 -       clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
 259
 260         smp_threads_ready=1;
 261         smp_commence();
 262 -
 263 -       /* Wait for the other cpus to set up their idle processes */
 264 -       printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
 265 -       while (wait_init_idle) {
 266 -               cpu_relax();
 267 -               barrier();
 268 -       }
 269 -       printk("All processors have done init_idle\n");
 270  }
 271
 272  #endif
 273
 274 +
 275  /*
 276   * We need to finalize in a non-__init function or else race conditions
 277   * between the root thread and the init thread may cause start_kernel to
 278 @@ -531,9 +519,8 @@
 279  {
 280         kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 281         unlock_kernel();
 282 -       current->need_resched = 1;
 283 -       cpu_idle();
 284 -}
 285 +       cpu_idle();
 286 +}
 287
 288  /*
 289   *     Activate the first processor.
 290 @@ -611,14 +598,18 @@
 291         ipc_init();
 292  #endif
 293         check_bugs();
 294 +
 295         printk("POSIX conformance testing by UNIFIX\n");
 296
 297 -       /*
 298 -        *      We count on the initial thread going ok
 299 -        *      Like idlers init is an unlocked kernel thread, which will
 300 -        *      make syscalls (and thus be locked).
 301 +       init_idle(current, smp_processor_id());
 302 +       /*
 303 +        *      We count on the initial thread going ok
 304 +        *      Like idlers init is an unlocked kernel thread, which will
 305 +        *      make syscalls (and thus be locked).
 306          */
 307         smp_init();
 308 +
 309 +       /* Do the rest non-__init'ed, we're now alive */
 310         rest_init();
 311  }
 312
 313 @@ -779,12 +770,9 @@
 314                 int i, pid;
 315
 316                 pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
 317 -               if (pid > 0) {
 318 -                       while (pid != wait(&i)) {
 319 -                               current->policy |= SCHED_YIELD;
 320 -                               schedule();
 321 -                       }
 322 -               }
 323 +               if (pid > 0)
 324 +                       while (pid != wait(&i))
 325 +                               yield();
 326                 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
 327                      || MINOR(real_root_dev) != 0) {
 328                         error = change_root(real_root_dev,"/initrd");
 329 --- linux/kernel/sched.c.orig   Sun Jan  6 13:55:57 2002
 330 +++ linux/kernel/sched.c        Mon Feb  4 04:09:19 2002
 331 @@ -12,333 +12,306 @@
 332   *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
 333   */
 334
 335 -/*
 336 - * 'sched.c' is the main kernel file. It contains scheduling primitives
 337 - * (sleep_on, wakeup, schedule etc) as well as a number of simple system
 338 - * call functions (type getpid()), which just extract a field from
 339 - * current-task
 340 - */
 341 -
 342 -#include <linux/config.h>
 343  #include <linux/mm.h>
 344 +#include <linux/nmi.h>
 345  #include <linux/init.h>
 346 +#include <asm/uaccess.h>
 347  #include <linux/smp_lock.h>
 348 -#include <linux/nmi.h>
 349  #include <linux/interrupt.h>
 350 -#include <linux/kernel_stat.h>
 351 -#include <linux/completion.h>
 352 -#include <linux/prefetch.h>
 353 -#include <linux/compiler.h>
 354 -
 355 -#include <asm/uaccess.h>
 356  #include <asm/mmu_context.h>
 357 -
 358 -extern void timer_bh(void);
 359 -extern void tqueue_bh(void);
 360 -extern void immediate_bh(void);
 361 +#include <linux/kernel_stat.h>
 362
 363  /*
 364 - * scheduler variables
 365 + * Priority of a process goes from 0 to 139. The 0-99
 366 + * priority range is allocated to RT tasks, the 100-139
 367 + * range is for SCHED_OTHER tasks. Priority values are
 368 + * inverted: lower p->prio value means higher priority.
 369   */
 370 -
 371 -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
 372 -
 373 -extern void mem_use(void);
 374 +#define MAX_RT_PRIO            100
 375 +#define MAX_PRIO               (MAX_RT_PRIO + 40)
 376
 377  /*
 378 - * Scheduling quanta.
 379 - *
 380 - * NOTE! The unix "nice" value influences how long a process
 381 - * gets. The nice value ranges from -20 to +19, where a -20
 382 - * is a "high-priority" task, and a "+10" is a low-priority
 383 - * task.
 384 - *
 385 - * We want the time-slice to be around 50ms or so, so this
 386 - * calculation depends on the value of HZ.
 387 + * Convert user-nice values [ -20 ... 0 ... 19 ]
 388 + * to static priority [ 100 ... 139 (MAX_PRIO-1) ],
 389 + * and back.
 390   */
 391 -#if HZ < 200
 392 -#define TICK_SCALE(x)  ((x) >> 2)
 393 -#elif HZ < 400
 394 -#define TICK_SCALE(x)  ((x) >> 1)
 395 -#elif HZ < 800
 396 -#define TICK_SCALE(x)  (x)
 397 -#elif HZ < 1600
 398 -#define TICK_SCALE(x)  ((x) << 1)
 399 -#else
 400 -#define TICK_SCALE(x)  ((x) << 2)
 401 -#endif
 402 -
 403 -#define NICE_TO_TICKS(nice)    (TICK_SCALE(20-(nice))+1)
 404 +#define NICE_TO_PRIO(nice)     (MAX_RT_PRIO + (nice) + 20)
 405 +#define PRIO_TO_NICE(prio)     ((prio) - MAX_RT_PRIO - 20)
 406 +#define TASK_NICE(p)           PRIO_TO_NICE((p)->static_prio)
 407
 408 +/*
 409 + * 'User priority' is the nice value converted to something we
 410 + * can work with better when scaling various scheduler parameters,
 411 + * it's a [ 0 ... 39 ] range.
 412 + */
 413 +#define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
 414 +#define TASK_USER_PRIO(p)      USER_PRIO((p)->static_prio)
 415 +#define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
 416
 417  /*
 418 - *     Init task must be ok at boot for the ix86 as we will check its signals
 419 - *     via the SMP irq return path.
 420 + * These are the 'tuning knobs' of the scheduler:
 421 + *
 422 + * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
 423 + * maximum timeslice is 300 msecs. Timeslices get refilled after
 424 + * they expire.
 425   */
 426 -
 427 -struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
 428 +#define MIN_TIMESLICE          ( 10 * HZ / 1000)
 429 +#define MAX_TIMESLICE          (300 * HZ / 1000)
 430 +#define CHILD_PENALTY          95
 431 +#define PARENT_PENALTY         100
 432 +#define EXIT_WEIGHT            3
 433 +#define PRIO_BONUS_RATIO       25
 434 +#define INTERACTIVE_DELTA      2
 435 +#define MAX_SLEEP_AVG          (2*HZ)
 436 +#define STARVATION_LIMIT       (2*HZ)
 437
 438  /*
 439 - * The tasklist_lock protects the linked list of processes.
 440 + * If a task is 'interactive' then we reinsert it in the active
 441 + * array after it has expired its current timeslice. (it will not
 442 + * continue to run immediately, it will still roundrobin with
 443 + * other interactive tasks.)
 444   *
 445 - * The runqueue_lock locks the parts that actually access
 446 - * and change the run-queues, and have to be interrupt-safe.
 447 + * This part scales the interactivity limit depending on niceness.
 448   *
 449 - * If both locks are to be concurrently held, the runqueue_lock
 450 - * nests inside the tasklist_lock.
 451 + * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
 452 + * Here are a few examples of different nice levels:
 453   *
 454 - * task->alloc_lock nests inside tasklist_lock.
 455 + *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
 456 + *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
 457 + *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
 458 + *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
 459 + *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
 460 + *
 461 + * (the X axis represents the possible -5 ... 0 ... +5 dynamic
 462 + *  priority range a task can explore, a value of '1' means the
 463 + *  task is rated interactive.)
 464 + *
 465 + * I.e. nice +19 tasks can never get 'interactive' enough to be
 466 + * reinserted into the active array, and only heavily CPU-bound nice -20
 467 + * tasks will be expired. Default nice 0 tasks are somewhere in between:
 468 + * it takes some effort for them to get interactive, but it's not
 469 + * too hard.
 470   */
 471 -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 472 -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
 473
 474 -static LIST_HEAD(runqueue_head);
 475 +#define SCALE(v1,v1_max,v2_max) \
 476 +       ((v1) * (v2_max) / (v1_max))    /* rescale v1 from [0..v1_max] to [0..v2_max]; outer parens keep the expansion a single operand */
 477
 478 -/*
 479 - * We align per-CPU scheduling data on cacheline boundaries,
 480 - * to prevent cacheline ping-pong.
 481 - */
 482 -static union {
 483 -       struct schedule_data {
 484 -               struct task_struct * curr;
 485 -               cycles_t last_schedule;
 486 -       } schedule_data;
 487 -       char __pad [SMP_CACHE_BYTES];
 488 -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
 489 +#define DELTA(p) \
 490 +       (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
 491 +               INTERACTIVE_DELTA)
 492
 493 -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
 494 -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
 495 +#define TASK_INTERACTIVE(p) \
 496 +       ((p)->prio <= (p)->static_prio - DELTA(p))
 497
 498 -struct kernel_stat kstat;
 499 -extern struct task_struct *child_reaper;
 500 +/*
 501 + * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
 502 + * to time slice values.
 503 + *
 504 + * The higher a process's priority, the bigger timeslices
 505 + * it gets during one round of execution. But even the lowest
 506 + * priority process gets MIN_TIMESLICE worth of execution time.
 507 + */
 508
 509 -#ifdef CONFIG_SMP
 510 +#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
 511 +       ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
 512
 513 -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
 514 -#define can_schedule(p,cpu) \
 515 -       ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
 516 +/*
 517 + * These are the runqueue data structures:
 518 + */
 519
 520 -#else
 521 +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
 522
 523 -#define idle_task(cpu) (&init_task)
 524 -#define can_schedule(p,cpu) (1)
 525 +typedef struct runqueue runqueue_t;
 526
 527 -#endif
 528 -
 529 -void scheduling_functions_start_here(void) { }
 530 +struct prio_array {     /* one set of per-priority run lists plus an occupancy bitmap */
 531 +       int nr_active;  /* tasks currently queued here; kept by enqueue_task()/dequeue_task() */
 532 +       spinlock_t *lock;       /* NOTE(review): presumably the owning runqueue's lock - not used in the visible code, confirm */
 533 +       runqueue_t *rq; /* NOTE(review): presumably the owning runqueue - confirm where it is set */
 534 +       unsigned long bitmap[BITMAP_SIZE];      /* bit N is set while queue[N] is non-empty */
 535 +       list_t queue[MAX_PRIO]; /* one list per priority level, indexed by p->prio */
 536 +};
 537
 538  /*
 539 - * This is the function that decides how desirable a process is..
 540 - * You can weigh different processes against each other depending
 541 - * on what CPU they've run on lately etc to try to handle cache
 542 - * and TLB miss penalties.
 543 + * This is the main, per-CPU runqueue data structure.
 544   *
 545 - * Return values:
 546 - *      -1000: never select this
 547 - *          0: out of time, recalculate counters (but it might still be
 548 - *             selected)
 549 - *        +ve: "goodness" value (the larger, the better)
 550 - *      +1000: realtime process, select this.
 551 + * Locking rule: code that needs to lock multiple runqueues (such
 552 + * as the load balancing or the process migration code) must acquire
 553 + * the locks in order of ascending runqueue address (&runqueue).
 554   */
 555 +struct runqueue {
 556 +       spinlock_t lock;        /* taken via spin_lock_irqsave() in lock_task_rq() */
 557 +       unsigned long nr_running, nr_switches, expired_timestamp;       /* nr_running: tasks added by activate_task() and not yet removed by deactivate_task(); the other two are maintained outside the visible code */
 558 +       task_t *curr, *idle;    /* curr: what cpu_curr() returns and wait_task_inactive() polls; idle: presumably this CPU's idle task - confirm */
 559 +       prio_array_t *active, *expired, arrays[2];      /* activate_task() queues onto 'active'; presumably both pointers refer into arrays[] - confirm */
 560 +       int prev_nr_running[NR_CPUS];   /* NOTE(review): use not visible here; likely load-balancing history - confirm */
 561 +} ____cacheline_aligned;
 562
 563 -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
 564 -{
 565 -       int weight;
 566 +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
 567
 568 -       /*
 569 -        * select the current process after every other
 570 -        * runnable process, but before the idle thread.
 571 -        * Also, dont trigger a counter recalculation.
 572 -        */
 573 -       weight = -1;
 574 -       if (p->policy & SCHED_YIELD)
 575 -               goto out;
 576 +#define cpu_rq(cpu)            (runqueues + (cpu))
 577 +#define this_rq()              cpu_rq(smp_processor_id())
 578 +#define task_rq(p)             cpu_rq((p)->cpu)
 579 +#define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 580 +#define rt_task(p)             ((p)->prio < MAX_RT_PRIO)
 581
 582 -       /*
 583 -        * Non-RT process - normal case first.
 584 -        */
 585 -       if (p->policy == SCHED_OTHER) {
 586 -               /*
 587 -                * Give the process a first-approximation goodness value
 588 -                * according to the number of clock-ticks it has left.
 589 -                *
 590 -                * Don't do any other calculations if the time slice is
 591 -                * over..
 592 -                */
 593 -               weight = p->counter;
 594 -               if (!weight)
 595 -                       goto out;
 596 -
 597 -#ifdef CONFIG_SMP
 598 -               /* Give a largish advantage to the same processor...   */
 599 -               /* (this is equivalent to penalizing other processors) */
 600 -               if (p->processor == this_cpu)
 601 -                       weight += PROC_CHANGE_PENALTY;
 602 -#endif
 603 +static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags)
 604 +{
 605 +       struct runqueue *__rq;
 606
 607 -               /* .. and a slight advantage to the current MM */
 608 -               if (p->mm == this_mm || !p->mm)
 609 -                       weight += 1;
 610 -               weight += 20 - p->nice;
 611 -               goto out;
 612 +repeat_lock_task:
 613 +       __rq = task_rq(p);
 614 +       spin_lock_irqsave(&__rq->lock, *flags);
 615 +       if (unlikely(__rq != task_rq(p))) {
 616 +               spin_unlock_irqrestore(&__rq->lock, *flags);
 617 +               goto repeat_lock_task;
 618         }
 619 +       return __rq;
 620 +}
 621
 622 -       /*
 623 -        * Realtime process, select the first one on the
 624 -        * runqueue (taking priorities within processes
 625 -        * into account).
 626 -        */
 627 -       weight = 1000 + p->rt_priority;
 628 -out:
 629 -       return weight;
 630 +static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags)
 631 +{
 632 +       spin_unlock_irqrestore(&rq->lock, *flags);
 633  }
 634
 635  /*
 636 - * the 'goodness value' of replacing a process on a given CPU.
 637 - * positive value means 'replace', zero or negative means 'dont'.
 638 + * Adding/removing a task to/from a priority array:
 639   */
 640 -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
 641 +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) /* unlink p from array's list for p->prio */
 642  {
 643 -       return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
 644 +       array->nr_active--;
 645 +       list_del_init(&p->run_list);
 646 +       if (list_empty(array->queue + p->prio)) /* that was the last task at this priority... */
 647 +               __clear_bit(p->prio, array->bitmap);    /* ...so mark the level empty in the bitmap */
 648  }
 649
 650 -/*
 651 - * This is ugly, but reschedule_idle() is very timing-critical.
 652 - * We are called with the runqueue spinlock held and we must
 653 - * not claim the tasklist_lock.
 654 - */
 655 -static FASTCALL(void reschedule_idle(struct task_struct * p));
 656 +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) /* queue p on array at priority p->prio */
 657 +{
 658 +       list_add_tail(&p->run_list, array->queue + p->prio);    /* append to the tail of the per-priority list */
 659 +       __set_bit(p->prio, array->bitmap);      /* flag this priority level as non-empty */
 660 +       array->nr_active++;
 661 +       p->array = array;       /* remembered so deactivate_task() can dequeue from p->array */
 662 +}
 663
 664 -static void reschedule_idle(struct task_struct * p)
 665 +static inline int effective_prio(task_t *p)
 666  {
 667 -#ifdef CONFIG_SMP
 668 -       int this_cpu = smp_processor_id();
 669 -       struct task_struct *tsk, *target_tsk;
 670 -       int cpu, best_cpu, i, max_prio;
 671 -       cycles_t oldest_idle;
 672 -
 673 -       /*
 674 -        * shortcut if the woken up task's last CPU is
 675 -        * idle now.
 676 -        */
 677 -       best_cpu = p->processor;
 678 -       if (can_schedule(p, best_cpu)) {
 679 -               tsk = idle_task(best_cpu);
 680 -               if (cpu_curr(best_cpu) == tsk) {
 681 -                       int need_resched;
 682 -send_now_idle:
 683 -                       /*
 684 -                        * If need_resched == -1 then we can skip sending
 685 -                        * the IPI altogether, tsk->need_resched is
 686 -                        * actively watched by the idle thread.
 687 -                        */
 688 -                       need_resched = tsk->need_resched;
 689 -                       tsk->need_resched = 1;
 690 -                       if ((best_cpu != this_cpu) && !need_resched)
 691 -                               smp_send_reschedule(best_cpu);
 692 -                       return;
 693 -               }
 694 -       }
 695 +       int bonus, prio;
 696
 697         /*
 698 -        * We know that the preferred CPU has a cache-affine current
 699 -        * process, lets try to find a new idle CPU for the woken-up
 700 -        * process. Select the least recently active idle CPU. (that
 701 -        * one will have the least active cache context.) Also find
 702 -        * the executing process which has the least priority.
 703 -        */
 704 -       oldest_idle = (cycles_t) -1;
 705 -       target_tsk = NULL;
 706 -       max_prio = 0;
 707 +        * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
 708 +        * into the -5 ... 0 ... +5 bonus/penalty range.
 709 +        *
 710 +        * We use 25% of the full 0...39 priority range so that:
 711 +        *
 712 +        * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
 713 +        * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
 714 +        *
 715 +        * Both properties are important to certain workloads.
 716 +        */
 717 +       bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
 718 +                       MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
 719
 720 -       for (i = 0; i < smp_num_cpus; i++) {
 721 -               cpu = cpu_logical_map(i);
 722 -               if (!can_schedule(p, cpu))
 723 -                       continue;
 724 -               tsk = cpu_curr(cpu);
 725 +       prio = p->static_prio - bonus;
 726 +       if (prio < MAX_RT_PRIO)
 727 +               prio = MAX_RT_PRIO;
 728 +       if (prio > MAX_PRIO-1)
 729 +               prio = MAX_PRIO-1;
 730 +       return prio;
 731 +}
 732 +
 733 +static inline void activate_task(task_t *p, runqueue_t *rq)
 734 +{
 735 +       unsigned long sleep_time = jiffies - p->sleep_timestamp;
 736 +       prio_array_t *array = rq->active;
 737 +
 738 +       if (!rt_task(p) && sleep_time) {
 739                 /*
 740 -                * We use the first available idle CPU. This creates
 741 -                * a priority list between idle CPUs, but this is not
 742 -                * a problem.
 743 +                * This code gives a bonus to interactive tasks. We update
 744 +                * an 'average sleep time' value here, based on
 745 +                * sleep_timestamp. The more time a task spends sleeping,
 746 +                * the higher the average gets - and the higher the priority
 747 +                * boost gets as well.
 748                  */
 749 -               if (tsk == idle_task(cpu)) {
 750 -#if defined(__i386__) && defined(CONFIG_SMP)
 751 -                        /*
 752 -                        * Check if two siblings are idle in the same
 753 -                        * physical package. Use them if found.
 754 -                        */
 755 -                       if (smp_num_siblings == 2) {
 756 -                               if (cpu_curr(cpu_sibling_map[cpu]) ==
 757 -                                   idle_task(cpu_sibling_map[cpu])) {
 758 -                                       oldest_idle = last_schedule(cpu);
 759 -                                       target_tsk = tsk;
 760 -                                       break;
 761 -                               }
 762 -
 763 -                        }
 764 -#endif
 765 -                       if (last_schedule(cpu) < oldest_idle) {
 766 -                               oldest_idle = last_schedule(cpu);
 767 -                               target_tsk = tsk;
 768 -                       }
 769 -               } else {
 770 -                       if (oldest_idle == -1ULL) {
 771 -                               int prio = preemption_goodness(tsk, p, cpu);
 772 -
 773 -                               if (prio > max_prio) {
 774 -                                       max_prio = prio;
 775 -                                       target_tsk = tsk;
 776 -                               }
 777 -                       }
 778 -               }
 779 -       }
 780 -       tsk = target_tsk;
 781 -       if (tsk) {
 782 -               if (oldest_idle != -1ULL) {
 783 -                       best_cpu = tsk->processor;
 784 -                       goto send_now_idle;
 785 -               }
 786 -               tsk->need_resched = 1;
 787 -               if (tsk->processor != this_cpu)
 788 -                       smp_send_reschedule(tsk->processor);
 789 +               p->sleep_avg += sleep_time;
 790 +               if (p->sleep_avg > MAX_SLEEP_AVG)
 791 +                       p->sleep_avg = MAX_SLEEP_AVG;
 792 +               p->prio = effective_prio(p);
 793         }
 794 -       return;
 795 -
 796 +       enqueue_task(p, array);
 797 +       rq->nr_running++;
 798 +}
 799
 800 -#else /* UP */
 801 -       int this_cpu = smp_processor_id();
 802 -       struct task_struct *tsk;
 803 -
 804 -       tsk = cpu_curr(this_cpu);
 805 -       if (preemption_goodness(tsk, p, this_cpu) > 0)
 806 -               tsk->need_resched = 1;
 807 -#endif
 808 +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 809 +{
 810 +       rq->nr_running--;
 811 +       dequeue_task(p, p->array);
 812 +       p->array = NULL;
 813  }
 814
 815 +static inline void resched_task(task_t *p)
 816 +{
 817 +       int need_resched;
 818 +
 819 +       need_resched = p->need_resched;
 820 +       wmb();
 821 +       p->need_resched = 1;
 822 +       if (!need_resched && (p->cpu != smp_processor_id()))
 823 +               smp_send_reschedule(p->cpu);
 824 +}
 825 +
 826 +#ifdef CONFIG_SMP
 827 +
 828  /*
 829 - * Careful!
 830 - *
 831 - * This has to add the process to the _beginning_ of the
 832 - * run-queue, not the end. See the comment about "This is
 833 - * subtle" in the scheduler proper..
 834 + * Wait for a process to unschedule. This is used by the exit() and
 835 + * ptrace() code.
 836   */
 837 -static inline void add_to_runqueue(struct task_struct * p)
 838 +void wait_task_inactive(task_t * p)
 839  {
 840 -       list_add(&p->run_list, &runqueue_head);
 841 -       nr_running++;
 842 +       unsigned long flags;
 843 +       runqueue_t *rq;
 844 +
 845 +repeat:
 846 +       rq = task_rq(p);
 847 +       while (unlikely(rq->curr == p)) {
 848 +               cpu_relax();
 849 +               barrier();
 850 +       }
 851 +       rq = lock_task_rq(p, &flags);
 852 +       if (unlikely(rq->curr == p)) {
 853 +               unlock_task_rq(rq, &flags);
 854 +               goto repeat;
 855 +       }
 856 +       unlock_task_rq(rq, &flags);
 857  }
 858
 859 -static inline void move_last_runqueue(struct task_struct * p)
 860 +/*
 861 + * The SMP message passing code calls this function whenever
 862 + * the new task has arrived at the target CPU. We move the
 863 + * new task into the local runqueue.
 864 + *
 865 + * This function must be called with interrupts disabled.
 866 + */
 867 +void sched_task_migrated(task_t *new_task)
 868  {
 869 -       list_del(&p->run_list);
 870 -       list_add_tail(&p->run_list, &runqueue_head);
 871 +       wait_task_inactive(new_task);
 872 +       new_task->cpu = smp_processor_id();
 873 +       wake_up_process(new_task);
 874  }
 875
 876 -static inline void move_first_runqueue(struct task_struct * p)
 877 +/*
 878 + * Kick the remote CPU if the task is running currently,
 879 + * this code is used by the signal code to signal tasks
 880 + * which are in user-mode as quickly as possible.
 881 + *
 882 + * (Note that we do this lockless - if the task does anything
 883 + * while the message is in flight then it will notice the
 884 + * sigpending condition anyway.)
 885 + */
 886 +void kick_if_running(task_t * p)
 887  {
 888 -       list_del(&p->run_list);
 889 -       list_add(&p->run_list, &runqueue_head);
 890 +       if (p == task_rq(p)->curr)
 891 +               resched_task(p);
 892  }
 893 +#endif
 894
 895  /*
 896   * Wake up a process. Put it on the run-queue if it's not
 897 @@ -348,392 +321,528 @@
 898   * "current->state = TASK_RUNNING" to mark yourself runnable
 899   * without the overhead of this.
 900   */
 901 -static inline int try_to_wake_up(struct task_struct * p, int synchronous)
 902 +static int try_to_wake_up(task_t * p, int synchronous)
 903  {
 904         unsigned long flags;
 905         int success = 0;
 906 +       runqueue_t *rq;
 907
 908 -       /*
 909 -        * We want the common case fall through straight, thus the goto.
 910 -        */
 911 -       spin_lock_irqsave(&runqueue_lock, flags);
 912 +       rq = lock_task_rq(p, &flags);
 913         p->state = TASK_RUNNING;
 914 -       if (task_on_runqueue(p))
 915 -               goto out;
 916 -       add_to_runqueue(p);
 917 -       if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
 918 -               reschedule_idle(p);
 919 -       success = 1;
 920 -out:
 921 -       spin_unlock_irqrestore(&runqueue_lock, flags);
 922 +       if (!p->array) {
 923 +               activate_task(p, rq);
 924 +               if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio))
 925 +                       resched_task(rq->curr);
 926 +               success = 1;
 927 +       }
 928 +       unlock_task_rq(rq, &flags);
 929         return success;
 930  }
 931
 932 -inline int wake_up_process(struct task_struct * p)
 933 +int wake_up_process(task_t * p)
 934  {
 935         return try_to_wake_up(p, 0);
 936  }
 937
 938 -static void process_timeout(unsigned long __data)
 939 +void wake_up_forked_process(task_t * p)
 940  {
 941 -       struct task_struct * p = (struct task_struct *) __data;
 942 +       runqueue_t *rq = this_rq();
 943
 944 -       wake_up_process(p);
 945 +       p->state = TASK_RUNNING;
 946 +       if (!rt_task(p)) {
 947 +               /*
 948 +                * We decrease the sleep average of forking parents
 949 +                * and children as well, to keep max-interactive tasks
 950 +                * from forking tasks that are max-interactive.
 951 +                */
 952 +               current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
 953 +               p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
 954 +               p->prio = effective_prio(p);
 955 +       }
 956 +       spin_lock_irq(&rq->lock);
 957 +       p->cpu = smp_processor_id();
 958 +       activate_task(p, rq);
 959 +       spin_unlock_irq(&rq->lock);
 960  }
 961
 962 -/**
 963 - * schedule_timeout - sleep until timeout
 964 - * @timeout: timeout value in jiffies
 965 - *
 966 - * Make the current task sleep until @timeout jiffies have
 967 - * elapsed. The routine will return immediately unless
 968 - * the current task state has been set (see set_current_state()).
 969 - *
 970 - * You can set the task state as follows -
 971 - *
 972 - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 973 - * pass before the routine returns. The routine will return 0
 974 - *
 975 - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 976 - * delivered to the current task. In this case the remaining time
 977 - * in jiffies will be returned, or 0 if the timer expired in time
 978 - *
 979 - * The current task state is guaranteed to be TASK_RUNNING when this
 980 - * routine returns.
 981 - *
 982 - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 983 - * the CPU away without a bound on the timeout. In this case the return
 984 - * value will be %MAX_SCHEDULE_TIMEOUT.
 985 +/*
 986 + * Potentially available exiting-child timeslices are
 987 + * retrieved here - this way the parent does not get
 988 + * penalized for creating too many processes.
 989   *
 990 - * In all cases the return value is guaranteed to be non-negative.
 991 + * (this cannot be used to 'generate' timeslices
 992 + * artificially, because any timeslice recovered here
 993 + * was given away by the parent in the first place.)
 994   */
 995 -signed long schedule_timeout(signed long timeout)
 996 +void sched_exit(task_t * p)
 997  {
 998 -       struct timer_list timer;
 999 -       unsigned long expire;
1000 +       __cli();
1001 +       current->time_slice += p->time_slice;
1002 +       if (unlikely(current->time_slice > MAX_TIMESLICE))
1003 +               current->time_slice = MAX_TIMESLICE;
1004 +       __sti();
1005 +       /*
1006 +        * If the child was a (relative-) CPU hog then decrease
1007 +        * the sleep_avg of the parent as well.
1008 +        */
1009 +       if (p->sleep_avg < current->sleep_avg)
1010 +               current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
1011 +                       p->sleep_avg) / (EXIT_WEIGHT + 1);
1012 +}
1013
1014 -       switch (timeout)
1015 -       {
1016 -       case MAX_SCHEDULE_TIMEOUT:
1017 -               /*
1018 -                * These two special cases are useful to be comfortable
1019 -                * in the caller. Nothing more. We could take
1020 -                * MAX_SCHEDULE_TIMEOUT from one of the negative value
1021 -                * but I' d like to return a valid offset (>=0) to allow
1022 -                * the caller to do everything it want with the retval.
1023 -                */
1024 -               schedule();
1025 -               goto out;
1026 -       default:
1027 -               /*
1028 -                * Another bit of PARANOID. Note that the retval will be
1029 -                * 0 since no piece of kernel is supposed to do a check
1030 -                * for a negative retval of schedule_timeout() (since it
1031 -                * should never happens anyway). You just have the printk()
1032 -                * that will tell you if something is gone wrong and where.
1033 -                */
1034 -               if (timeout < 0)
1035 -               {
1036 -                       printk(KERN_ERR "schedule_timeout: wrong timeout "
1037 -                              "value %lx from %p\n", timeout,
1038 -                              __builtin_return_address(0));
1039 -                       current->state = TASK_RUNNING;
1040 -                       goto out;
1041 -               }
1042 -       }
1043 +#if CONFIG_SMP
1044 +asmlinkage void schedule_tail(task_t *prev)
1045 +{
1046 +       spin_unlock_irq(&this_rq()->lock);
1047 +}
1048 +#endif
1049
1050 -       expire = timeout + jiffies;
1051 +static inline void context_switch(task_t *prev, task_t *next)
1052 +{
1053 +       struct mm_struct *mm = next->mm;
1054 +       struct mm_struct *oldmm = prev->active_mm;
1055
1056 -       init_timer(&timer);
1057 -       timer.expires = expire;
1058 -       timer.data = (unsigned long) current;
1059 -       timer.function = process_timeout;
1060 +       prepare_to_switch();
1061
1062 -       add_timer(&timer);
1063 -       schedule();
1064 -       del_timer_sync(&timer);
1065 +       if (unlikely(!mm)) {
1066 +               next->active_mm = oldmm;
1067 +               atomic_inc(&oldmm->mm_count);
1068 +               enter_lazy_tlb(oldmm, next, smp_processor_id());
1069 +       } else
1070 +               switch_mm(oldmm, mm, next, smp_processor_id());
1071
1072 -       timeout = expire - jiffies;
1073 +       if (unlikely(!prev->mm)) {
1074 +               prev->active_mm = NULL;
1075 +               mmdrop(oldmm);
1076 +       }
1077
1078 - out:
1079 -       return timeout < 0 ? 0 : timeout;
1080 +       /*
1081 +        * Here we just switch the register state and the stack. There are
1082 +        * 3 processes affected by a context switch:
1083 +        *
1084 +        * prev ==> .... ==> (last => next)
1085 +        *
1086 +        * It's the 'much more previous' 'prev' that is on next's stack,
1087 +        * but prev is set to (the just run) 'last' process by switch_to().
1088 +        * This might sound slightly confusing but makes tons of sense.
1089 +        */
1090 +       switch_to(prev, next, prev);
1091  }
1092
1093 -/*
1094 - * schedule_tail() is getting called from the fork return path. This
1095 - * cleans up all remaining scheduler things, without impacting the
1096 - * common case.
1097 - */
1098 -static inline void __schedule_tail(struct task_struct *prev)
1099 +unsigned long nr_running(void)
1100  {
1101 -#ifdef CONFIG_SMP
1102 -       int policy;
1103 -
1104 -       /*
1105 -        * prev->policy can be written from here only before `prev'
1106 -        * can be scheduled (before setting prev->cpus_runnable to ~0UL).
1107 -        * Of course it must also be read before allowing prev
1108 -        * to be rescheduled, but since the write depends on the read
1109 -        * to complete, wmb() is enough. (the spin_lock() acquired
1110 -        * before setting cpus_runnable is not enough because the spin_lock()
1111 -        * common code semantics allows code outside the critical section
1112 -        * to enter inside the critical section)
1113 -        */
1114 -       policy = prev->policy;
1115 -       prev->policy = policy & ~SCHED_YIELD;
1116 -       wmb();
1117 +       unsigned long i, sum = 0;
1118
1119 -       /*
1120 -        * fast path falls through. We have to clear cpus_runnable before
1121 -        * checking prev->state to avoid a wakeup race. Protect against
1122 -        * the task exiting early.
1123 -        */
1124 -       task_lock(prev);
1125 -       task_release_cpu(prev);
1126 -       mb();
1127 -       if (prev->state == TASK_RUNNING)
1128 -               goto needs_resched;
1129 +       for (i = 0; i < smp_num_cpus; i++)
1130 +               sum += cpu_rq(cpu_logical_map(i))->nr_running;
1131
1132 -out_unlock:
1133 -       task_unlock(prev);      /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
1134 -       return;
1135 +       return sum;
1136 +}
1137
1138 -       /*
1139 -        * Slow path - we 'push' the previous process and
1140 -        * reschedule_idle() will attempt to find a new
1141 -        * processor for it. (but it might preempt the
1142 -        * current process as well.) We must take the runqueue
1143 -        * lock and re-check prev->state to be correct. It might
1144 -        * still happen that this process has a preemption
1145 -        * 'in progress' already - but this is not a problem and
1146 -        * might happen in other circumstances as well.
1147 -        */
1148 -needs_resched:
1149 -       {
1150 -               unsigned long flags;
1151 +unsigned long nr_context_switches(void)
1152 +{
1153 +       unsigned long i, sum = 0;
1154
1155 -               /*
1156 -                * Avoid taking the runqueue lock in cases where
1157 -                * no preemption-check is necessery:
1158 -                */
1159 -               if ((prev == idle_task(smp_processor_id())) ||
1160 -                                               (policy & SCHED_YIELD))
1161 -                       goto out_unlock;
1162 +       for (i = 0; i < smp_num_cpus; i++)
1163 +               sum += cpu_rq(cpu_logical_map(i))->nr_switches;
1164
1165 -               spin_lock_irqsave(&runqueue_lock, flags);
1166 -               if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
1167 -                       reschedule_idle(prev);
1168 -               spin_unlock_irqrestore(&runqueue_lock, flags);
1169 -               goto out_unlock;
1170 -       }
1171 -#else
1172 -       prev->policy &= ~SCHED_YIELD;
1173 -#endif /* CONFIG_SMP */
1174 +       return sum;
1175  }
1176
1177 -asmlinkage void schedule_tail(struct task_struct *prev)
1178 +#if CONFIG_SMP
1179 +/*
1180 + * Lock the busiest runqueue as well, this_rq is locked already.
1181 + * Recalculate nr_running if we have to drop the runqueue lock.
1182 + */
1183 +static inline unsigned int double_lock_balance(runqueue_t *this_rq,
1184 +       runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
1185  {
1186 -       __schedule_tail(prev);
1187 +       if (unlikely(!spin_trylock(&busiest->lock))) {
1188 +               if (busiest < this_rq) {
1189 +                       spin_unlock(&this_rq->lock);
1190 +                       spin_lock(&busiest->lock);
1191 +                       spin_lock(&this_rq->lock);
1192 +                       /* Need to recalculate nr_running */
1193 +                       if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1194 +                               nr_running = this_rq->nr_running;
1195 +                       else
1196 +                               nr_running = this_rq->prev_nr_running[this_cpu];
1197 +               } else
1198 +                       spin_lock(&busiest->lock);
1199 +       }
1200 +       return nr_running;
1201  }
1202
1203  /*
1204 - *  'schedule()' is the scheduler function. It's a very simple and nice
1205 - * scheduler: it's not perfect, but certainly works for most things.
1206 - *
1207 - * The goto is "interesting".
1208 + * Current runqueue is empty, or rebalance tick: if there is an
 1209 + * imbalance (current runqueue is too short) then pull from
1210 + * busiest runqueue(s).
1211   *
1212 - *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
1213 - * tasks can run. It can not be killed, and it cannot sleep. The 'state'
1214 - * information in task[0] is never used.
1215 + * We call this with the current runqueue locked,
1216 + * irqs disabled.
1217   */
1218 -asmlinkage void schedule(void)
1219 +static void load_balance(runqueue_t *this_rq, int idle)
1220  {
1221 -       struct schedule_data * sched_data;
1222 -       struct task_struct *prev, *next, *p;
1223 -       struct list_head *tmp;
1224 -       int this_cpu, c;
1225 +       int imbalance, nr_running, load, max_load,
1226 +               idx, i, this_cpu = smp_processor_id();
1227 +       task_t *next = this_rq->idle, *tmp;
1228 +       runqueue_t *busiest, *rq_src;
1229 +       prio_array_t *array;
1230 +       list_t *head, *curr;
1231
1232 +       /*
1233 +        * We search all runqueues to find the most busy one.
1234 +        * We do this lockless to reduce cache-bouncing overhead,
1235 +        * we re-check the 'best' source CPU later on again, with
1236 +        * the lock held.
1237 +        *
1238 +        * We fend off statistical fluctuations in runqueue lengths by
1239 +        * saving the runqueue length during the previous load-balancing
 1240 +        * operation and using the smaller one of the current and saved lengths.
1241 +        * If a runqueue is long enough for a longer amount of time then
1242 +        * we recognize it and pull tasks from it.
1243 +        *
1244 +        * The 'current runqueue length' is a statistical maximum variable,
1245 +        * for that one we take the longer one - to avoid fluctuations in
1246 +        * the other direction. So for a load-balance to happen it needs
1247 +        * stable long runqueue on the target CPU and stable short runqueue
1248 +        * on the local runqueue.
1249 +        *
1250 +        * We make an exception if this CPU is about to become idle - in
1251 +        * that case we are less picky about moving a task across CPUs and
1252 +        * take what can be taken.
1253 +        */
1254 +       if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1255 +               nr_running = this_rq->nr_running;
1256 +       else
1257 +               nr_running = this_rq->prev_nr_running[this_cpu];
1258
1259 -       spin_lock_prefetch(&runqueue_lock);
1260 +       busiest = NULL;
1261 +       max_load = 1;
1262 +       for (i = 0; i < smp_num_cpus; i++) {
1263 +               rq_src = cpu_rq(cpu_logical_map(i));
1264 +               if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
1265 +                       load = rq_src->nr_running;
1266 +               else
1267 +                       load = this_rq->prev_nr_running[i];
1268 +               this_rq->prev_nr_running[i] = rq_src->nr_running;
1269 +
1270 +               if ((load > max_load) && (rq_src != this_rq)) {
1271 +                       busiest = rq_src;
1272 +                       max_load = load;
1273 +               }
1274 +       }
1275
1276 -       if (!current->active_mm) BUG();
1277 -need_resched_back:
1278 -       prev = current;
1279 -       this_cpu = prev->processor;
1280 +       if (likely(!busiest))
1281 +               return;
1282
1283 -       if (unlikely(in_interrupt())) {
1284 -               printk("Scheduling in interrupt\n");
1285 -               BUG();
1286 -       }
1287 +       imbalance = (max_load - nr_running) / 2;
1288
1289 -       release_kernel_lock(prev, this_cpu);
 1290 +       /* It needs at least a ~25% imbalance to trigger balancing. */
1291 +       if (!idle && (imbalance < (max_load + 3)/4))
1292 +               return;
1293
1294 +       nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
1295         /*
1296 -        * 'sched_data' is protected by the fact that we can run
1297 -        * only one process per CPU.
1298 +        * Make sure nothing changed since we checked the
1299 +        * runqueue length.
1300          */
1301 -       sched_data = & aligned_data[this_cpu].schedule_data;
1302 +       if (busiest->nr_running <= this_rq->nr_running + 1)
1303 +               goto out_unlock;
1304
1305 -       spin_lock_irq(&runqueue_lock);
1306 +       /*
1307 +        * We first consider expired tasks. Those will likely not be
1308 +        * executed in the near future, and they are most likely to
1309 +        * be cache-cold, thus switching CPUs has the least effect
1310 +        * on them.
1311 +        */
1312 +       if (busiest->expired->nr_active)
1313 +               array = busiest->expired;
1314 +       else
1315 +               array = busiest->active;
1316
1317 -       /* move an exhausted RR process to be last.. */
1318 -       if (unlikely(prev->policy == SCHED_RR))
1319 -               if (!prev->counter) {
1320 -                       prev->counter = NICE_TO_TICKS(prev->nice);
1321 -                       move_last_runqueue(prev);
1322 +new_array:
1323 +       /* Start searching at priority 0: */
1324 +       idx = 0;
1325 +skip_bitmap:
1326 +       if (!idx)
1327 +               idx = sched_find_first_bit(array->bitmap);
1328 +       else
1329 +               idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1330 +       if (idx == MAX_PRIO) {
1331 +               if (array == busiest->expired) {
1332 +                       array = busiest->active;
1333 +                       goto new_array;
1334                 }
1335 -
1336 -       switch (prev->state) {
1337 -               case TASK_INTERRUPTIBLE:
1338 -                       if (signal_pending(prev)) {
1339 -                               prev->state = TASK_RUNNING;
1340 -                               break;
1341 -                       }
1342 -               default:
1343 -                       del_from_runqueue(prev);
1344 -               case TASK_RUNNING:;
1345 +               goto out_unlock;
1346         }
1347 -       prev->need_resched = 0;
1348 -
1349 -       /*
1350 -        * this is the scheduler proper:
1351 -        */
1352
1353 -repeat_schedule:
1354 -       /*
1355 -        * Default process to select..
1356 -        */
1357 -       next = idle_task(this_cpu);
1358 -       c = -1000;
1359 -       list_for_each(tmp, &runqueue_head) {
1360 -               p = list_entry(tmp, struct task_struct, run_list);
1361 -               if (can_schedule(p, this_cpu)) {
1362 -                       int weight = goodness(p, this_cpu, prev->active_mm);
1363 -                       if (weight > c)
1364 -                               c = weight, next = p;
1365 +       head = array->queue + idx;
1366 +       curr = head->prev;
1367 +skip_queue:
1368 +       tmp = list_entry(curr, task_t, run_list);
1369 +
1370 +       /*
1371 +        * We do not migrate tasks that are:
1372 +        * 1) running (obviously), or
1373 +        * 2) cannot be migrated to this CPU due to cpus_allowed, or
1374 +        * 3) are cache-hot on their current CPU.
1375 +        */
1376 +
1377 +#define CAN_MIGRATE_TASK(p,rq,this_cpu)                                        \
1378 +       ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) &&        \
1379 +               ((p) != (rq)->curr) &&                                  \
1380 +                       (tmp->cpus_allowed & (1 << (this_cpu))))
1381 +
1382 +       if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
1383 +               curr = curr->next;
1384 +               if (curr != head)
1385 +                       goto skip_queue;
1386 +               idx++;
1387 +               goto skip_bitmap;
1388 +       }
1389 +       next = tmp;
1390 +       /*
1391 +        * take the task out of the other runqueue and
1392 +        * put it into this one:
1393 +        */
1394 +       dequeue_task(next, array);
1395 +       busiest->nr_running--;
1396 +       next->cpu = this_cpu;
1397 +       this_rq->nr_running++;
1398 +       enqueue_task(next, this_rq->active);
1399 +       if (next->prio < current->prio)
1400 +               current->need_resched = 1;
1401 +       if (!idle && --imbalance) {
1402 +               if (array == busiest->expired) {
1403 +                       array = busiest->active;
1404 +                       goto new_array;
1405                 }
1406         }
1407 +out_unlock:
1408 +       spin_unlock(&busiest->lock);
1409 +}
1410 +
1411 +/*
 1412 + * Either idle_tick() or the busy path of scheduler_tick() gets
 1413 + * called every timer tick, on every CPU. Our balancing frequency
 1414 + * and balancing aggressiveness depend on whether the CPU is
 1415 + * idle or not.
1416 + *
1417 + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
1418 + * systems with HZ=100, every 10 msecs.)
1419 + */
1420 +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
1421 +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1422 +
1423 +static inline void idle_tick(void)
1424 +{
1425 +       if (jiffies % IDLE_REBALANCE_TICK)
1426 +               return;
1427 +       spin_lock(&this_rq()->lock);
1428 +       load_balance(this_rq(), 1);
1429 +       spin_unlock(&this_rq()->lock);
1430 +}
1431 +
1432 +#endif
1433
1434 -       /* Do we need to re-calculate counters? */
1435 -       if (unlikely(!c)) {
1436 -               struct task_struct *p;
1437 -
1438 -               spin_unlock_irq(&runqueue_lock);
1439 -               read_lock(&tasklist_lock);
1440 -               for_each_task(p)
1441 -                       p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
1442 -               read_unlock(&tasklist_lock);
1443 -               spin_lock_irq(&runqueue_lock);
1444 -               goto repeat_schedule;
1445 +/*
1446 + * We place interactive tasks back into the active array, if possible.
1447 + *
1448 + * To guarantee that this does not starve expired tasks we ignore the
1449 + * interactivity of a task if the first expired task had to wait more
1450 + * than a 'reasonable' amount of time. This deadline timeout is
 1451 + * load-dependent, as the frequency of array switches decreases with
1452 + * increasing number of running tasks:
1453 + */
1454 +#define EXPIRED_STARVING(rq) \
1455 +               ((rq)->expired_timestamp && \
1456 +               (jiffies - (rq)->expired_timestamp >= \
1457 +                       STARVATION_LIMIT * ((rq)->nr_running) + 1))
1458 +
1459 +/*
1460 + * This function gets called by the timer code, with HZ frequency.
1461 + * We call it with interrupts disabled.
1462 + */
1463 +void scheduler_tick(int user_tick, int system)
1464 +{
1465 +       int cpu = smp_processor_id();
1466 +       runqueue_t *rq = this_rq();
1467 +       task_t *p = current;
1468 +
1469 +       if (p == rq->idle) {
1470 +               if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
1471 +                       kstat.per_cpu_system[cpu] += system;
1472 +#if CONFIG_SMP
1473 +               idle_tick();
1474 +#endif
1475 +               return;
1476         }
1477 +       if (TASK_NICE(p) > 0)
1478 +               kstat.per_cpu_nice[cpu] += user_tick;
1479 +       else
1480 +               kstat.per_cpu_user[cpu] += user_tick;
1481 +       kstat.per_cpu_system[cpu] += system;
1482
1483 +       /* Task might have expired already, but not scheduled off yet */
1484 +       if (p->array != rq->active) {
1485 +               p->need_resched = 1;
1486 +               return;
1487 +       }
1488 +       spin_lock(&rq->lock);
1489 +       if (unlikely(rt_task(p))) {
1490 +               /*
1491 +                * RR tasks need a special form of timeslice management.
1492 +                * FIFO tasks have no timeslices.
1493 +                */
1494 +               if ((p->policy == SCHED_RR) && !--p->time_slice) {
1495 +                       p->time_slice = TASK_TIMESLICE(p);
1496 +                       p->need_resched = 1;
1497 +
1498 +                       /* put it at the end of the queue: */
1499 +                       dequeue_task(p, rq->active);
1500 +                       enqueue_task(p, rq->active);
1501 +               }
1502 +               goto out;
1503 +       }
1504         /*
1505 -        * from this point on nothing can prevent us from
1506 -        * switching to the next task, save this fact in
1507 -        * sched_data.
1508 -        */
1509 -       sched_data->curr = next;
1510 -       task_set_cpu(next, this_cpu);
1511 -       spin_unlock_irq(&runqueue_lock);
1512 -
1513 -       if (unlikely(prev == next)) {
1514 -               /* We won't go through the normal tail, so do this by hand */
1515 -               prev->policy &= ~SCHED_YIELD;
1516 -               goto same_process;
1517 +        * The task was running during this tick - update the
1518 +        * time slice counter and the sleep average. Note: we
1519 +        * do not update a process's priority until it either
1520 +        * goes to sleep or uses up its timeslice. This makes
1521 +        * it possible for interactive tasks to use up their
1522 +        * timeslices at their highest priority levels.
1523 +        */
1524 +       if (p->sleep_avg)
1525 +               p->sleep_avg--;
1526 +       if (!--p->time_slice) {
1527 +               dequeue_task(p, rq->active);
1528 +               p->need_resched = 1;
1529 +               p->prio = effective_prio(p);
1530 +               p->time_slice = TASK_TIMESLICE(p);
1531 +
1532 +               if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1533 +                       if (!rq->expired_timestamp)
1534 +                               rq->expired_timestamp = jiffies;
1535 +                       enqueue_task(p, rq->expired);
1536 +               } else
1537 +                       enqueue_task(p, rq->active);
1538         }
1539 +out:
1540 +#if CONFIG_SMP
1541 +       if (!(jiffies % BUSY_REBALANCE_TICK))
1542 +               load_balance(rq, 0);
1543 +#endif
1544 +       spin_unlock(&rq->lock);
1545 +}
1546
1547 -#ifdef CONFIG_SMP
1548 -       /*
1549 -        * maintain the per-process 'last schedule' value.
1550 -        * (this has to be recalculated even if we reschedule to
1551 -        * the same process) Currently this is only used on SMP,
1552 -        * and it's approximate, so we do not have to maintain
1553 -        * it while holding the runqueue spinlock.
1554 -        */
1555 -       sched_data->last_schedule = get_cycles();
1556 +void scheduling_functions_start_here(void) { }
1557
1558 -       /*
1559 -        * We drop the scheduler lock early (it's a global spinlock),
1560 -        * thus we have to lock the previous process from getting
1561 -        * rescheduled during switch_to().
1562 -        */
1563 +/*
1564 + * 'schedule()' is the main scheduler function.
1565 + */
1566 +asmlinkage void schedule(void)
1567 +{
1568 +       task_t *prev = current, *next;
1569 +       runqueue_t *rq = this_rq();
1570 +       prio_array_t *array;
1571 +       list_t *queue;
1572 +       int idx;
1573
1574 -#endif /* CONFIG_SMP */
1575 +       if (unlikely(in_interrupt()))
1576 +               BUG();
1577 +       release_kernel_lock(prev, smp_processor_id());
1578 +       prev->sleep_timestamp = jiffies;
1579 +       spin_lock_irq(&rq->lock);
1580
1581 -       kstat.context_swtch++;
1582 -       /*
1583 -        * there are 3 processes which are affected by a context switch:
1584 -        *
1585 -        * prev == .... ==> (last => next)
1586 -        *
1587 -        * It's the 'much more previous' 'prev' that is on next's stack,
1588 -        * but prev is set to (the just run) 'last' process by switch_to().
1589 -        * This might sound slightly confusing but makes tons of sense.
1590 -        */
1591 -       prepare_to_switch();
1592 -       {
1593 -               struct mm_struct *mm = next->mm;
1594 -               struct mm_struct *oldmm = prev->active_mm;
1595 -               if (!mm) {
1596 -                       if (next->active_mm) BUG();
1597 -                       next->active_mm = oldmm;
1598 -                       atomic_inc(&oldmm->mm_count);
1599 -                       enter_lazy_tlb(oldmm, next, this_cpu);
1600 -               } else {
1601 -                       if (next->active_mm != mm) BUG();
1602 -                       switch_mm(oldmm, mm, next, this_cpu);
1603 +       switch (prev->state) {
1604 +       case TASK_INTERRUPTIBLE:
1605 +               if (unlikely(signal_pending(prev))) {
1606 +                       prev->state = TASK_RUNNING;
1607 +                       break;
1608                 }
1609 +       default:
1610 +               deactivate_task(prev, rq);
1611 +       case TASK_RUNNING:
1612 +               ;
1613 +       }
1614 +#if CONFIG_SMP
1615 +pick_next_task:
1616 +#endif
1617 +       if (unlikely(!rq->nr_running)) {
1618 +#if CONFIG_SMP
1619 +               load_balance(rq, 1);
1620 +               if (rq->nr_running)
1621 +                       goto pick_next_task;
1622 +#endif
1623 +               next = rq->idle;
1624 +               rq->expired_timestamp = 0;
1625 +               goto switch_tasks;
1626 +       }
1627
1628 -               if (!prev->mm) {
1629 -                       prev->active_mm = NULL;
1630 -                       mmdrop(oldmm);
1631 -               }
1632 +       array = rq->active;
1633 +       if (unlikely(!array->nr_active)) {
1634 +               /*
1635 +                * Switch the active and expired arrays.
1636 +                */
1637 +               rq->active = rq->expired;
1638 +               rq->expired = array;
1639 +               array = rq->active;
1640 +               rq->expired_timestamp = 0;
1641         }
1642
1643 -       /*
1644 -        * This just switches the register state and the
1645 -        * stack.
1646 -        */
1647 -       switch_to(prev, next, prev);
1648 -       __schedule_tail(prev);
1649 +       idx = sched_find_first_bit(array->bitmap);
1650 +       queue = array->queue + idx;
1651 +       next = list_entry(queue->next, task_t, run_list);
1652 +
1653 +switch_tasks:
1654 +       prefetch(next);
1655 +       prev->need_resched = 0;
1656 +
1657 +       if (likely(prev != next)) {
1658 +               rq->nr_switches++;
1659 +               rq->curr = next;
1660 +               context_switch(prev, next);
1661 +               /*
1662 +                * The runqueue pointer might be from another CPU
1663 +                * if the new task was last running on a different
1664 +                * CPU - thus re-load it.
1665 +                */
1666 +               barrier();
1667 +               rq = this_rq();
1668 +       }
1669 +       spin_unlock_irq(&rq->lock);
1670
1671 -same_process:
1672         reacquire_kernel_lock(current);
1673 -       if (current->need_resched)
1674 -               goto need_resched_back;
1675         return;
1676  }
1677
1678  /*
1679 - * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just wake everything
1680 - * up.  If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
1681 - * non-exclusive tasks and one exclusive task.
1682 + * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
1683 + * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
1684 + * number) then we wake all the non-exclusive tasks and one exclusive task.
1685   *
1686   * There are circumstances in which we can try to wake a task which has already
1687 - * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns zero
1688 - * in this (rare) case, and we handle it by contonuing to scan the queue.
1689 + * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
1690 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
1691   */
1692  static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
1693                                      int nr_exclusive, const int sync)
1694  {
1695         struct list_head *tmp;
1696 -       struct task_struct *p;
1697 +       task_t *p;
1698
1699 -       CHECK_MAGIC_WQHEAD(q);
1700 -       WQ_CHECK_LIST_HEAD(&q->task_list);
1701 -
1702         list_for_each(tmp,&q->task_list) {
1703                 unsigned int state;
1704 -                wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1705 +               wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1706
1707 -               CHECK_MAGIC(curr->__magic);
1708                 p = curr->task;
1709                 state = p->state;
1710 -               if (state & mode) {
1711 -                       WQ_NOTE_WAKER(curr);
1712 -                       if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
1713 -                               break;
1714 -               }
1715 +               if ((state & mode) &&
1716 +                               try_to_wake_up(p, sync) &&
1717 +                               ((curr->flags & WQ_FLAG_EXCLUSIVE) &&
1718 +                                       !--nr_exclusive))
1719 +                       break;
1720         }
1721  }
1722
1723 @@ -850,8 +959,71 @@
1724         return timeout;
1725  }
1726
1727 +/*
1728 + * Change the current task's CPU affinity. Migrate the process to a
1729 + * proper CPU and schedule away if the current CPU is removed from
1730 + * the allowed bitmask.
1731 + */
1732 +void set_cpus_allowed(task_t *p, unsigned long new_mask)
1733 +{
1734 +       new_mask &= cpu_online_map;
1735 +       if (!new_mask)
1736 +               BUG();
1737 +       if (p != current)
1738 +               BUG();
1739 +
1740 +       p->cpus_allowed = new_mask;
1741 +       /*
1742 +        * Can the task run on the current CPU? If not then
1743 +        * migrate the process off to a proper CPU.
1744 +        */
1745 +       if (new_mask & (1UL << smp_processor_id()))
1746 +               return;
1747 +#if CONFIG_SMP
1748 +       current->state = TASK_UNINTERRUPTIBLE;
1749 +       smp_migrate_task(__ffs(new_mask), current);
1750 +
1751 +       schedule();
1752 +#endif
1753 +}
1754 +
1755  void scheduling_functions_end_here(void) { }
1756
1757 +void set_user_nice(task_t *p, long nice)
1758 +{
1759 +       unsigned long flags;
1760 +       prio_array_t *array;
1761 +       runqueue_t *rq;
1762 +
1763 +       if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1764 +               return;
1765 +       /*
1766 +        * We have to be careful, if called from sys_setpriority(),
1767 +        * the task might be in the middle of scheduling on another CPU.
1768 +        */
1769 +       rq = lock_task_rq(p, &flags);
1770 +       if (rt_task(p)) {
1771 +               p->static_prio = NICE_TO_PRIO(nice);
1772 +               goto out_unlock;
1773 +       }
1774 +       array = p->array;
1775 +       if (array)
1776 +               dequeue_task(p, array);
1777 +       p->static_prio = NICE_TO_PRIO(nice);
1778 +       p->prio = NICE_TO_PRIO(nice);
1779 +       if (array) {
1780 +               enqueue_task(p, array);
1781 +               /*
1782 +                * If the task is running and lowered its priority,
1783 +                * or increased its priority then reschedule its CPU:
1784 +                */
1785 +               if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr))
1786 +                       resched_task(rq->curr);
1787 +       }
1788 +out_unlock:
1789 +       unlock_task_rq(rq, &flags);
1790 +}
1791 +
1792  #ifndef __alpha__
1793
1794  /*
1795 @@ -862,7 +1034,7 @@
1796
1797  asmlinkage long sys_nice(int increment)
1798  {
1799 -       long newprio;
1800 +       long nice;
1801
1802         /*
1803          *      Setpriority might change our priority at the same moment.
1804 @@ -878,32 +1050,46 @@
1805         if (increment > 40)
1806                 increment = 40;
1807
1808 -       newprio = current->nice + increment;
1809 -       if (newprio < -20)
1810 -               newprio = -20;
1811 -       if (newprio > 19)
1812 -               newprio = 19;
1813 -       current->nice = newprio;
1814 +       nice = PRIO_TO_NICE(current->static_prio) + increment;
1815 +       if (nice < -20)
1816 +               nice = -20;
1817 +       if (nice > 19)
1818 +               nice = 19;
1819 +       set_user_nice(current, nice);
1820         return 0;
1821  }
1822
1823  #endif
1824
1825 -static inline struct task_struct *find_process_by_pid(pid_t pid)
1826 +/*
1827 + * This is the priority value as seen by users in /proc
1828 + *
1829 + * RT tasks are offset by -200. Normal tasks are centered
1830 + * around 0, value goes from -16 to +15.
1831 + */
1832 +int task_prio(task_t *p)
1833  {
1834 -       struct task_struct *tsk = current;
1835 +       return p->prio - 100;
1836 +}
1837
1838 -       if (pid)
1839 -               tsk = find_task_by_pid(pid);
1840 -       return tsk;
1841 +int task_nice(task_t *p)
1842 +{
1843 +       return TASK_NICE(p);
1844 +}
1845 +
1846 +static inline task_t *find_process_by_pid(pid_t pid)
1847 +{
1848 +       return pid ? find_task_by_pid(pid) : current;
1849  }
1850
1851 -static int setscheduler(pid_t pid, int policy,
1852 -                       struct sched_param *param)
1853 +static int setscheduler(pid_t pid, int policy, struct sched_param *param)
1854  {
1855         struct sched_param lp;
1856 -       struct task_struct *p;
1857 +       prio_array_t *array;
1858 +       unsigned long flags;
1859 +       runqueue_t *rq;
1860         int retval;
1861 +       task_t *p;
1862
1863         retval = -EINVAL;
1864         if (!param || pid < 0)
1865 @@ -917,14 +1103,19 @@
1866          * We play safe to avoid deadlocks.
1867          */
1868         read_lock_irq(&tasklist_lock);
1869 -       spin_lock(&runqueue_lock);
1870
1871         p = find_process_by_pid(pid);
1872
1873         retval = -ESRCH;
1874         if (!p)
1875 -               goto out_unlock;
1876 -
1877 +               goto out_unlock_tasklist;
1878 +
1879 +       /*
1880 +        * To be able to change p->policy safely, the appropriate
1881 +        * runqueue lock must be held.
1882 +        */
1883 +       rq = lock_task_rq(p, &flags);
1884 +
1885         if (policy < 0)
1886                 policy = p->policy;
1887         else {
1888 @@ -945,30 +1136,36 @@
1889                 goto out_unlock;
1890
1891         retval = -EPERM;
1892 -       if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1893 +       if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1894             !capable(CAP_SYS_NICE))
1895                 goto out_unlock;
1896         if ((current->euid != p->euid) && (current->euid != p->uid) &&
1897             !capable(CAP_SYS_NICE))
1898                 goto out_unlock;
1899
1900 +       array = p->array;
1901 +       if (array)
1902 +               deactivate_task(p, task_rq(p));
1903         retval = 0;
1904         p->policy = policy;
1905         p->rt_priority = lp.sched_priority;
1906 -       if (task_on_runqueue(p))
1907 -               move_first_runqueue(p);
1908 -
1909 -       current->need_resched = 1;
1910 +       if (rt_task(p))
1911 +               p->prio = 99 - p->rt_priority;
1912 +       else
1913 +               p->prio = p->static_prio;
1914 +       if (array)
1915 +               activate_task(p, task_rq(p));
1916
1917  out_unlock:
1918 -       spin_unlock(&runqueue_lock);
1919 +       unlock_task_rq(rq, &flags);
1920 +out_unlock_tasklist:
1921         read_unlock_irq(&tasklist_lock);
1922
1923  out_nounlock:
1924         return retval;
1925  }
1926
1927 -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1928 +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1929                                       struct sched_param *param)
1930  {
1931         return setscheduler(pid, policy, param);
1932 @@ -981,7 +1178,7 @@
1933
1934  asmlinkage long sys_sched_getscheduler(pid_t pid)
1935  {
1936 -       struct task_struct *p;
1937 +       task_t *p;
1938         int retval;
1939
1940         retval = -EINVAL;
1941 @@ -992,7 +1189,7 @@
1942         read_lock(&tasklist_lock);
1943         p = find_process_by_pid(pid);
1944         if (p)
1945 -               retval = p->policy & ~SCHED_YIELD;
1946 +               retval = p->policy;
1947         read_unlock(&tasklist_lock);
1948
1949  out_nounlock:
1950 @@ -1001,7 +1198,7 @@
1951
1952  asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
1953  {
1954 -       struct task_struct *p;
1955 +       task_t *p;
1956         struct sched_param lp;
1957         int retval;
1958
1959 @@ -1032,42 +1229,64 @@
1960
1961  asmlinkage long sys_sched_yield(void)
1962  {
1963 +       task_t *prev = current, *next;
1964 +       runqueue_t *rq = this_rq();
1965 +       prio_array_t *array;
1966 +       list_t *queue;
1967 +
1968 +       if (unlikely(prev->state != TASK_RUNNING)) {
1969 +               schedule();
1970 +               return 0;
1971 +       }
1972 +       release_kernel_lock(prev, smp_processor_id());
1973 +       prev->sleep_timestamp = jiffies;
1974         /*
1975 -        * Trick. sched_yield() first counts the number of truly
1976 -        * 'pending' runnable processes, then returns if it's
1977 -        * only the current processes. (This test does not have
1978 -        * to be atomic.) In threaded applications this optimization
1979 -        * gets triggered quite often.
1980 +        * Decrease the yielding task's priority by one, to avoid
1981 +        * livelocks. This priority loss is temporary, it's recovered
1982 +        * once the current timeslice expires.
1983 +        *
1984 +        * If priority is already MAX_PRIO-1 then we still
1985 +        * roundrobin the task within the runlist.
1986          */
1987 +       spin_lock_irq(&rq->lock);
1988 +       array = current->array;
1989 +       /*
1990 +        * If the task already has the lowest priority (prio == MAX_PRIO-1)
1991 +        * or is a RT task, then just requeue the task to the end of the runqueue:
1992 +        */
1993 +       if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
1994 +               list_del(&current->run_list);
1995 +               list_add_tail(&current->run_list, array->queue + current->prio);
1996 +       } else {
1997 +               list_del(&current->run_list);
1998 +               if (list_empty(array->queue + current->prio))
1999 +                       __clear_bit(current->prio, array->bitmap);
2000 +               current->prio++;
2001 +               list_add_tail(&current->run_list, array->queue + current->prio);
2002 +               __set_bit(current->prio, array->bitmap);
2003 +       }
2004 +       /*
2005 +        * Context-switch manually. This is equivalent to
2006 +        * calling schedule(), but faster, because yield()
2007 +        * knows lots of things that can be optimized away
2008 +        * from the generic scheduler path:
2009 +        */
2010 +       queue = array->queue + sched_find_first_bit(array->bitmap);
2011 +       next = list_entry(queue->next, task_t, run_list);
2012 +       prefetch(next);
2013
2014 -       int nr_pending = nr_running;
2015 -
2016 -#if CONFIG_SMP
2017 -       int i;
2018 -
2019 -       // Subtract non-idle processes running on other CPUs.
2020 -       for (i = 0; i < smp_num_cpus; i++) {
2021 -               int cpu = cpu_logical_map(i);
2022 -               if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
2023 -                       nr_pending--;
2024 +       prev->need_resched = 0;
2025 +       if (likely(prev != next)) {
2026 +               rq->nr_switches++;
2027 +               rq->curr = next;
2028 +               context_switch(prev, next);
2029 +               barrier();
2030 +               rq = this_rq();
2031         }
2032 -#else
2033 -       // on UP this process is on the runqueue as well
2034 -       nr_pending--;
2035 -#endif
2036 -       if (nr_pending) {
2037 -               /*
2038 -                * This process can only be rescheduled by us,
2039 -                * so this is safe without any locking.
2040 -                */
2041 -               if (current->policy == SCHED_OTHER)
2042 -                       current->policy |= SCHED_YIELD;
2043 -               current->need_resched = 1;
2044 +       spin_unlock_irq(&rq->lock);
2045 +
2046 +       reacquire_kernel_lock(current);
2047
2048 -               spin_lock_irq(&runqueue_lock);
2049 -               move_last_runqueue(current);
2050 -               spin_unlock_irq(&runqueue_lock);
2051 -       }
2052         return 0;
2053  }
2054
2055 @@ -1105,7 +1324,7 @@
2056  asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
2057  {
2058         struct timespec t;
2059 -       struct task_struct *p;
2060 +       task_t *p;
2061         int retval = -EINVAL;
2062
2063         if (pid < 0)
2064 @@ -1115,8 +1334,8 @@
2065         read_lock(&tasklist_lock);
2066         p = find_process_by_pid(pid);
2067         if (p)
2068 -               jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
2069 -                                   &t);
2070 +               jiffies_to_timespec(p->policy & SCHED_FIFO ?
2071 +                                        0 : TASK_TIMESLICE(p), &t);
2072         read_unlock(&tasklist_lock);
2073         if (p)
2074                 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2075 @@ -1124,14 +1343,14 @@
2076         return retval;
2077  }
2078
2079 -static void show_task(struct task_struct * p)
2080 +static void show_task(task_t * p)
2081  {
2082         unsigned long free = 0;
2083         int state;
2084         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
2085
2086         printk("%-13.13s ", p->comm);
2087 -       state = p->state ? ffz(~p->state) + 1 : 0;
2088 +       state = p->state ? __ffs(p->state) + 1 : 0;
2089         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
2090                 printk(stat_nam[state]);
2091         else
2092 @@ -1172,7 +1391,7 @@
2093                 printk(" (NOTLB)\n");
2094
2095         {
2096 -               extern void show_trace_task(struct task_struct *tsk);
2097 +               extern void show_trace_task(task_t *tsk);
2098                 show_trace_task(p);
2099         }
2100  }
2101 @@ -1194,7 +1413,7 @@
2102
2103  void show_state(void)
2104  {
2105 -       struct task_struct *p;
2106 +       task_t *p;
2107
2108  #if (BITS_PER_LONG == 32)
2109         printk("\n"
2110 @@ -1217,121 +1436,88 @@
2111         read_unlock(&tasklist_lock);
2112  }
2113
2114 -/**
2115 - * reparent_to_init() - Reparent the calling kernel thread to the init task.
2116 - *
2117 - * If a kernel thread is launched as a result of a system call, or if
2118 - * it ever exits, it should generally reparent itself to init so that
2119 - * it is correctly cleaned up on exit.
2120 - *
2121 - * The various task state such as scheduling policy and priority may have
2122 - * been inherited fro a user process, so we reset them to sane values here.
2123 - *
2124 - * NOTE that reparent_to_init() gives the caller full capabilities.
2125 - */
2126 -void reparent_to_init(void)
2127 +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
2128  {
2129 -       struct task_struct *this_task = current;
2130 -
2131 -       write_lock_irq(&tasklist_lock);
2132 -
2133 -       /* Reparent to init */
2134 -       REMOVE_LINKS(this_task);
2135 -       this_task->p_pptr = child_reaper;
2136 -       this_task->p_opptr = child_reaper;
2137 -       SET_LINKS(this_task);
2138 -
2139 -       /* Set the exit signal to SIGCHLD so we signal init on exit */
2140 -       this_task->exit_signal = SIGCHLD;
2141 -
2142 -       /* We also take the runqueue_lock while altering task fields
2143 -        * which affect scheduling decisions */
2144 -       spin_lock(&runqueue_lock);
2145 -
2146 -       this_task->ptrace = 0;
2147 -       this_task->nice = DEF_NICE;
2148 -       this_task->policy = SCHED_OTHER;
2149 -       /* cpus_allowed? */
2150 -       /* rt_priority? */
2151 -       /* signals? */
2152 -       this_task->cap_effective = CAP_INIT_EFF_SET;
2153 -       this_task->cap_inheritable = CAP_INIT_INH_SET;
2154 -       this_task->cap_permitted = CAP_FULL_SET;
2155 -       this_task->keep_capabilities = 0;
2156 -       memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
2157 -       this_task->user = INIT_USER;
2158 -
2159 -       spin_unlock(&runqueue_lock);
2160 -       write_unlock_irq(&tasklist_lock);
2161 +       if (rq1 == rq2)
2162 +               spin_lock(&rq1->lock);
2163 +       else {
2164 +               if (rq1 < rq2) {
2165 +                       spin_lock(&rq1->lock);
2166 +                       spin_lock(&rq2->lock);
2167 +               } else {
2168 +                       spin_lock(&rq2->lock);
2169 +                       spin_lock(&rq1->lock);
2170 +               }
2171 +       }
2172  }
2173
2174 -/*
2175 - *     Put all the gunge required to become a kernel thread without
2176 - *     attached user resources in one place where it belongs.
2177 - */
2178 -
2179 -void daemonize(void)
2180 +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
2181  {
2182 -       struct fs_struct *fs;
2183 -
2184 -
2185 -       /*
2186 -        * If we were started as result of loading a module, close all of the
2187 -        * user space pages.  We don't need them, and if we didn't close them
2188 -        * they would be locked into memory.
2189 -        */
2190 -       exit_mm(current);
2191 -
2192 -       current->session = 1;
2193 -       current->pgrp = 1;
2194 -       current->tty = NULL;
2195 -
2196 -       /* Become as one with the init task */
2197 -
2198 -       exit_fs(current);       /* current->fs->count--; */
2199 -       fs = init_task.fs;
2200 -       current->fs = fs;
2201 -       atomic_inc(&fs->count);
2202 -       exit_files(current);
2203 -       current->files = init_task.files;
2204 -       atomic_inc(&current->files->count);
2205 +       spin_unlock(&rq1->lock);
2206 +       if (rq1 != rq2)
2207 +               spin_unlock(&rq2->lock);
2208  }
2209
2210 -extern unsigned long wait_init_idle;
2211 -
2212 -void __init init_idle(void)
2213 +void __init init_idle(task_t *idle, int cpu)
2214  {
2215 -       struct schedule_data * sched_data;
2216 -       sched_data = &aligned_data[smp_processor_id()].schedule_data;
2217 +       runqueue_t *idle_rq = cpu_rq(cpu), *rq = idle->array->rq;
2218 +       unsigned long flags;
2219
2220 -       if (current != &init_task && task_on_runqueue(current)) {
2221 -               printk("UGH! (%d:%d) was on the runqueue, removing.\n",
2222 -                       smp_processor_id(), current->pid);
2223 -               del_from_runqueue(current);
2224 -       }
2225 -       sched_data->curr = current;
2226 -       sched_data->last_schedule = get_cycles();
2227 -       clear_bit(current->processor, &wait_init_idle);
2228 +       __save_flags(flags);
2229 +       __cli();
2230 +       double_rq_lock(idle_rq, rq);
2231 +
2232 +       idle_rq->curr = idle_rq->idle = idle;
2233 +       deactivate_task(idle, rq);
2234 +       idle->array = NULL;
2235 +       idle->prio = MAX_PRIO;
2236 +       idle->state = TASK_RUNNING;
2237 +       idle->cpu = cpu;
2238 +       double_rq_unlock(idle_rq, rq);
2239 +       idle->need_resched = 1;
2240 +       __restore_flags(flags);
2241  }
2242
2243 -extern void init_timervecs (void);
2244 +extern void init_timervecs(void);
2245 +extern void timer_bh(void);
2246 +extern void tqueue_bh(void);
2247 +extern void immediate_bh(void);
2248
2249  void __init sched_init(void)
2250  {
2251 +       runqueue_t *rq;
2252 +       int i, j, k;
2253 +
2254 +       for (i = 0; i < NR_CPUS; i++) {
2255 +               runqueue_t *rq = cpu_rq(i);
2256 +               prio_array_t *array;
2257 +
2258 +               rq->active = rq->arrays + 0;
2259 +               rq->expired = rq->arrays + 1;
2260 +               spin_lock_init(&rq->lock);
2261 +
2262 +               for (j = 0; j < 2; j++) {
2263 +                       array = rq->arrays + j;
2264 +                       array->rq = rq;
2265 +                       array->lock = &rq->lock;
2266 +                       for (k = 0; k < MAX_PRIO; k++) {
2267 +                               INIT_LIST_HEAD(array->queue + k);
2268 +                               __clear_bit(k, array->bitmap);
2269 +                       }
2270 +                       // delimiter for bitsearch
2271 +                       __set_bit(MAX_PRIO, array->bitmap);
2272 +               }
2273 +       }
2274         /*
2275          * We have to do a little magic to get the first
2276          * process right in SMP mode.
2277          */
2278 -       int cpu = smp_processor_id();
2279 -       int nr;
2280 -
2281 -       init_task.processor = cpu;
2282 -
2283 -       for(nr = 0; nr < PIDHASH_SZ; nr++)
2284 -               pidhash[nr] = NULL;
2285 +       rq = this_rq();
2286 +       rq->curr = current;
2287 +       rq->idle = current;
2288 +       wake_up_process(current);
2289
2290         init_timervecs();
2291 -
2292         init_bh(TIMER_BH, timer_bh);
2293         init_bh(TQUEUE_BH, tqueue_bh);
2294         init_bh(IMMEDIATE_BH, immediate_bh);
2295 @@ -1340,5 +1526,5 @@
2296          * The boot idle thread does lazy MMU switching as well:
2297          */
2298         atomic_inc(&init_mm.mm_count);
2299 -       enter_lazy_tlb(&init_mm, current, cpu);
2300 +       enter_lazy_tlb(&init_mm, current, smp_processor_id());
2301  }
2302 --- linux/kernel/exit.c.orig    Sun Jan  6 13:55:56 2002
2303 +++ linux/kernel/exit.c Mon Feb  4 04:09:19 2002
2304 @@ -27,49 +27,22 @@
2305
2306  static void release_task(struct task_struct * p)
2307  {
2308 -       if (p != current) {
2309 +       if (p == current)
2310 +               BUG();
2311  #ifdef CONFIG_SMP
2312 -               /*
2313 -                * Wait to make sure the process isn't on the
2314 -                * runqueue (active on some other CPU still)
2315 -                */
2316 -               for (;;) {
2317 -                       task_lock(p);
2318 -                       if (!task_has_cpu(p))
2319 -                               break;
2320 -                       task_unlock(p);
2321 -                       do {
2322 -                               cpu_relax();
2323 -                               barrier();
2324 -                       } while (task_has_cpu(p));
2325 -               }
2326 -               task_unlock(p);
2327 +       wait_task_inactive(p);
2328  #endif
2329 -               atomic_dec(&p->user->processes);
2330 -               free_uid(p->user);
2331 -               unhash_process(p);
2332 -
2333 -               release_thread(p);
2334 -               current->cmin_flt += p->min_flt + p->cmin_flt;
2335 -               current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2336 -               current->cnswap += p->nswap + p->cnswap;
2337 -               /*
2338 -                * Potentially available timeslices are retrieved
2339 -                * here - this way the parent does not get penalized
2340 -                * for creating too many processes.
2341 -                *
2342 -                * (this cannot be used to artificially 'generate'
2343 -                * timeslices, because any timeslice recovered here
2344 -                * was given away by the parent in the first place.)
2345 -                */
2346 -               current->counter += p->counter;
2347 -               if (current->counter >= MAX_COUNTER)
2348 -                       current->counter = MAX_COUNTER;
2349 -               p->pid = 0;
2350 -               free_task_struct(p);
2351 -       } else {
2352 -               printk("task releasing itself\n");
2353 -       }
2354 +       atomic_dec(&p->user->processes);
2355 +       free_uid(p->user);
2356 +       unhash_process(p);
2357 +
2358 +       release_thread(p);
2359 +       current->cmin_flt += p->min_flt + p->cmin_flt;
2360 +       current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2361 +       current->cnswap += p->nswap + p->cnswap;
2362 +       sched_exit(p);
2363 +       p->pid = 0;
2364 +       free_task_struct(p);
2365  }
2366
2367  /*
2368 @@ -147,6 +120,79 @@
2369         }
2370         read_unlock(&tasklist_lock);
2371         return retval;
2372 +}
2373 +
2374 +/**
2375 + * reparent_to_init() - Reparent the calling kernel thread to the init task.
2376 + *
2377 + * If a kernel thread is launched as a result of a system call, or if
2378 + * it ever exits, it should generally reparent itself to init so that
2379 + * it is correctly cleaned up on exit.
2380 + *
2381 + * The various task state such as scheduling policy and priority may have
2382 + * been inherited from a user process, so we reset them to sane values here.
2383 + *
2384 + * NOTE that reparent_to_init() gives the caller full capabilities.
2385 + */
2386 +void reparent_to_init(void)
2387 +{
2388 +       write_lock_irq(&tasklist_lock);
2389 +
2390 +       /* Reparent to init */
2391 +       REMOVE_LINKS(current);
2392 +       current->p_pptr = child_reaper;
2393 +       current->p_opptr = child_reaper;
2394 +       SET_LINKS(current);
2395 +
2396 +       /* Set the exit signal to SIGCHLD so we signal init on exit */
2397 +       current->exit_signal = SIGCHLD;
2398 +
2399 +       current->ptrace = 0;
2400 +       if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
2401 +               set_user_nice(current, 0);
2402 +       /* cpus_allowed? */
2403 +       /* rt_priority? */
2404 +       /* signals? */
2405 +       current->cap_effective = CAP_INIT_EFF_SET;
2406 +       current->cap_inheritable = CAP_INIT_INH_SET;
2407 +       current->cap_permitted = CAP_FULL_SET;
2408 +       current->keep_capabilities = 0;
2409 +       memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
2410 +       current->user = INIT_USER;
2411 +
2412 +       write_unlock_irq(&tasklist_lock);
2413 +}
2414 +
2415 +/*
2416 + *     Put all the gunge required to become a kernel thread without
2417 + *     attached user resources in one place where it belongs.
2418 + */
2419 +
2420 +void daemonize(void)
2421 +{
2422 +       struct fs_struct *fs;
2423 +
2424 +
2425 +       /*
2426 +        * If we were started as result of loading a module, close all of the
2427 +        * user space pages.  We don't need them, and if we didn't close them
2428 +        * they would be locked into memory.
2429 +        */
2430 +       exit_mm(current);
2431 +
2432 +       current->session = 1;
2433 +       current->pgrp = 1;
2434 +       current->tty = NULL;
2435 +
2436 +       /* Become as one with the init task */
2437 +
2438 +       exit_fs(current);       /* current->fs->count--; */
2439 +       fs = init_task.fs;
2440 +       current->fs = fs;
2441 +       atomic_inc(&fs->count);
2442 +       exit_files(current);
2443 +       current->files = init_task.files;
2444 +       atomic_inc(&current->files->count);
2445  }
2446
2447  /*
2448 --- linux/kernel/capability.c.orig      Sat Jun 24 06:06:37 2000
2449 +++ linux/kernel/capability.c   Mon Feb  4 04:09:19 2002
2450 @@ -8,6 +8,8 @@
2451  #include <linux/mm.h>
2452  #include <asm/uaccess.h>
2453
2454 +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
2455 +
2456  kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
2457
2458  /* Note: never hold tasklist_lock while spinning for this one */
2459 --- linux/kernel/timer.c.orig   Sun Jan  6 13:55:49 2002
2460 +++ linux/kernel/timer.c        Mon Feb  4 04:09:19 2002
2461 @@ -25,6 +25,8 @@
2462
2463  #include <asm/uaccess.h>
2464
2465 +struct kernel_stat kstat;
2466 +
2467  /*
2468   * Timekeeping variables
2469   */
2470 @@ -582,18 +584,7 @@
2471         int cpu = smp_processor_id(), system = user_tick ^ 1;
2472
2473         update_one_process(p, user_tick, system, cpu);
2474 -       if (p->pid) {
2475 -               if (--p->counter <= 0) {
2476 -                       p->counter = 0;
2477 -                       p->need_resched = 1;
2478 -               }
2479 -               if (p->nice > 0)
2480 -                       kstat.per_cpu_nice[cpu] += user_tick;
2481 -               else
2482 -                       kstat.per_cpu_user[cpu] += user_tick;
2483 -               kstat.per_cpu_system[cpu] += system;
2484 -       } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
2485 -               kstat.per_cpu_system[cpu] += system;
2486 +       scheduler_tick(user_tick, system);
2487  }
2488
2489  /*
2490 @@ -794,6 +785,89 @@
2491
2492  #endif
2493
2494 +static void process_timeout(unsigned long __data)
2495 +{
2496 +       wake_up_process((task_t *)__data);      /* __data: task pointer that schedule_timeout() put in timer.data */
2497 +}
2498 +
2499 +/**
2500 + * schedule_timeout - sleep until timeout
2501 + * @timeout: timeout value in jiffies
2502 + *
2503 + * Make the current task sleep until @timeout jiffies have
2504 + * elapsed. The routine will return immediately unless
2505 + * the current task state has been set (see set_current_state()).
2506 + *
2507 + * You can set the task state as follows -
2508 + *
2509 + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
2510 + * pass before the routine returns. The routine will return 0.
2511 + *
2512 + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2513 + * delivered to the current task. In this case the remaining time
2514 + * in jiffies will be returned, or 0 if the timer expired in time.
2515 + *
2516 + * The current task state is guaranteed to be TASK_RUNNING when this
2517 + * routine returns.
2518 + *
2519 + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
2520 + * the CPU away without a bound on the timeout. In this case the return
2521 + * value will be %MAX_SCHEDULE_TIMEOUT.
2522 + *
2523 + * In all cases the return value is guaranteed to be non-negative.
2524 + */
2525 +signed long schedule_timeout(signed long timeout)
2526 +{
2527 +       struct timer_list timer;
2528 +       unsigned long expire;
2529 +
2530 +       switch (timeout)
2531 +       {
2532 +       case MAX_SCHEDULE_TIMEOUT:
2533 +               /*
2534 +                * These two special cases exist purely for the caller's
2535 +                * convenience. Nothing more. We could take
2536 +                * MAX_SCHEDULE_TIMEOUT from one of the negative values
2537 +                * but I'd like to return a valid offset (>=0) to allow
2538 +                * the caller to do everything it wants with the retval.
2539 +                */
2540 +               schedule();
2541 +               goto out;
2542 +       default:
2543 +               /*
2544 +                * Another bit of paranoia. Note that the retval will be
2545 +                * 0 since no piece of kernel is supposed to do a check
2546 +                * for a negative retval of schedule_timeout() (since it
2547 +                * should never happen anyway). You just have the printk()
2548 +                * that will tell you if something has gone wrong and where.
2549 +                */
2550 +               if (timeout < 0)
2551 +               {
2552 +                       printk(KERN_ERR "schedule_timeout: wrong timeout "
2553 +                              "value %lx from %p\n", timeout,
2554 +                              __builtin_return_address(0));
2555 +                       current->state = TASK_RUNNING;
2556 +                       goto out;
2557 +               }
2558 +       }
2559 +
2560 +       expire = timeout + jiffies;
2561 +
2562 +       init_timer(&timer);
2563 +       timer.expires = expire;
2564 +       timer.data = (unsigned long) current;
2565 +       timer.function = process_timeout;
2566 +
2567 +       add_timer(&timer);
2568 +       schedule();
2569 +       del_timer_sync(&timer);
2570 +
2571 +       timeout = expire - jiffies;     /* jiffies left; clamped to >= 0 at out: */
2572 +
2573 + out:
2574 +       return timeout < 0 ? 0 : timeout;
2575 +}
2576 +
2577  /* Thread ID - the internal kernel "pid" */
2578  asmlinkage long sys_gettid(void)
2579  {
2580 @@ -840,4 +914,3 @@
2581         }
2582         return 0;
2583  }
2584 -
2585 --- linux/kernel/fork.c.orig    Sun Jan  6 13:55:56 2002
2586 +++ linux/kernel/fork.c Mon Feb  4 04:09:19 2002
2587 @@ -28,7 +28,6 @@
2588
2589  /* The idle threads do not count.. */
2590  int nr_threads;
2591 -int nr_running;
2592
2593  int max_threads;
2594  unsigned long total_forks;     /* Handle normal Linux uptimes. */
2595 @@ -36,6 +35,8 @@
2596
2597  struct task_struct *pidhash[PIDHASH_SZ];
2598
2599 +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
2600 +
2601  void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
2602  {
2603         unsigned long flags;
2604 @@ -563,6 +564,7 @@
2605             struct pt_regs *regs, unsigned long stack_size)
2606  {
2607         int retval;
2608 +       unsigned long flags;
2609         struct task_struct *p;
2610         struct completion vfork;
2611
2612 @@ -611,8 +613,7 @@
2613         copy_flags(clone_flags, p);
2614         p->pid = get_pid(clone_flags);
2615
2616 -       p->run_list.next = NULL;
2617 -       p->run_list.prev = NULL;
2618 +       INIT_LIST_HEAD(&p->run_list);
2619
2620         p->p_cptr = NULL;
2621         init_waitqueue_head(&p->wait_chldexit);
2622 @@ -638,14 +639,15 @@
2623  #ifdef CONFIG_SMP
2624         {
2625                 int i;
2626 -               p->cpus_runnable = ~0UL;
2627 -               p->processor = current->processor;
2628 +
2629                 /* ?? should we just memset this ?? */
2630                 for(i = 0; i < smp_num_cpus; i++)
2631 -                       p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
2632 +                       p->per_cpu_utime[cpu_logical_map(i)] =
2633 +                               p->per_cpu_stime[cpu_logical_map(i)] = 0;
2634                 spin_lock_init(&p->sigmask_lock);
2635         }
2636  #endif
2637 +       p->array = NULL;
2638         p->lock_depth = -1;             /* -1 = no lock */
2639         p->start_time = jiffies;
2640
2641 @@ -677,15 +679,27 @@
2642         p->pdeath_signal = 0;
2643
2644         /*
2645 -        * "share" dynamic priority between parent and child, thus the
2646 -        * total amount of dynamic priorities in the system doesnt change,
2647 -        * more scheduling fairness. This is only important in the first
2648 -        * timeslice, on the long run the scheduling behaviour is unchanged.
2649 +        * Share the timeslice between parent and child, thus the
2650 +        * total amount of pending timeslices in the system doesn't change,
2651 +        * resulting in more scheduling fairness.
2652          */
2653 -       p->counter = (current->counter + 1) >> 1;
2654 -       current->counter >>= 1;
2655 -       if (!current->counter)
2656 -               current->need_resched = 1;
2657 +       __save_flags(flags);
2658 +       __cli();
2659 +       if (!current->time_slice)
2660 +               BUG();
2661 +       p->time_slice = (current->time_slice + 1) >> 1;
2662 +       current->time_slice >>= 1;
2663 +       if (!current->time_slice) {
2664 +               /*
2665 +                * This case is rare, it happens when the parent has only
2666 +                * a single jiffy left from its timeslice. Taking the
2667 +                * runqueue lock is not a problem.
2668 +                */
2669 +               current->time_slice = 1;
2670 +               scheduler_tick(0,0);
2671 +       }
2672 +       p->sleep_timestamp = jiffies;
2673 +       __restore_flags(flags);
2674
2675         /*
2676          * Ok, add it to the run-queues and make it
2677 @@ -722,10 +736,23 @@
2678         if (p->ptrace & PT_PTRACED)
2679                 send_sig(SIGSTOP, p, 1);
2680
2681 +#define RUN_CHILD_FIRST 1
2682 +#if RUN_CHILD_FIRST
2683 +       wake_up_forked_process(p);      /* do this last */
2684 +#else
2685         wake_up_process(p);             /* do this last */
2686 +#endif
2687         ++total_forks;
2688         if (clone_flags & CLONE_VFORK)
2689                 wait_for_completion(&vfork);
2690 +#if RUN_CHILD_FIRST
2691 +       else
2692 +               /*
2693 +                * Let the child process run first, to avoid most of the
2694 +                * COW overhead when the child exec()s afterwards.
2695 +                */
2696 +               current->need_resched = 1;
2697 +#endif
2698
2699  fork_out:
2700         return retval;
2701 --- linux/kernel/softirq.c.orig Sun Jan  6 13:55:53 2002
2702 +++ linux/kernel/softirq.c      Mon Feb  4 04:09:19 2002
2703 @@ -259,10 +259,9 @@
2704
2705         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
2706                 current->state = TASK_RUNNING;
2707 -               do {
2708 -                       current->policy |= SCHED_YIELD;
2709 -                       schedule();
2710 -               } while (test_bit(TASKLET_STATE_SCHED, &t->state));
2711 +               do
2712 +                       sys_sched_yield();
2713 +               while (test_bit(TASKLET_STATE_SCHED, &t->state));
2714         }
2715         tasklet_unlock_wait(t);
2716         clear_bit(TASKLET_STATE_SCHED, &t->state);
2717 @@ -365,13 +364,13 @@
2718         int cpu = cpu_logical_map(bind_cpu);
2719
2720         daemonize();
2721 -       current->nice = 19;
2722 +       set_user_nice(current, 19);
2723         sigfillset(&current->blocked);
2724
2725         /* Migrate to the right CPU */
2726 -       current->cpus_allowed = 1UL << cpu;
2727 -       while (smp_processor_id() != cpu)
2728 -               schedule();
2729 +       set_cpus_allowed(current, 1UL << cpu);
2730 +       if (cpu() != cpu)
2731 +               BUG();
2732
2733         sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
2734
2735 @@ -396,7 +395,7 @@
2736         }
2737  }
2738
2739 -static __init int spawn_ksoftirqd(void)
2740 +__init int spawn_ksoftirqd(void)
2741  {
2742         int cpu;
2743
2744 @@ -405,14 +404,12 @@
2745                                   CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
2746                         printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
2747                 else {
2748 -                       while (!ksoftirqd_task(cpu_logical_map(cpu))) {
2749 -                               current->policy |= SCHED_YIELD;
2750 -                               schedule();
2751 -                       }
2752 +                       while (!ksoftirqd_task(cpu_logical_map(cpu)))
2753 +                               sys_sched_yield();
2754                 }
2755         }
2756
2757         return 0;
2758  }
2759
2760 -__initcall(spawn_ksoftirqd);
2761 +//__initcall(spawn_ksoftirqd);
2762 --- linux/kernel/ptrace.c.orig  Sun Jan  6 13:55:57 2002
2763 +++ linux/kernel/ptrace.c       Mon Feb  4 04:09:19 2002
2764 @@ -31,20 +31,7 @@
2765                 if (child->state != TASK_STOPPED)
2766                         return -ESRCH;
2767  #ifdef CONFIG_SMP
2768 -               /* Make sure the child gets off its CPU.. */
2769 -               for (;;) {
2770 -                       task_lock(child);
2771 -                       if (!task_has_cpu(child))
2772 -                               break;
2773 -                       task_unlock(child);
2774 -                       do {
2775 -                               if (child->state != TASK_STOPPED)
2776 -                                       return -ESRCH;
2777 -                               barrier();
2778 -                               cpu_relax();
2779 -                       } while (task_has_cpu(child));
2780 -               }
2781 -               task_unlock(child);
2782 +               wait_task_inactive(child);
2783  #endif
2784         }
2785
2786 --- linux/kernel/sys.c.orig     Sun Jan  6 13:55:47 2002
2787 +++ linux/kernel/sys.c  Mon Feb  4 04:09:19 2002
2788 @@ -220,10 +220,10 @@
2789                 }
2790                 if (error == -ESRCH)
2791                         error = 0;
2792 -               if (niceval < p->nice && !capable(CAP_SYS_NICE))
2793 +               if (niceval < task_nice(p) && !capable(CAP_SYS_NICE))
2794                         error = -EACCES;
2795                 else
2796 -                       p->nice = niceval;
2797 +                       set_user_nice(p, niceval);
2798         }
2799         read_unlock(&tasklist_lock);
2800
2801 @@ -249,7 +249,7 @@
2802                 long niceval;
2803                 if (!proc_sel(p, which, who))
2804                         continue;
2805 -               niceval = 20 - p->nice;
2806 +               niceval = 20 - task_nice(p);
2807                 if (niceval > retval)
2808                         retval = niceval;
2809         }
2810 --- linux/kernel/signal.c.orig  Sun Jan  6 13:55:56 2002
2811 +++ linux/kernel/signal.c       Mon Feb  4 04:09:19 2002
2812 @@ -478,12 +478,9 @@
2813          * process of changing - but no harm is done by that
2814          * other than doing an extra (lightweight) IPI interrupt.
2815          */
2816 -       spin_lock(&runqueue_lock);
2817 -       if (task_has_cpu(t) && t->processor != smp_processor_id())
2818 -               smp_send_reschedule(t->processor);
2819 -       spin_unlock(&runqueue_lock);
2820 -#endif /* CONFIG_SMP */
2821 -
2822 +       if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
2823 +               kick_if_running(t);
2824 +#endif
2825         if (t->state & TASK_INTERRUPTIBLE) {
2826                 wake_up_process(t);
2827                 return;
2828 --- linux/kernel/printk.c.orig  Sun Jan  6 13:55:57 2002
2829 +++ linux/kernel/printk.c       Mon Feb  4 04:09:19 2002
2830 @@ -25,6 +25,7 @@
2831  #include <linux/module.h>
2832  #include <linux/interrupt.h>                   /* For in_interrupt() */
2833  #include <linux/config.h>
2834 +#include <linux/delay.h>
2835
2836  #include <asm/uaccess.h>
2837
2838 --- linux/kernel/ksyms.c.orig   Sun Jan  6 13:55:57 2002
2839 +++ linux/kernel/ksyms.c        Mon Feb  4 04:09:19 2002
2840 @@ -437,6 +437,9 @@
2841  EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2842  EXPORT_SYMBOL(schedule);
2843  EXPORT_SYMBOL(schedule_timeout);
2844 +EXPORT_SYMBOL(sys_sched_yield);
2845 +EXPORT_SYMBOL(set_user_nice);
2846 +EXPORT_SYMBOL(set_cpus_allowed);
2847  EXPORT_SYMBOL(jiffies);
2848  EXPORT_SYMBOL(xtime);
2849  EXPORT_SYMBOL(do_gettimeofday);
2850 @@ -448,6 +451,7 @@
2851
2852  EXPORT_SYMBOL(kstat);
2853  EXPORT_SYMBOL(nr_running);
2854 +EXPORT_SYMBOL(nr_context_switches);
2855
2856  /* misc */
2857  EXPORT_SYMBOL(panic);
2858 --- linux/mm/oom_kill.c.orig    Sun Jan  6 13:55:53 2002
2859 +++ linux/mm/oom_kill.c Mon Feb  4 04:09:19 2002
2860 @@ -82,7 +82,7 @@
2861          * Niced processes are most likely less important, so double
2862          * their badness points.
2863          */
2864 -       if (p->nice > 0)
2865 +       if (task_nice(p) > 0)
2866                 points *= 2;
2867
2868         /*
2869 @@ -149,7 +149,7 @@
2870          * all the memory it needs. That way it should be able to
2871          * exit() and clear out its resources quickly...
2872          */
2873 -       p->counter = 5 * HZ;
2874 +       p->time_slice = HZ;
2875         p->flags |= PF_MEMALLOC | PF_MEMDIE;
2876
2877         /* This process has hardware access, be more careful. */
2878 @@ -188,8 +188,7 @@
2879          * killing itself before someone else gets the chance to ask
2880          * for more memory.
2881          */
2882 -       current->policy |= SCHED_YIELD;
2883 -       schedule();
2884 +       yield();
2885         return;
2886  }
2887
2888 --- linux/mm/page_alloc.c.orig  Sun Jan  6 13:55:56 2002
2889 +++ linux/mm/page_alloc.c       Mon Feb  4 04:09:19 2002
2890 @@ -394,9 +394,8 @@
2891                 return NULL;
2892
2893         /* Yield for kswapd, and try again */
2894 -       current->policy |= SCHED_YIELD;
2895         __set_current_state(TASK_RUNNING);
2896 -       schedule();
2897 +       yield();
2898         goto rebalance;
2899  }
2900
2901 --- linux/mm/highmem.c.orig     Sun Jan  6 13:55:57 2002
2902 +++ linux/mm/highmem.c  Mon Feb  4 04:09:19 2002
2903 @@ -354,9 +354,8 @@
2904         /* we need to wait I/O completion */
2905         run_task_queue(&tq_disk);
2906
2907 -       current->policy |= SCHED_YIELD;
2908         __set_current_state(TASK_RUNNING);
2909 -       schedule();
2910 +       yield();
2911         goto repeat_alloc;
2912  }
2913
2914 @@ -392,9 +391,8 @@
2915         /* we need to wait I/O completion */
2916         run_task_queue(&tq_disk);
2917
2918 -       current->policy |= SCHED_YIELD;
2919         __set_current_state(TASK_RUNNING);
2920 -       schedule();
2921 +       yield();
2922         goto repeat_alloc;
2923  }
2924
2925 --- linux/include/linux/sched.h.orig    Sun Jan  6 13:55:57 2002
2926 +++ linux/include/linux/sched.h Mon Feb  4 04:09:19 2002
2927 @@ -6,6 +6,7 @@
2928  extern unsigned long event;
2929
2930  #include <linux/config.h>
2931 +#include <linux/compiler.h>
2932  #include <linux/binfmts.h>
2933  #include <linux/threads.h>
2934  #include <linux/kernel.h>
2935 @@ -42,6 +43,7 @@
2936  #define CLONE_VFORK    0x00004000      /* set if the parent wants the child to wake it up on mm_release */
2937  #define CLONE_PARENT   0x00008000      /* set if we want to have the same parent as the cloner */
2938  #define CLONE_THREAD   0x00010000      /* Same thread group? */
2939 +#define CLONE_NEWNS    0x00020000      /* New namespace group? */
2940
2941  #define CLONE_SIGNAL   (CLONE_SIGHAND | CLONE_THREAD)
2942
2943 @@ -72,8 +74,9 @@
2944  #define CT_TO_SECS(x)  ((x) / HZ)
2945  #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
2946
2947 -extern int nr_running, nr_threads;
2948 +extern int nr_threads;
2949  extern int last_pid;
2950 +extern unsigned long nr_running(void);
2951
2952  #include <linux/fs.h>
2953  #include <linux/time.h>
2954 @@ -116,12 +119,6 @@
2955  #define SCHED_FIFO             1
2956  #define SCHED_RR               2
2957
2958 -/*
2959 - * This is an additional bit set when we want to
2960 - * yield the CPU for one re-schedule..
2961 - */
2962 -#define SCHED_YIELD            0x10
2963 -
2964  struct sched_param {
2965         int sched_priority;
2966  };
2967 @@ -139,17 +136,22 @@
2968   * a separate lock).
2969   */
2970  extern rwlock_t tasklist_lock;
2971 -extern spinlock_t runqueue_lock;
2972  extern spinlock_t mmlist_lock;
2973
2974 +typedef struct task_struct task_t;
2975 +
2976  extern void sched_init(void);
2977 -extern void init_idle(void);
2978 +extern void init_idle(task_t *idle, int cpu);
2979  extern void show_state(void);
2980  extern void cpu_init (void);
2981  extern void trap_init(void);
2982  extern void update_process_times(int user);
2983 -extern void update_one_process(struct task_struct *p, unsigned long user,
2984 +extern void update_one_process(task_t *p, unsigned long user,
2985                                unsigned long system, int cpu);
2986 +extern void scheduler_tick(int user_tick, int system);
2987 +extern void sched_task_migrated(task_t *p);
2988 +extern void smp_migrate_task(int cpu, task_t *task);
2989 +extern unsigned long cache_decay_ticks;
2990
2991  #define        MAX_SCHEDULE_TIMEOUT    LONG_MAX
2992  extern signed long FASTCALL(schedule_timeout(signed long timeout));
2993 @@ -166,6 +168,7 @@
2994   */
2995  #define NR_OPEN_DEFAULT BITS_PER_LONG
2996
2997 +struct namespace;
2998  /*
2999   * Open file table structure
3000   */
3001 @@ -278,6 +281,8 @@
3002  extern struct user_struct root_user;
3003  #define INIT_USER (&root_user)
3004
3005 +typedef struct prio_array prio_array_t;
3006 +
3007  struct task_struct {
3008         /*
3009          * offsets of these are hardcoded elsewhere - touch with care
3010 @@ -295,35 +300,26 @@
3011
3012         int lock_depth;         /* Lock depth */
3013
3014 -/*
3015 - * offset 32 begins here on 32-bit platforms. We keep
3016 - * all fields in a single cacheline that are needed for
3017 - * the goodness() loop in schedule().
3018 - */
3019 -       long counter;
3020 -       long nice;
3021 -       unsigned long policy;
3022 -       struct mm_struct *mm;
3023 -       int processor;
3024         /*
3025 -        * cpus_runnable is ~0 if the process is not running on any
3026 -        * CPU. It's (1 << cpu) if it's running on a CPU. This mask
3027 -        * is updated under the runqueue lock.
3028 -        *
3029 -        * To determine whether a process might run on a CPU, this
3030 -        * mask is AND-ed with cpus_allowed.
3031 +        * offset 32 begins here on 32-bit platforms.
3032          */
3033 -       unsigned long cpus_runnable, cpus_allowed;
3034 -       /*
3035 -        * (only the 'next' pointer fits into the cacheline, but
3036 -        * that's just fine.)
3037 -        */
3038 -       struct list_head run_list;
3039 -       unsigned long sleep_time;
3040 +       unsigned int cpu;
3041 +       int prio, static_prio;
3042 +       list_t run_list;
3043 +       prio_array_t *array;
3044 +
3045 +       unsigned long sleep_avg;
3046 +       unsigned long sleep_timestamp;
3047 +
3048 +       unsigned long policy;
3049 +       unsigned long cpus_allowed;
3050 +       unsigned int time_slice;
3051 +
3052 +       task_t *next_task, *prev_task;
3053
3054 -       struct task_struct *next_task, *prev_task;
3055 -       struct mm_struct *active_mm;
3056 +       struct mm_struct *mm, *active_mm;
3057         struct list_head local_pages;
3058 +
3059         unsigned int allocation_order, nr_local_pages;
3060
3061  /* task state */
3062 @@ -345,12 +341,12 @@
3063          * older sibling, respectively.  (p->father can be replaced with
3064          * p->p_pptr->pid)
3065          */
3066 -       struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
3067 +       task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
3068         struct list_head thread_group;
3069
3070         /* PID hash table linkage. */
3071 -       struct task_struct *pidhash_next;
3072 -       struct task_struct **pidhash_pprev;
3073 +       task_t *pidhash_next;
3074 +       task_t **pidhash_pprev;
3075
3076         wait_queue_head_t wait_chldexit;        /* for wait4() */
3077         struct completion *vfork_done;          /* for vfork() */
3078 @@ -389,6 +385,8 @@
3079         struct fs_struct *fs;
3080  /* open file information */
3081         struct files_struct *files;
3082 +/* namespace */
3083 +       struct namespace *namespace;
3084  /* signal handlers */
3085         spinlock_t sigmask_lock;        /* Protects signal and blocked */
3086         struct signal_struct *sig;
3087 @@ -446,10 +444,13 @@
3088   */
3089  #define _STK_LIM       (8*1024*1024)
3090
3091 -#define DEF_COUNTER    (10*HZ/100)     /* 100 ms time slice */
3092 -#define MAX_COUNTER    (20*HZ/100)
3093 -#define DEF_NICE       (0)
3094 +extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
3095 +extern void set_user_nice(task_t *p, long nice);
3096 +extern int task_prio(task_t *p);
3097 +extern int task_nice(task_t *p);
3098
3099 +asmlinkage long sys_sched_yield(void);
3100 +#define yield() sys_sched_yield()
3101
3102  /*
3103   * The default (Linux) execution domain.
3104 @@ -468,14 +469,14 @@
3105      addr_limit:                KERNEL_DS,                                      \
3106      exec_domain:       &default_exec_domain,                           \
3107      lock_depth:                -1,                                             \
3108 -    counter:           DEF_COUNTER,                                    \
3109 -    nice:              DEF_NICE,                                       \
3110 +    prio:              120,                                            \
3111 +    static_prio:       120,                                            \
3112      policy:            SCHED_OTHER,                                    \
3113 +    cpus_allowed:      -1,                                             \
3114      mm:                        NULL,                                           \
3115      active_mm:         &init_mm,                                       \
3116 -    cpus_runnable:     -1,                                             \
3117 -    cpus_allowed:      -1,                                             \
3118      run_list:          LIST_HEAD_INIT(tsk.run_list),                   \
3119 +    time_slice:                HZ,                                             \
3120      next_task:         &tsk,                                           \
3121      prev_task:         &tsk,                                           \
3122      p_opptr:           &tsk,                                           \
3123 @@ -509,24 +510,24 @@
3124  #endif
3125
3126  union task_union {
3127 -       struct task_struct task;
3128 +       task_t task;
3129         unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
3130  };
3131
3132  extern union task_union init_task_union;
3133
3134  extern struct   mm_struct init_mm;
3135 -extern struct task_struct *init_tasks[NR_CPUS];
3136 +extern task_t *init_tasks[NR_CPUS];
3137
3138  /* PID hashing. (shouldnt this be dynamic?) */
3139  #define PIDHASH_SZ (4096 >> 2)
3140 -extern struct task_struct *pidhash[PIDHASH_SZ];
3141 +extern task_t *pidhash[PIDHASH_SZ];
3142
3143  #define pid_hashfn(x)  ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
3144
3145 -static inline void hash_pid(struct task_struct *p)
3146 +static inline void hash_pid(task_t *p)
3147  {
3148 -       struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
3149 +       task_t **htable = &pidhash[pid_hashfn(p->pid)];
3150
3151         if((p->pidhash_next = *htable) != NULL)
3152                 (*htable)->pidhash_pprev = &p->pidhash_next;
3153 @@ -534,16 +535,16 @@
3154         p->pidhash_pprev = htable;
3155  }
3156
3157 -static inline void unhash_pid(struct task_struct *p)
3158 +static inline void unhash_pid(task_t *p)
3159  {
3160         if(p->pidhash_next)
3161                 p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
3162         *p->pidhash_pprev = p->pidhash_next;
3163  }
3164
3165 -static inline struct task_struct *find_task_by_pid(int pid)
3166 +static inline task_t *find_task_by_pid(int pid)
3167  {
3168 -       struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
3169 +       task_t *p, **htable = &pidhash[pid_hashfn(pid)];
3170
3171         for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
3172                 ;
3173 @@ -551,19 +552,6 @@
3174         return p;
3175  }
3176
3177 -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
3178 -
3179 -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
3180 -{
3181 -       tsk->processor = cpu;
3182 -       tsk->cpus_runnable = 1UL << cpu;
3183 -}
3184 -
3185 -static inline void task_release_cpu(struct task_struct *tsk)
3186 -{
3187 -       tsk->cpus_runnable = ~0UL;
3188 -}
3189 -
3190  /* per-UID process charging. */
3191  extern struct user_struct * alloc_uid(uid_t);
3192  extern void free_uid(struct user_struct *);
3193 @@ -590,7 +578,9 @@
3194  extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
3195  extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
3196                                                     signed long timeout));
3197 -extern int FASTCALL(wake_up_process(struct task_struct * tsk));
3198 +extern int FASTCALL(wake_up_process(task_t * tsk));
3199 +extern void FASTCALL(wake_up_forked_process(task_t * tsk));
3200 +extern void FASTCALL(sched_exit(task_t * p));
3201
3202  #define wake_up(x)                     __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
3203  #define wake_up_nr(x, nr)              __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
3204 @@ -608,28 +598,28 @@
3205  extern int in_egroup_p(gid_t);
3206
3207  extern void proc_caches_init(void);
3208 -extern void flush_signals(struct task_struct *);
3209 -extern void flush_signal_handlers(struct task_struct *);
3210 +extern void flush_signals(task_t *);
3211 +extern void flush_signal_handlers(task_t *);
3212  extern int dequeue_signal(sigset_t *, siginfo_t *);
3213  extern void block_all_signals(int (*notifier)(void *priv), void *priv,
3214                               sigset_t *mask);
3215  extern void unblock_all_signals(void);
3216 -extern int send_sig_info(int, struct siginfo *, struct task_struct *);
3217 -extern int force_sig_info(int, struct siginfo *, struct task_struct *);
3218 +extern int send_sig_info(int, struct siginfo *, task_t *);
3219 +extern int force_sig_info(int, struct siginfo *, task_t *);
3220  extern int kill_pg_info(int, struct siginfo *, pid_t);
3221  extern int kill_sl_info(int, struct siginfo *, pid_t);
3222  extern int kill_proc_info(int, struct siginfo *, pid_t);
3223 -extern void notify_parent(struct task_struct *, int);
3224 -extern void do_notify_parent(struct task_struct *, int);
3225 -extern void force_sig(int, struct task_struct *);
3226 -extern int send_sig(int, struct task_struct *, int);
3227 +extern void notify_parent(task_t *, int);
3228 +extern void do_notify_parent(task_t *, int);
3229 +extern void force_sig(int, task_t *);
3230 +extern int send_sig(int, task_t *, int);
3231  extern int kill_pg(pid_t, int, int);
3232  extern int kill_sl(pid_t, int, int);
3233  extern int kill_proc(pid_t, int, int);
3234  extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
3235  extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
3236
3237 -static inline int signal_pending(struct task_struct *p)
3238 +static inline int signal_pending(task_t *p)
3239  {
3240         return (p->sigpending != 0);
3241  }
3242 @@ -668,7 +658,7 @@
3243     This is required every time the blocked sigset_t changes.
3244     All callers should have t->sigmask_lock.  */
3245
3246 -static inline void recalc_sigpending(struct task_struct *t)
3247 +static inline void recalc_sigpending(task_t *t)
3248  {
3249         t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
3250  }
3251 @@ -775,16 +765,17 @@
3252  extern int expand_fdset(struct files_struct *, int nr);
3253  extern void free_fdset(fd_set *, int);
3254
3255 -extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
3256 +extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *);
3257  extern void flush_thread(void);
3258  extern void exit_thread(void);
3259
3260 -extern void exit_mm(struct task_struct *);
3261 -extern void exit_files(struct task_struct *);
3262 -extern void exit_sighand(struct task_struct *);
3263 +extern void exit_mm(task_t *);
3264 +extern void exit_files(task_t *);
3265 +extern void exit_sighand(task_t *);
3266
3267  extern void reparent_to_init(void);
3268  extern void daemonize(void);
3269 +extern task_t *child_reaper;
3270
3271  extern int do_execve(char *, char **, char **, struct pt_regs *);
3272  extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
3273 @@ -793,6 +784,9 @@
3274  extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
3275  extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
3276
3277 +extern void wait_task_inactive(task_t * p);
3278 +extern void kick_if_running(task_t * p);
3279 +
3280  #define __wait_event(wq, condition)                                    \
3281  do {                                                                   \
3282         wait_queue_t __wait;                                            \
3283 @@ -871,24 +865,10 @@
3284         for (p = &init_task ; (p = p->next_task) != &init_task ; )
3285
3286  #define next_thread(p) \
3287 -       list_entry((p)->thread_group.next, struct task_struct, thread_group)
3288 -
3289 -static inline void del_from_runqueue(struct task_struct * p)
3290 -{
3291 -       nr_running--;
3292 -       p->sleep_time = jiffies;
3293 -       list_del(&p->run_list);
3294 -       p->run_list.next = NULL;
3295 -}
3296 -
3297 -static inline int task_on_runqueue(struct task_struct *p)
3298 -{
3299 -       return (p->run_list.next != NULL);
3300 -}
3301 +       list_entry((p)->thread_group.next, task_t, thread_group)
3302
3303 -static inline void unhash_process(struct task_struct *p)
3304 +static inline void unhash_process(task_t *p)
3305  {
3306 -       if (task_on_runqueue(p)) BUG();
3307         write_lock_irq(&tasklist_lock);
3308         nr_threads--;
3309         unhash_pid(p);
3310 @@ -898,12 +878,12 @@
3311  }
3312
3313  /* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
3314 -static inline void task_lock(struct task_struct *p)
3315 +static inline void task_lock(task_t *p)
3316  {
3317         spin_lock(&p->alloc_lock);
3318  }
3319
3320 -static inline void task_unlock(struct task_struct *p)
3321 +static inline void task_unlock(task_t *p)
3322  {
3323         spin_unlock(&p->alloc_lock);
3324  }
3325 --- linux/include/linux/list.h.orig     Sun Jan  6 13:55:57 2002
3326 +++ linux/include/linux/list.h  Mon Feb  4 04:09:19 2002
3327 @@ -19,6 +19,8 @@
3328         struct list_head *next, *prev;
3329  };
3330
3331 +typedef struct list_head list_t;
3332 +
3333  #define LIST_HEAD_INIT(name) { &(name), &(name) }
3334
3335  #define LIST_HEAD(name) \
3336 --- linux/include/linux/kernel_stat.h.orig      Tue Aug 21 14:26:23 2001
3337 +++ linux/include/linux/kernel_stat.h   Mon Feb  4 04:09:19 2002
3338 @@ -32,10 +32,11 @@
3339         unsigned int ipackets, opackets;
3340         unsigned int ierrors, oerrors;
3341         unsigned int collisions;
3342 -       unsigned int context_swtch;
3343  };
3344
3345  extern struct kernel_stat kstat;
3346 +
3347 +extern unsigned long nr_context_switches(void);
3348
3349  #if !defined(CONFIG_ARCH_S390)
3350  /*
3351 --- linux/include/linux/smp.h.orig      Sun Dec 31 20:10:17 2000
3352 +++ linux/include/linux/smp.h   Mon Feb  4 04:09:19 2002
3353 @@ -86,6 +86,14 @@
3354  #define cpu_number_map(cpu)                    0
3355  #define smp_call_function(func,info,retry,wait)        ({ 0; })
3356  #define cpu_online_map                         1
3357 +static inline void smp_send_reschedule(int cpu) { }
3358 +static inline void smp_send_reschedule_all(void) { }
3359
3360  #endif
3361 +
3362 +/*
3363 + * Common definitions:
3364 + */
3365 +#define cpu()                                  smp_processor_id()
3366 +
3367  #endif
3368 --- linux/include/asm-i386/smp.h.orig   Sun Jan  6 13:55:57 2002
3369 +++ linux/include/asm-i386/smp.h        Mon Feb  4 04:09:19 2002
3370 @@ -63,6 +63,7 @@
3371  extern void smp_flush_tlb(void);
3372  extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
3373  extern void smp_send_reschedule(int cpu);
3374 +extern void smp_send_reschedule_all(void);
3375  extern void smp_invalidate_rcv(void);          /* Process an NMI */
3376  extern void (*mtrr_hook) (void);
3377  extern void zap_low_mappings (void);
3378 @@ -104,7 +105,7 @@
3379   * so this is correct in the x86 case.
3380   */
3381
3382 -#define smp_processor_id() (current->processor)
3383 +#define smp_processor_id() (current->cpu)
3384
3385  static __inline int hard_smp_processor_id(void)
3386  {
3387 @@ -121,18 +122,6 @@
3388  #endif /* !__ASSEMBLY__ */
3389
3390  #define NO_PROC_ID             0xFF            /* No processor magic marker */
3391 -
3392 -/*
3393 - *     This magic constant controls our willingness to transfer
3394 - *     a process across CPUs. Such a transfer incurs misses on the L1
3395 - *     cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
3396 - *     gut feeling is this will vary by board in value. For a board
3397 - *     with separate L2 cache it probably depends also on the RSS, and
3398 - *     for a board with shared L2 cache it ought to decay fast as other
3399 - *     processes are run.
3400 - */
3401 -
3402 -#define PROC_CHANGE_PENALTY    15              /* Schedule penalty */
3403
3404  #endif
3405  #endif
3406 --- linux/include/asm-i386/bitops.h.orig        Tue Aug 21 14:26:16 2001
3407 +++ linux/include/asm-i386/bitops.h     Mon Feb  4 04:09:19 2002
3408 @@ -75,6 +75,14 @@
3409                 :"=m" (ADDR)
3410                 :"Ir" (nr));
3411  }
3412 +
3413 +static __inline__ void __clear_bit(int nr, volatile void * addr)
3414 +{
3415 +       __asm__ __volatile__(
3416 +               "btrl %1,%0"
3417 +               :"=m" (ADDR)
3418 +               :"Ir" (nr));
3419 +}
3420  #define smp_mb__before_clear_bit()     barrier()
3421  #define smp_mb__after_clear_bit()      barrier()
3422
3423 @@ -284,6 +292,34 @@
3424  }
3425
3426  /**
3427 + * find_first_bit - find the first set bit in a memory region
3428 + * @addr: The address to start the search at
3429 + * @size: The maximum size to search
3430 + *
3431 + * Returns the bit-number of the first set bit, not the number of the byte
3432 + * containing a bit.
3433 + */
3434 +static __inline__ int find_first_bit(void * addr, unsigned size)
3435 +{
3436 +       int d0, d1;
3437 +       int res;
3438 +
3439 +       /* This looks at memory. Mark it volatile to tell gcc not to move it around */
3440 +       __asm__ __volatile__(
3441 +               "xorl %%eax,%%eax\n\t"
3442 +               "repe; scasl\n\t"
3443 +               "jz 1f\n\t"
3444 +               "leal -4(%%edi),%%edi\n\t"
3445 +               "bsfl (%%edi),%%eax\n"
3446 +               "1:\tsubl %%ebx,%%edi\n\t"
3447 +               "shll $3,%%edi\n\t"
3448 +               "addl %%edi,%%eax"
3449 +               :"=a" (res), "=&c" (d0), "=&D" (d1)
3450 +               :"1" ((size + 31) >> 5), "2" (addr), "b" (addr));
3451 +       return res;
3452 +}
3453 +
3454 +/**
3455   * find_next_zero_bit - find the first zero bit in a memory region
3456   * @addr: The address to base the search on
3457   * @offset: The bitnumber to start searching at
3458 @@ -296,7 +332,7 @@
3459
3460         if (bit) {
3461                 /*
3462 -                * Look for zero in first byte
3463 +                * Look for zero in the first 32 bits.
3464                  */
3465                 __asm__("bsfl %1,%0\n\t"
3466                         "jne 1f\n\t"
3467 @@ -317,6 +353,39 @@
3468  }
3469
3470  /**
3471 + * find_next_bit - find the next set bit in a memory region
3472 + * @addr: The address to base the search on
3473 + * @offset: The bitnumber to start searching at
3474 + * @size: The maximum size to search
3475 + */
3476 +static __inline__ int find_next_bit (void * addr, int size, int offset)
3477 +{
3478 +       unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
3479 +       int set = 0, bit = offset & 31, res;
3480 +
3481 +       if (bit) {
3482 +               /*
3483 +                * Look for nonzero in the first 32 bits:
3484 +                */
3485 +               __asm__("bsfl %1,%0\n\t"
3486 +                       "jne 1f\n\t"
3487 +                       "movl $32, %0\n"
3488 +                       "1:"
3489 +                       : "=r" (set)
3490 +                       : "r" (*p >> bit));
3491 +               if (set < (32 - bit))
3492 +                       return set + offset;
3493 +               set = 32 - bit;
3494 +               p++;
3495 +       }
3496 +       /*
3497 +        * No set bit yet, search remaining full words for a bit
3498 +        */
3499 +       res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
3500 +       return (offset + set + res);
3501 +}
3502 +
3503 +/**
3504   * ffz - find first zero in word.
3505   * @word: The word to search
3506   *
3507 @@ -327,6 +396,20 @@
3508         __asm__("bsfl %1,%0"
3509                 :"=r" (word)
3510                 :"r" (~word));
3511 +       return word;
3512 +}
3513 +
3514 +/**
3515 + * __ffs - find first set bit in word.
3516 + * @word: The word to search
3517 + *
3518 + * Undefined if no bit is set, so code should check against 0 first.
3519 + */
3520 +static __inline__ unsigned long __ffs(unsigned long word)
3521 +{
3522 +       __asm__("bsfl %1,%0"
3523 +               :"=r" (word)
3524 +               :"rm" (word));
3525         return word;
3526  }
3527
3528 --- linux/include/asm-i386/pgalloc.h.orig       Sun Jan  6 13:55:57 2002
3529 +++ linux/include/asm-i386/pgalloc.h    Mon Feb  4 04:09:19 2002
3530 @@ -224,6 +224,7 @@
3531  {
3532         struct mm_struct *active_mm;
3533         int state;
3534 +       char __cacheline_padding[24];
3535  };
3536  extern struct tlb_state cpu_tlbstate[NR_CPUS];
3537
3538 --- linux/include/asm-i386/mmu_context.h.orig   Tue Aug 21 14:26:23 2001
3539 +++ linux/include/asm-i386/mmu_context.h        Mon Feb  4 04:09:19 2002
3540 @@ -7,6 +7,25 @@
3541  #include <asm/pgalloc.h>
3542
3543  /*
3544 + * Every architecture must define this function. It's the fastest
3545 + * way of searching a 140-bit bitmap where the first 100 bits are
3546 + * unlikely to be set. It's guaranteed that at least one of the 140
3547 + * bits is set.
3548 + */
3549 +static inline int sched_find_first_bit(unsigned long *b)
3550 +{
3551 +       if (unlikely(b[0]))
3552 +               return __ffs(b[0]);
3553 +       if (unlikely(b[1]))
3554 +               return __ffs(b[1]) + 32;
3555 +       if (unlikely(b[2]))
3556 +               return __ffs(b[2]) + 64;
3557 +       if (b[3])
3558 +               return __ffs(b[3]) + 96;
3559 +       return __ffs(b[4]) + 128;
3560 +}
3561 +
3562 +/*
3563   * possibly do the LDT unload here?
3564   */
3565  #define destroy_context(mm)            do { } while(0)
3566 @@ -27,13 +46,13 @@
3567
3568  static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
3569  {
3570 -       if (prev != next) {
3571 +       if (likely(prev != next)) {
3572                 /* stop flush ipis for the previous mm */
3573                 clear_bit(cpu, &prev->cpu_vm_mask);
3574                 /*
3575                  * Re-load LDT if necessary
3576                  */
3577 -               if (prev->context.segments != next->context.segments)
3578 +               if (unlikely(prev->context.segments != next->context.segments))
3579                         load_LDT(next);
3580  #ifdef CONFIG_SMP
3581                 cpu_tlbstate[cpu].state = TLBSTATE_OK;
3582 --- linux/include/asm-i386/hw_irq.h.orig        Wed Jan 16 21:44:01 2002
3583 +++ linux/include/asm-i386/hw_irq.h     Mon Feb  4 04:09:19 2002
3584 @@ -41,8 +41,9 @@
3585  #define ERROR_APIC_VECTOR      0xfe
3586  #define INVALIDATE_TLB_VECTOR  0xfd
3587  #define RESCHEDULE_VECTOR      0xfc
3588 -#define CALL_FUNCTION_VECTOR   0xfb
3589 -#define KDB_VECTOR             0xfa
3590 +#define TASK_MIGRATION_VECTOR  0xfb
3591 +#define CALL_FUNCTION_VECTOR   0xfa
3592 +#define KDB_VECTOR             0xf9
3593
3594  /*
3595   * Local APIC timer IRQ vector is on a different priority level,
3596 --- linux/include/asm-i386/apic.h.orig  Mon Jan 28 18:05:10 2002
3597 +++ linux/include/asm-i386/apic.h       Mon Feb  4 04:09:19 2002
3598 @@ -79,6 +79,8 @@
3599  extern void setup_apic_nmi_watchdog (void);
3600  extern inline void nmi_watchdog_tick (struct pt_regs * regs);
3601  extern int APIC_init_uniprocessor (void);
3602 +extern void disable_APIC_timer(void);
3603 +extern void enable_APIC_timer(void);
3604
3605  extern struct pm_dev *apic_pm_register(pm_dev_t, unsigned long, pm_callback);
3606  extern void apic_pm_unregister(struct pm_dev*);
3607 --- linux/net/unix/af_unix.c.orig       Sun Jan  6 13:55:58 2002
3608 +++ linux/net/unix/af_unix.c    Mon Feb  4 04:09:19 2002
3609 @@ -564,10 +564,8 @@
3610                                       addr->hash)) {
3611                 write_unlock(&unix_table_lock);
3612                 /* Sanity yield. It is unusual case, but yet... */
3613 -               if (!(ordernum&0xFF)) {
3614 -                       current->policy |= SCHED_YIELD;
3615 -                       schedule();
3616 -               }
3617 +               if (!(ordernum&0xFF))
3618 +                       yield();
3619                 goto retry;
3620         }
3621         addr->hash ^= sk->type;
3622 --- linux/net/ipv4/tcp_output.c.orig    Sun Jan  6 13:55:57 2002
3623 +++ linux/net/ipv4/tcp_output.c Mon Feb  4 04:09:19 2002
3624 @@ -1009,8 +1009,7 @@
3625                         skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
3626                         if (skb)
3627                                 break;
3628 -                       current->policy |= SCHED_YIELD;
3629 -                       schedule();
3630 +                       yield();
3631                 }
3632
3633                 /* Reserve space for headers and prepare control bits. */
3634 --- linux/net/sunrpc/sched.c.orig       Sun Jan  6 13:55:52 2002
3635 +++ linux/net/sunrpc/sched.c    Mon Feb  4 04:09:19 2002
3636 @@ -772,8 +772,7 @@
3637                 }
3638                 if (flags & RPC_TASK_ASYNC)
3639                         return NULL;
3640 -               current->policy |= SCHED_YIELD;
3641 -               schedule();
3642 +               yield();
3643         } while (!signalled());
3644
3645         return NULL;
3646 @@ -1114,8 +1113,7 @@
3647                 __rpc_schedule();
3648                 if (all_tasks) {
3649                         dprintk("rpciod_killall: waiting for tasks to exit\n");
3650 -                       current->policy |= SCHED_YIELD;
3651 -                       schedule();
3652 +                       yield();
3653                 }
3654         }
3655
3656 @@ -1185,8 +1183,7 @@
3657          * wait briefly before checking the process id.
3658          */
3659         current->sigpending = 0;
3660 -       current->policy |= SCHED_YIELD;
3661 -       schedule();
3662 +       yield();
3663         /*
3664          * Display a message if we're going to wait longer.
3665          */
3666 --- linux/net/sched/sch_generic.c.orig  Fri Aug 18 19:26:25 2000
3667 +++ linux/net/sched/sch_generic.c       Mon Feb  4 04:09:19 2002
3668 @@ -475,10 +475,8 @@
3669
3670         dev_watchdog_down(dev);
3671
3672 -       while (test_bit(__LINK_STATE_SCHED, &dev->state)) {
3673 -               current->policy |= SCHED_YIELD;
3674 -               schedule();
3675 -       }
3676 +       while (test_bit(__LINK_STATE_SCHED, &dev->state))
3677 +               yield();
3678
3679         spin_unlock_wait(&dev->xmit_lock);
3680  }
3681 --- linux/net/socket.c.orig     Sun Jan  6 13:55:58 2002
3682 +++ linux/net/socket.c  Mon Feb  4 04:09:19 2002
3683 @@ -148,8 +148,7 @@
3684         while (atomic_read(&net_family_lockct) != 0) {
3685                 spin_unlock(&net_family_lock);
3686
3687 -               current->policy |= SCHED_YIELD;
3688 -               schedule();
3689 +               yield();
3690
3691                 spin_lock(&net_family_lock);
3692         }
3693 --- linux/drivers/net/slip.c.orig       Sun Jan  6 13:55:48 2002
3694 +++ linux/drivers/net/slip.c    Mon Feb  4 04:09:19 2002
3695 @@ -1393,10 +1393,8 @@
3696                 /* First of all: check for active disciplines and hangup them.
3697                  */
3698                 do {
3699 -                       if (busy) {
3700 -                               current->counter = 0;
3701 -                               schedule();
3702 -                       }
3703 +                       if (busy)
3704 +                               yield();
3705
3706                         busy = 0;
3707                         local_bh_disable();
3708 --- linux/drivers/block/loop.c.orig     Sun Jan  6 13:55:56 2002
3709 +++ linux/drivers/block/loop.c  Mon Feb  4 04:09:19 2002
3710 @@ -570,9 +570,6 @@
3711         flush_signals(current);
3712         spin_unlock_irq(&current->sigmask_lock);
3713
3714 -       current->policy = SCHED_OTHER;
3715 -       current->nice = -20;
3716 -
3717         spin_lock_irq(&lo->lo_lock);
3718         lo->lo_state = Lo_bound;
3719         atomic_inc(&lo->lo_pending);
3720 --- linux/drivers/char/mwave/mwavedd.c.orig     Sun Jan 13 16:27:41 2002
3721 +++ linux/drivers/char/mwave/mwavedd.c  Mon Feb  4 04:09:19 2002
3722 @@ -279,7 +279,6 @@
3723                         pDrvData->IPCs[ipcnum].bIsHere = FALSE;
3724                         pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
3725         #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
3726 -                       current->nice = -20;    /* boost to provide priority timing */
3727         #else
3728                         current->priority = 0x28;       /* boost to provide priority timing */
3729         #endif
3730 --- linux/drivers/ide/ataraid.c.orig    Sun Jan  6 13:55:52 2002
3731 +++ linux/drivers/ide/ataraid.c Mon Feb  4 04:09:19 2002
3732 @@ -123,8 +123,7 @@
3733                 ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO);
3734                 if (!ptr) {
3735                         __set_current_state(TASK_RUNNING);
3736 -                       current->policy |= SCHED_YIELD;
3737 -                       schedule();
3738 +                       yield();
3739                 }
3740         }
3741         return ptr;
3742 @@ -139,8 +138,7 @@
3743                 ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO);
3744                 if (!ptr) {
3745                         __set_current_state(TASK_RUNNING);
3746 -                       current->policy |= SCHED_YIELD;
3747 -                       schedule();
3748 +                       yield();
3749                 }
3750         }
3751         return ptr;
3752 --- linux/drivers/md/md.c.orig  Sun Jan  6 13:55:56 2002
3753 +++ linux/drivers/md/md.c       Mon Feb  4 04:09:19 2002
3754 @@ -2930,8 +2930,6 @@
3755          * bdflush, otherwise bdflush will deadlock if there are too
3756          * many dirty RAID5 blocks.
3757          */
3758 -       current->policy = SCHED_OTHER;
3759 -       current->nice = -20;
3760         md_unlock_kernel();
3761
3762         complete(thread->event);
3763 @@ -3381,11 +3379,6 @@
3764                "(but not more than %d KB/sec) for reconstruction.\n",
3765                sysctl_speed_limit_max);
3766
3767 -       /*
3768 -        * Resync has low priority.
3769 -        */
3770 -       current->nice = 19;
3771 -
3772         is_mddev_idle(mddev); /* this also initializes IO event counters */
3773         for (m = 0; m < SYNC_MARKS; m++) {
3774                 mark[m] = jiffies;
3775 @@ -3463,16 +3456,13 @@
3776                 currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
3777
3778                 if (currspeed > sysctl_speed_limit_min) {
3779 -                       current->nice = 19;
3780 -
3781                         if ((currspeed > sysctl_speed_limit_max) ||
3782                                         !is_mddev_idle(mddev)) {
3783                                 current->state = TASK_INTERRUPTIBLE;
3784                                 md_schedule_timeout(HZ/4);
3785                                 goto repeat;
3786                         }
3787 -               } else
3788 -                       current->nice = -20;
3789 +               }
3790         }
3791         printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3792         err = 0;
3793 --- linux/arch/i386/mm/fault.c.orig     Sun Jan  6 13:55:47 2002
3794 +++ linux/arch/i386/mm/fault.c  Mon Feb  4 04:09:19 2002
3795 @@ -88,8 +88,7 @@
3796
3797  out_of_memory:
3798         if (current->pid == 1) {
3799 -               current->policy |= SCHED_YIELD;
3800 -               schedule();
3801 +               yield();
3802                 goto survive;
3803         }
3804         goto bad_area;
3805 @@ -344,8 +343,7 @@
3806  out_of_memory:
3807         up_read(&mm->mmap_sem);
3808         if (tsk->pid == 1) {
3809 -               tsk->policy |= SCHED_YIELD;
3810 -               schedule();
3811 +               yield();
3812                 down_read(&mm->mmap_sem);
3813                 goto survive;
3814         }
3815 --- linux/arch/i386/kernel/smpboot.c.orig       Sun Jan  6 13:55:56 2002
3816 +++ linux/arch/i386/kernel/smpboot.c    Mon Feb  4 04:09:19 2002
3817 @@ -308,14 +308,14 @@
3818                         if (tsc_values[i] < avg)
3819                                 realdelta = -realdelta;
3820
3821 -                       printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
3822 -                               i, realdelta);
3823 +                       printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
3824                 }
3825
3826                 sum += delta;
3827         }
3828         if (!buggy)
3829                 printk("passed.\n");
3830 +               ;
3831  }
3832
3833  static void __init synchronize_tsc_ap (void)
3834 @@ -365,7 +365,7 @@
3835          * (This works even if the APIC is not enabled.)
3836          */
3837         phys_id = GET_APIC_ID(apic_read(APIC_ID));
3838 -       cpuid = current->processor;
3839 +       cpuid = cpu();
3840         if (test_and_set_bit(cpuid, &cpu_online_map)) {
3841                 printk("huh, phys CPU#%d, CPU#%d already present??\n",
3842                                         phys_id, cpuid);
3843 @@ -435,6 +435,7 @@
3844          */
3845         smp_store_cpu_info(cpuid);
3846
3847 +       disable_APIC_timer();
3848         /*
3849          * Allow the master to continue.
3850          */
3851 @@ -465,6 +466,7 @@
3852         smp_callin();
3853         while (!atomic_read(&smp_commenced))
3854                 rep_nop();
3855 +       enable_APIC_timer();
3856         /*
3857          * low-memory mappings have been cleared, flush them from
3858          * the local TLBs too.
3859 @@ -803,16 +805,13 @@
3860         if (!idle)
3861                 panic("No idle process for CPU %d", cpu);
3862
3863 -       idle->processor = cpu;
3864 -       idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
3865 +       init_idle(idle, cpu);
3866
3867         map_cpu_to_boot_apicid(cpu, apicid);
3868
3869         idle->thread.eip = (unsigned long) start_secondary;
3870
3871 -       del_from_runqueue(idle);
3872         unhash_process(idle);
3873 -       init_tasks[cpu] = idle;
3874
3875         /* start_eip had better be page-aligned! */
3876         start_eip = setup_trampoline();
3877 @@ -925,6 +924,7 @@
3878  }
3879
3880  cycles_t cacheflush_time;
3881 +unsigned long cache_decay_ticks;
3882
3883  static void smp_tune_scheduling (void)
3884  {
3885 @@ -958,9 +958,13 @@
3886                 cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
3887         }
3888
3889 +       cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
3890 +
3891         printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
3892                 (long)cacheflush_time/(cpu_khz/1000),
3893                 ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
3894 +       printk("task migration cache decay timeout: %lu msecs.\n",
3895 +               (cache_decay_ticks + 1) * 1000 / HZ);
3896  }
3897
3898  /*
3899 @@ -1020,8 +1024,7 @@
3900         map_cpu_to_boot_apicid(0, boot_cpu_apicid);
3901
3902         global_irq_holder = 0;
3903 -       current->processor = 0;
3904 -       init_idle();
3905 +       current->cpu = 0;
3906         smp_tune_scheduling();
3907
3908         /*
3909 --- linux/arch/i386/kernel/process.c.orig       Mon Jan 28 18:09:58 2002
3910 +++ linux/arch/i386/kernel/process.c    Mon Feb  4 04:09:19 2002
3911 @@ -123,15 +123,12 @@
3912  void cpu_idle (void)
3913  {
3914         /* endless idle loop with no priority at all */
3915 -       init_idle();
3916 -       current->nice = 20;
3917 -       current->counter = -100;
3918
3919         while (1) {
3920                 void (*idle)(void) = pm_idle;
3921                 if (!idle)
3922                         idle = default_idle;
3923 -               while (!current->need_resched)
3924 +               if (!current->need_resched)
3925                         idle();
3926                 schedule();
3927                 check_pgt_cache();
3928 @@ -694,15 +691,17 @@
3929         asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
3930
3931         /*
3932 -        * Restore %fs and %gs.
3933 +        * Restore %fs and %gs if needed.
3934          */
3935 -       loadsegment(fs, next->fs);
3936 -       loadsegment(gs, next->gs);
3937 +       if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
3938 +               loadsegment(fs, next->fs);
3939 +               loadsegment(gs, next->gs);
3940 +       }
3941
3942         /*
3943          * Now maybe reload the debug registers
3944          */
3945 -       if (next->debugreg[7]){
3946 +       if (unlikely(next->debugreg[7])) {
3947                 loaddebug(next, 0);
3948                 loaddebug(next, 1);
3949                 loaddebug(next, 2);
3950 @@ -712,7 +711,7 @@
3951                 loaddebug(next, 7);
3952         }
3953
3954 -       if (prev->ioperm || next->ioperm) {
3955 +       if (unlikely(prev->ioperm || next->ioperm)) {
3956                 if (next->ioperm) {
3957                         /*
3958                          * 4 cachelines copy ... not good, but not that
3959 --- linux/arch/i386/kernel/apic.c.orig  Sun Jan  6 13:55:54 2002
3960 +++ linux/arch/i386/kernel/apic.c       Mon Feb  4 04:09:19 2002
3961 @@ -785,8 +785,7 @@
3962          */
3963
3964         slice = clocks / (smp_num_cpus+1);
3965 -       printk("cpu: %d, clocks: %d, slice: %d\n",
3966 -               smp_processor_id(), clocks, slice);
3967 +       printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice);
3968
3969         /*
3970          * Wait for IRQ0's slice:
3971 @@ -809,8 +808,7 @@
3972
3973         __setup_APIC_LVTT(clocks);
3974
3975 -       printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n",
3976 -                       smp_processor_id(), t0, t1, delta, slice, clocks);
3977 +       printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", smp_processor_id(), t0, t1, delta, slice, clocks);
3978
3979         __restore_flags(flags);
3980  }
3981 @@ -911,6 +909,26 @@
3982
3983         /* and update all other cpus */
3984         smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1);
3985 +}
3986 +
3987 +void __init disable_APIC_timer(void)
3988 +{
3989 +       if (using_apic_timer) {
3990 +               unsigned long v;
3991 +
3992 +               v = apic_read(APIC_LVTT);
3993 +               apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
3994 +       }
3995 +}
3996 +
3997 +void enable_APIC_timer(void)
3998 +{
3999 +       if (using_apic_timer) {
4000 +               unsigned long v;
4001 +
4002 +               v = apic_read(APIC_LVTT);
4003 +               apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
4004 +       }
4005  }
4006
4007  /*
4008 --- linux/arch/i386/kernel/nmi.c.orig   Sun Jan  6 13:55:43 2002
4009 +++ linux/arch/i386/kernel/nmi.c        Mon Feb  4 04:09:19 2002
4010 @@ -283,7 +283,7 @@
4011                          * to get a message out.
4012                          */
4013                         bust_spinlocks(1);
4014 -                       printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
4015 +                       printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
4016                         show_registers(regs);
4017                         printk("console shuts up ...\n");
4018                         console_silent();
4019 --- linux/arch/i386/kernel/smp.c.orig   Sun Jan  6 13:55:56 2002
4020 +++ linux/arch/i386/kernel/smp.c        Mon Feb  4 04:09:19 2002
4021 @@ -105,7 +105,7 @@
4022  /* The 'big kernel lock' */
4023  spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
4024
4025 -struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }};
4026 +struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }};
4027
4028  /*
4029   * the following functions deal with sending IPIs between CPUs.
4030 @@ -485,15 +485,54 @@
4031         do_flush_tlb_all_local();
4032  }
4033
4034 +static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED;
4035 +static task_t *new_task;
4036 +
4037 +/*
4038 + * This function sends a 'task migration' IPI to another CPU.
4039 + * Must be called from syscall contexts, with interrupts *enabled*.
4040 + */
4041 +void smp_migrate_task(int cpu, task_t *p)
4042 +{
4043 +       /*
4044 +        * The target CPU will unlock the migration spinlock:
4045 +        */
4046 +       spin_lock(&migration_lock);
4047 +       new_task = p;
4048 +       send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR);
4049 +}
4050 +
4051 +/*
4052 + * Task migration callback.
4053 + */
4054 +asmlinkage void smp_task_migration_interrupt(void)
4055 +{
4056 +       task_t *p;
4057 +
4058 +       ack_APIC_irq();
4059 +       p = new_task;
4060 +       spin_unlock(&migration_lock);
4061 +       sched_task_migrated(p);
4062 +}
4063  /*
4064   * this function sends a 'reschedule' IPI to another CPU.
4065   * it goes straight through and wastes no time serializing
4066   * anything. Worst case is that we lose a reschedule ...
4067   */
4068 -
4069  void smp_send_reschedule(int cpu)
4070  {
4071         send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
4072 +}
4073 +
4074 +/*
4075 + * this function sends a reschedule IPI to all (other) CPUs.
4076 + * This should only be used if some 'global' task became runnable,
4077 + * such as a RT task, that must be handled now. The first CPU
4078 + * that manages to grab the task will run it.
4079 + */
4080 +void smp_send_reschedule_all(void)
4081 +{
4082 +       send_IPI_allbutself(RESCHEDULE_VECTOR);
4083  }
4084
4085  /*
4086 --- linux/arch/i386/kernel/i8259.c.orig Wed Jan 16 21:43:09 2002
4087 +++ linux/arch/i386/kernel/i8259.c      Mon Feb  4 04:09:19 2002
4088 @@ -79,6 +79,7 @@
4089   * through the ICC by us (IPIs)
4090   */
4091  #ifdef CONFIG_SMP
4092 +BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR)
4093  BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
4094  BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
4095  BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
4096 @@ -472,6 +473,9 @@
4097          * IPI, driven by wakeup.
4098          */
4099         set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
4100 +
4101 +       /* IPI for task migration */
4102 +       set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt);
4103
4104         /* IPI for invalidation */
4105         set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
4106 --- linux/arch/i386/kernel/entry.S.orig Fri Jan 25 14:30:36 2002
4107 +++ linux/arch/i386/kernel/entry.S      Mon Feb  4 04:09:19 2002
4108 @@ -77,7 +77,7 @@
4109  exec_domain    = 16
4110  need_resched   = 20
4111  tsk_ptrace     = 24
4112 -processor      = 52
4113 +cpu            = 32
4114
4115  ENOSYS = 38
4116
4117 @@ -176,9 +176,11 @@
4118
4119
4120  ENTRY(ret_from_fork)
4121 +#ifdef CONFIG_SMP
4122         pushl %ebx
4123         call SYMBOL_NAME(schedule_tail)
4124         addl $4, %esp
4125 +#endif
4126         GET_CURRENT(%ebx)
4127         testb $0x02,tsk_ptrace(%ebx)    # PT_TRACESYS
4128         jne tracesys_exit
4129 --- linux/arch/i386/kernel/setup.c.orig Mon Jan 28 18:10:23 2002
4130 +++ linux/arch/i386/kernel/setup.c      Mon Feb  4 04:09:19 2002
4131 @@ -2922,9 +2922,10 @@
4132         load_TR(nr);
4133         load_LDT(&init_mm);
4134
4135 -       /*
4136 -        * Clear all 6 debug registers:
4137 -        */
4138 +       /* Clear %fs and %gs. */
4139 +       asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
4140 +
4141 +       /* Clear all 6 debug registers: */
4142
4143  #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
4144