1--- linux/fs/proc/proc_misc.c.orig Sun Jan 6 13:55:55 2002
2+++ linux/fs/proc/proc_misc.c Sun Jan 6 13:56:25 2002
3@@ -85,11 +85,11 @@
4 a = avenrun[0] + (FIXED_1/200);
5 b = avenrun[1] + (FIXED_1/200);
6 c = avenrun[2] + (FIXED_1/200);
7- len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
8+ len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
9 LOAD_INT(a), LOAD_FRAC(a),
10 LOAD_INT(b), LOAD_FRAC(b),
11 LOAD_INT(c), LOAD_FRAC(c),
12- nr_running, nr_threads, last_pid);
13+ nr_running(), nr_threads, last_pid);
14 return proc_calc_metrics(page, start, off, count, eof, len);
15 }
16
17@@ -101,7 +101,7 @@
18 int len;
19
20 uptime = jiffies;
21- idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
22+ idle = init_task.times.tms_utime + init_task.times.tms_stime;
23
24 /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
25 that would overflow about every five days at HZ == 100.
26@@ -303,10 +303,10 @@
27 }
28
29 len += sprintf(page + len,
30- "\nctxt %u\n"
31+ "\nctxt %lu\n"
32 "btime %lu\n"
33 "processes %lu\n",
34- kstat.context_swtch,
35+ nr_context_switches(),
36 xtime.tv_sec - jif / HZ,
37 total_forks);
38
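The two hunks above follow the scheduler rewrite further down: the global nr_running counter and the kstat.context_swtch field are gone, replaced by the nr_running() and nr_context_switches() helpers that sum per-CPU runqueue counters and return unsigned long - hence the %d -> %ld and %u -> %lu format changes. A minimal userspace sketch of the aggregation idea (the real helpers appear in the kernel/sched.c part of this patch):

    /* Illustration only: sum per-CPU counters the way nr_running() does. */
    #include <stdio.h>

    #define NR_CPUS 4

    struct fake_rq { unsigned long nr_running, nr_switches; };
    static struct fake_rq rqs[NR_CPUS] = { {2, 10}, {0, 3}, {1, 7}, {5, 42} };

    static unsigned long fake_nr_running(void)
    {
            unsigned long i, sum = 0;

            for (i = 0; i < NR_CPUS; i++)
                    sum += rqs[i].nr_running;
            return sum;
    }

    int main(void)
    {
            printf("%lu\n", fake_nr_running());     /* prints 8 */
            return 0;
    }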
39--- linux/fs/proc/array.c.orig Sun Jan 6 13:55:51 2002
40+++ linux/fs/proc/array.c Mon Jan 7 20:01:05 2002
41@@ -335,9 +335,12 @@
42
43 /* scale priority and nice values from timeslices to -20..20 */
44 /* to make it look like a "normal" Unix priority/nice value */
45- priority = task->counter;
46- priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
47- nice = task->nice;
48+ priority = task->prio;
49+ if (priority >= MAX_RT_PRIO)
50+ priority -= MAX_RT_PRIO;
51+ else
52+ priority = priority-100;
53+ nice = task->__nice;
54
55 read_lock(&tasklist_lock);
56 ppid = task->pid ? task->p_opptr->pid : 0;
57@@ -387,7 +390,7 @@
58 task->nswap,
59 task->cnswap,
60 task->exit_signal,
61- task->processor);
62+ task->cpu);
63 if(mm)
64 mmput(mm);
65 return res;
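A worked example of the new /proc/<pid>/stat mapping above, assuming MAX_RT_PRIO is 100 (both the hard-coded "priority-100" in the else branch and the "99-p->rt_priority" assignment in setscheduler() later in this patch suggest that value): a SCHED_OTHER task with a dynamic prio of 120 is reported as 20, while a SCHED_FIFO task with rt_priority 1 (prio 98) is reported as -2, so realtime tasks keep showing up with negative priorities. A minimal sketch:

    /* Illustration only: the /proc priority mapping, MAX_RT_PRIO assumed 100. */
    #include <stdio.h>

    #define MAX_RT_PRIO 100

    static int proc_priority(int prio)
    {
            if (prio >= MAX_RT_PRIO)
                    return prio - MAX_RT_PRIO;      /* SCHED_OTHER: 0..39 */
            return prio - 100;                      /* realtime: negative */
    }

    int main(void)
    {
            printf("%d %d\n", proc_priority(120), proc_priority(98));   /* 20 -2 */
            return 0;
    }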
66--- linux/fs/nfs/pagelist.c.orig Sun Jan 6 13:55:57 2002
67+++ linux/fs/nfs/pagelist.c Sun Jan 6 13:56:25 2002
68@@ -96,8 +96,7 @@
69 continue;
70 if (signalled() && (server->flags & NFS_MOUNT_INTR))
71 return ERR_PTR(-ERESTARTSYS);
72- current->policy = SCHED_YIELD;
73- schedule();
74+ yield();
75 }
76
77 /* Initialize the request struct. Initially, we assume a
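This is the first of many call sites in the patch where the old idiom "current->policy |= SCHED_YIELD; schedule();" is collapsed into a single yield() call; the SCHED_YIELD policy bit no longer exists in the O(1) scheduler. yield() itself is defined in the include/linux/sched.h half of the patch, which is not shown here; a hedged sketch of what such a helper presumably boils down to:

    /* Assumed helper, for illustration only - the real definition lives in
     * the sched.h part of this patch, not in any hunk shown here. */
    static inline void yield(void)
    {
            set_current_state(TASK_RUNNING);
            sys_sched_yield();
    }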
78--- linux/fs/ufs/truncate.c.orig Sun Jan 6 13:55:55 2002
79+++ linux/fs/ufs/truncate.c Sun Jan 6 13:56:25 2002
80@@ -448,10 +448,7 @@
81 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
82 ufs_sync_inode (inode);
83 run_task_queue(&tq_disk);
84- current->policy |= SCHED_YIELD;
85- schedule ();
86-
87-
88+ yield();
89 }
90 offset = inode->i_size & uspi->s_fshift;
91 if (offset) {
92--- linux/fs/reiserfs/buffer2.c.orig Sun Jan 6 13:55:57 2002
93+++ linux/fs/reiserfs/buffer2.c Sun Jan 6 13:56:25 2002
94@@ -33,8 +33,7 @@
95 buffer_journal_dirty(bh) ? ' ' : '!');
96 }
97 run_task_queue(&tq_disk);
98- current->policy |= SCHED_YIELD;
99- schedule();
100+ yield();
101 }
102 if (repeat_counter > 30000000) {
103 reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ;
104@@ -52,11 +51,11 @@
105 struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size)
106 {
107 struct buffer_head *result;
108- PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
109+ PROC_EXP( unsigned int ctx_switches = nr_context_switches(); );
110
111 result = bread (super -> s_dev, n_block, n_size);
112 PROC_INFO_INC( super, breads );
113- PROC_EXP( if( kstat.context_swtch != ctx_switches )
114+ PROC_EXP( if( nr_context_switches() != ctx_switches )
115 PROC_INFO_INC( super, bread_miss ) );
116 return result;
117 }
118--- linux/fs/reiserfs/journal.c.orig Sun Jan 6 13:55:57 2002
119+++ linux/fs/reiserfs/journal.c Sun Jan 6 13:56:25 2002
120@@ -149,8 +149,7 @@
121 }
122 bn = allocate_bitmap_node(p_s_sb) ;
123 if (!bn) {
124- current->policy |= SCHED_YIELD ;
125- schedule() ;
126+ yield();
127 goto repeat ;
128 }
129 return bn ;
130--- linux/fs/jffs2/background.c.orig Sun Jan 6 13:55:53 2002
131+++ linux/fs/jffs2/background.c Sun Jan 6 13:56:25 2002
132@@ -106,9 +106,6 @@
133
134 sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
135
136- /* FIXME in the 2.2 backport */
137- current->nice = 10;
138-
139 for (;;) {
140 spin_lock_irq(&current->sigmask_lock);
141 siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
142--- linux/fs/jbd/journal.c.orig Sun Jan 6 13:55:57 2002
143+++ linux/fs/jbd/journal.c Sun Jan 6 13:56:25 2002
144@@ -460,8 +460,7 @@
145 printk (KERN_NOTICE __FUNCTION__
146 ": ENOMEM at get_unused_buffer_head, "
147 "trying again.\n");
148- current->policy |= SCHED_YIELD;
149- schedule();
150+ yield();
151 }
152 } while (!new_bh);
153 /* keep subsequent assertions sane */
154@@ -1539,8 +1538,7 @@
155 last_warning = jiffies;
156 }
157
158- current->policy |= SCHED_YIELD;
159- schedule();
160+ yield();
161 }
162 }
163
164@@ -1598,8 +1596,7 @@
165 last_warning = jiffies;
166 }
167 while (ret == 0) {
168- current->policy |= SCHED_YIELD;
169- schedule();
170+ yield();
171 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
172 }
173 }
174--- linux/fs/jbd/revoke.c.orig Sun Jan 6 13:55:57 2002
175+++ linux/fs/jbd/revoke.c Sun Jan 6 13:56:25 2002
176@@ -137,8 +137,7 @@
177 if (!journal_oom_retry)
178 return -ENOMEM;
179 jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
180- current->policy |= SCHED_YIELD;
181- schedule();
182+ yield();
183 goto repeat;
184 }
185
186--- linux/fs/jbd/transaction.c.orig Sun Jan 6 13:55:57 2002
187+++ linux/fs/jbd/transaction.c Sun Jan 6 13:56:25 2002
188@@ -1377,8 +1377,7 @@
189 do {
190 old_handle_count = transaction->t_handle_count;
191 set_current_state(TASK_RUNNING);
192- current->policy |= SCHED_YIELD;
193- schedule();
194+ yield();
195 } while (old_handle_count != transaction->t_handle_count);
196 }
197
198--- linux/fs/binfmt_elf.c.orig Sun Jan 6 13:55:57 2002
199+++ linux/fs/binfmt_elf.c Sun Jan 6 13:56:25 2002
200@@ -1143,7 +1143,7 @@
201 psinfo.pr_state = i;
202 psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
203 psinfo.pr_zomb = psinfo.pr_sname == 'Z';
204- psinfo.pr_nice = current->nice;
205+ psinfo.pr_nice = current->__nice;
206 psinfo.pr_flag = current->flags;
207 psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
208 psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
209--- linux/fs/buffer.c.orig Sun Jan 6 13:55:57 2002
210+++ linux/fs/buffer.c Fri Jan 25 14:25:56 2002
211@@ -725,9 +725,8 @@
212 wakeup_bdflush();
213 try_to_free_pages(zone, GFP_NOFS, 0);
214 run_task_queue(&tq_disk);
215- current->policy |= SCHED_YIELD;
216 __set_current_state(TASK_RUNNING);
217- schedule();
218+ sys_sched_yield();
219 }
220
221 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
222--- linux/fs/locks.c.orig Sun Jan 6 13:55:51 2002
223+++ linux/fs/locks.c Sun Jan 6 13:56:25 2002
224@@ -445,8 +445,7 @@
225 /* Let the blocked process remove waiter from the
226 * block list when it gets scheduled.
227 */
228- current->policy |= SCHED_YIELD;
229- schedule();
230+ yield();
231 } else {
232 /* Remove waiter from the block list, because by the
233 * time it wakes up blocker won't exist any more.
234--- linux/init/main.c.orig Sun Jan 6 13:55:57 2002
235+++ linux/init/main.c Mon Jan 28 18:12:51 2002
236@@ -482,8 +482,6 @@
237 extern void setup_arch(char **);
238 extern void cpu_idle(void);
239
240-unsigned long wait_init_idle;
241-
242 #ifndef CONFIG_SMP
243
244 #ifdef CONFIG_X86_LOCAL_APIC
245@@ -492,34 +490,24 @@
246 APIC_init_uniprocessor();
247 }
248 #else
249-#define smp_init() do { } while (0)
250+#define smp_init() do { } while (0)
251 #endif
252
253 #else
254
255-
256 /* Called by boot processor to activate the rest. */
257 static void __init smp_init(void)
258 {
259 /* Get other processors into their bootup holding patterns. */
260 smp_boot_cpus();
261- wait_init_idle = cpu_online_map;
262- clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
263
264 smp_threads_ready=1;
265 smp_commence();
266-
267- /* Wait for the other cpus to set up their idle processes */
268- printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
269- while (wait_init_idle) {
270- cpu_relax();
271- barrier();
272- }
273- printk("All processors have done init_idle\n");
274 }
275
276 #endif
277
278+
279 /*
280 * We need to finalize in a non-__init function or else race conditions
281 * between the root thread and the init thread may cause start_kernel to
282@@ -531,9 +519,8 @@
283 {
284 kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
285 unlock_kernel();
286- current->need_resched = 1;
287- cpu_idle();
288-}
289+ cpu_idle();
290+}
291
292 /*
293 * Activate the first processor.
294@@ -611,14 +598,18 @@
295 ipc_init();
296 #endif
297 check_bugs();
298+
299 printk("POSIX conformance testing by UNIFIX\n");
300
301- /*
302- * We count on the initial thread going ok
303- * Like idlers init is an unlocked kernel thread, which will
304- * make syscalls (and thus be locked).
305+ init_idle(current, smp_processor_id());
306+ /*
307+ * We count on the initial thread going ok
308+ * Like idlers init is an unlocked kernel thread, which will
309+ * make syscalls (and thus be locked).
310 */
311 smp_init();
312+
313+ /* Do the rest non-__init'ed, we're now alive */
314 rest_init();
315 }
316
317@@ -779,12 +770,9 @@
318 int i, pid;
319
320 pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
321- if (pid > 0) {
322- while (pid != wait(&i)) {
323- current->policy |= SCHED_YIELD;
324- schedule();
325- }
326- }
327+ if (pid > 0)
328+ while (pid != wait(&i))
329+ yield();
330 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
331 || MINOR(real_root_dev) != 0) {
332 error = change_root(real_root_dev,"/initrd");
333--- linux/kernel/sched.c.orig Sun Jan 6 13:55:57 2002
334+++ linux/kernel/sched.c Mon Jan 28 18:41:54 2002
335@@ -12,333 +12,249 @@
336 * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
337 */
338
339-/*
340- * 'sched.c' is the main kernel file. It contains scheduling primitives
341- * (sleep_on, wakeup, schedule etc) as well as a number of simple system
342- * call functions (type getpid()), which just extract a field from
343- * current-task
344- */
345-
346-#include <linux/config.h>
347 #include <linux/mm.h>
348+#include <linux/nmi.h>
349 #include <linux/init.h>
350+#include <asm/uaccess.h>
351 #include <linux/smp_lock.h>
352-#include <linux/nmi.h>
353 #include <linux/interrupt.h>
354-#include <linux/kernel_stat.h>
355-#include <linux/completion.h>
356-#include <linux/prefetch.h>
357-#include <linux/compiler.h>
358-
359-#include <asm/uaccess.h>
360 #include <asm/mmu_context.h>
361
362-extern void timer_bh(void);
363-extern void tqueue_bh(void);
364-extern void immediate_bh(void);
365-
366-/*
367- * scheduler variables
368- */
369+#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))
370
371-unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
372+typedef struct runqueue runqueue_t;
373
374-extern void mem_use(void);
375+struct prio_array {
376+ int nr_active;
377+ spinlock_t *lock;
378+ runqueue_t *rq;
379+ unsigned long bitmap[BITMAP_SIZE];
380+ list_t queue[MAX_PRIO];
381+};
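The prio_array above is the core O(1) data structure: one list per priority level plus a bitmap of non-empty levels, so picking the best runnable task is a find-first-bit plus one list-head lookup, independent of how many tasks are queued. A minimal userspace sketch of that lookup (GCC's __builtin_ctzl stands in for the kernel's sched_find_first_bit(); MAX_PRIO of 140 is assumed here, the real constant lives in the sched.h half of the patch):

    /* Illustration only: O(1) "pick next" via a priority bitmap. */
    #include <stdio.h>

    #define MAX_PRIO   140
    #define WORD_BITS  (8 * (int)sizeof(unsigned long))
    #define BITMAP_LEN ((MAX_PRIO + WORD_BITS - 1) / WORD_BITS)

    static unsigned long bitmap[BITMAP_LEN];
    static int nr_tasks[MAX_PRIO];          /* stands in for the per-prio lists */

    static void enqueue(int prio)
    {
            nr_tasks[prio]++;
            bitmap[prio / WORD_BITS] |= 1UL << (prio % WORD_BITS);
    }

    static int pick_next(void)              /* first set bit = best priority */
    {
            int i;

            for (i = 0; i < BITMAP_LEN; i++)
                    if (bitmap[i])
                            return i * WORD_BITS + __builtin_ctzl(bitmap[i]);
            return -1;                      /* runqueue is empty */
    }

    int main(void)
    {
            enqueue(120);
            enqueue(115);
            printf("next prio: %d\n", pick_next());     /* prints 115 */
            return 0;
    }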
382
383 /*
384- * Scheduling quanta.
385+ * This is the main, per-CPU runqueue data structure.
386 *
387- * NOTE! The unix "nice" value influences how long a process
388- * gets. The nice value ranges from -20 to +19, where a -20
389- * is a "high-priority" task, and a "+10" is a low-priority
390- * task.
391- *
392- * We want the time-slice to be around 50ms or so, so this
393- * calculation depends on the value of HZ.
394+ * Locking rule: code that needs to lock multiple runqueues
395+ * (such as the load balancing or the process migration code) must
396+ * acquire the locks in ascending &runqueue order.
397 */
398-#if HZ < 200
399-#define TICK_SCALE(x) ((x) >> 2)
400-#elif HZ < 400
401-#define TICK_SCALE(x) ((x) >> 1)
402-#elif HZ < 800
403-#define TICK_SCALE(x) (x)
404-#elif HZ < 1600
405-#define TICK_SCALE(x) ((x) << 1)
406-#else
407-#define TICK_SCALE(x) ((x) << 2)
408-#endif
409+struct runqueue {
410+ spinlock_t lock;
411+ unsigned long nr_running, nr_switches, expired_timestamp;
412+ task_t *curr, *idle;
413+ prio_array_t *active, *expired, arrays[2];
414+ int prev_nr_running[NR_CPUS];
415+} ____cacheline_aligned;
416
417-#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
418+static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
419
420+#define cpu_rq(cpu) (runqueues + (cpu))
421+#define this_rq() cpu_rq(smp_processor_id())
422+#define task_rq(p) cpu_rq((p)->cpu)
423+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
424+#define rt_task(p) ((p)->policy != SCHED_OTHER)
425+
426+
427+static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags)
428+{
429+ struct runqueue *__rq;
430+
431+repeat_lock_task:
432+ __rq = task_rq(p);
433+ spin_lock_irqsave(&__rq->lock, *flags);
434+ if (unlikely(__rq != task_rq(p))) {
435+ spin_unlock_irqrestore(&__rq->lock, *flags);
436+ goto repeat_lock_task;
437+ }
438+ return __rq;
439+}
440
441+static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags)
442+{
443+ spin_unlock_irqrestore(&rq->lock, *flags);
444+}
445 /*
446- * Init task must be ok at boot for the ix86 as we will check its signals
447- * via the SMP irq return path.
448+ * Adding/removing a task to/from a priority array:
449 */
450-
451-struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
452+static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
453+{
454+ array->nr_active--;
455+ list_del_init(&p->run_list);
456+ if (list_empty(array->queue + p->prio))
457+ __clear_bit(p->prio, array->bitmap);
458+}
459+
460+static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
461+{
462+ list_add_tail(&p->run_list, array->queue + p->prio);
463+ __set_bit(p->prio, array->bitmap);
464+ array->nr_active++;
465+ p->array = array;
466+}
467
468 /*
469- * The tasklist_lock protects the linked list of processes.
470+ * A task is 'heavily interactive' if it either has reached the
471+ * bottom 25% of the SCHED_OTHER priority range, or if it is below
472+ * its default priority by at least 3 priority levels. In this
473+ * case we favor it by reinserting it on the active array,
474+ * even after it expired its current timeslice.
475 *
476- * The runqueue_lock locks the parts that actually access
477- * and change the run-queues, and have to be interrupt-safe.
478+ * A task is a 'CPU hog' if it's either in the upper 25% of the
479+ * SCHED_OTHER priority range, or if it's not an interactive task.
480 *
481- * If both locks are to be concurrently held, the runqueue_lock
482- * nests inside the tasklist_lock.
483+ * A task can get a priority bonus by being 'somewhat
484+ * interactive' - and it will get a priority penalty for
485+ * being a CPU hog.
486 *
487- * task->alloc_lock nests inside tasklist_lock.
488- */
489-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
490-rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
491-
492-static LIST_HEAD(runqueue_head);
493-
494-/*
495- * We align per-CPU scheduling data on cacheline boundaries,
496- * to prevent cacheline ping-pong.
497 */
498-static union {
499- struct schedule_data {
500- struct task_struct * curr;
501- cycles_t last_schedule;
502- } schedule_data;
503- char __pad [SMP_CACHE_BYTES];
504-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
505-
506-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
507-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
508-
509-struct kernel_stat kstat;
510-extern struct task_struct *child_reaper;
511-
512-#ifdef CONFIG_SMP
513-
514-#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
515-#define can_schedule(p,cpu) \
516- ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
517-
518-#else
519-
520-#define idle_task(cpu) (&init_task)
521-#define can_schedule(p,cpu) (1)
522-
523-#endif
524
525-void scheduling_functions_start_here(void) { }
526+#define PRIO_INTERACTIVE \
527+ (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_INTERACTIVE_RATIO/100)
528+#define PRIO_CPU_HOG \
529+ (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_CPU_HOG_RATIO/100)
530+
531+#define TASK_INTERACTIVE(p) \
532+ (((p)->prio <= PRIO_INTERACTIVE) || \
533+ (((p)->prio < PRIO_CPU_HOG) && \
534+ ((p)->prio <= NICE_TO_PRIO((p)->__nice) - INTERACTIVE_DELTA)))
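Plugging the percentages from the comment above (bottom 25%, upper 25%, a delta of 3) into these macros, with an assumed MAX_RT_PRIO of 100 and MAX_USER_PRIO of 40, gives PRIO_INTERACTIVE = 100 + 40*25/100 = 110 and PRIO_CPU_HOG = 100 + 40*75/100 = 130: a task is then treated as interactive if its dynamic priority is at most 110, or if it is below 130 and at least 3 levels better (numerically lower) than its nice-derived static priority. The actual *_RATIO and INTERACTIVE_DELTA constants are defined in the sched.h half of the patch, so these numbers are illustrative only.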
535
536 /*
537- * This is the function that decides how desirable a process is..
538- * You can weigh different processes against each other depending
539- * on what CPU they've run on lately etc to try to handle cache
540- * and TLB miss penalties.
541+ * We place interactive tasks back into the active array, if possible.
542 *
543- * Return values:
544- * -1000: never select this
545- * 0: out of time, recalculate counters (but it might still be
546- * selected)
547- * +ve: "goodness" value (the larger, the better)
548- * +1000: realtime process, select this.
549+ * To guarantee that this does not starve expired tasks we ignore the
550+ * interactivity of a task if the first expired task had to wait more
551+ * than a 'reasonable' amount of time. This deadline timeout is
552+ * load-dependent, as the frequency of array switches decreases with
553+ * increasing number of running tasks:
554 */
555+#define EXPIRED_STARVING(rq) \
556+ ((rq)->expired_timestamp && \
557+ (jiffies - (rq)->expired_timestamp >= \
558+ STARVATION_LIMIT * ((rq)->nr_running) + 1))
559
560-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
561+static inline int effective_prio(task_t *p)
562 {
563- int weight;
564+ int bonus, prio;
565
566 /*
567- * select the current process after every other
568- * runnable process, but before the idle thread.
569- * Also, dont trigger a counter recalculation.
570+ * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
571+ * into the -14 ... +14 bonus/penalty range.
572+ *
573+ * We use 70% of the full 0...39 priority range so that:
574+ *
575+ * 1) nice +19 CPU hogs do not preempt nice 0 CPU hogs.
576+ * 2) nice -20 interactive tasks do not get preempted by
577+ * nice 0 interactive tasks.
578+ *
579+ * Both properties are important to certain workloads.
580 */
581- weight = -1;
582- if (p->policy & SCHED_YIELD)
583- goto out;
584+ bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
585+ MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
586
587- /*
588- * Non-RT process - normal case first.
589- */
590- if (p->policy == SCHED_OTHER) {
591+ prio = NICE_TO_PRIO(p->__nice) - bonus;
592+ if (prio < MAX_RT_PRIO)
593+ prio = MAX_RT_PRIO;
594+ if (prio > MAX_PRIO-1)
595+ prio = MAX_PRIO-1;
596+ return prio;
597+}
598+
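With the figures from the comment above (a -14 ... +14 bonus window, i.e. 70% of the 40-entry user range), the bonus expression reduces to 28*sleep_avg/MAX_SLEEP_AVG - 14; the result is subtracted from the nice-derived priority and clamped into the SCHED_OTHER range. A small standalone check of the extremes, with an arbitrary MAX_SLEEP_AVG (the real value comes from the sched.h half of the patch):

    /* Illustration only: the effective_prio() bonus arithmetic, assuming
     * MAX_USER_PRIO == 40 and PRIO_BONUS_RATIO == 70 (the "70%" and
     * "-14 ... +14" figures in the comment above). */
    #include <stdio.h>

    #define MAX_USER_PRIO    40
    #define PRIO_BONUS_RATIO 70
    #define MAX_SLEEP_AVG    200            /* arbitrary stand-in value */

    static int bonus(long sleep_avg)
    {
            return MAX_USER_PRIO * PRIO_BONUS_RATIO * sleep_avg / MAX_SLEEP_AVG / 100
                    - MAX_USER_PRIO * PRIO_BONUS_RATIO / 100 / 2;
    }

    int main(void)
    {
            /* prints "-14 0 14": CPU hogs sink, frequent sleepers rise */
            printf("%d %d %d\n", bonus(0), bonus(MAX_SLEEP_AVG / 2), bonus(MAX_SLEEP_AVG));
            return 0;
    }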
599+static inline void activate_task(task_t *p, runqueue_t *rq)
600+{
601+ unsigned long sleep_time = jiffies - p->sleep_timestamp;
602+ prio_array_t *array = rq->active;
603+
604+ if (!rt_task(p) && sleep_time) {
605 /*
606- * Give the process a first-approximation goodness value
607- * according to the number of clock-ticks it has left.
608- *
609- * Don't do any other calculations if the time slice is
610- * over..
611+ * This code gives a bonus to interactive tasks. We update
612+ * an 'average sleep time' value here, based on
613+ * sleep_timestamp. The more time a task spends sleeping,
614+ * the higher the average gets - and the higher the priority
615+ * boost gets as well.
616 */
617- weight = p->counter;
618- if (!weight)
619- goto out;
620-
621-#ifdef CONFIG_SMP
622- /* Give a largish advantage to the same processor... */
623- /* (this is equivalent to penalizing other processors) */
624- if (p->processor == this_cpu)
625- weight += PROC_CHANGE_PENALTY;
626-#endif
627-
628- /* .. and a slight advantage to the current MM */
629- if (p->mm == this_mm || !p->mm)
630- weight += 1;
631- weight += 20 - p->nice;
632- goto out;
633+ p->sleep_avg += sleep_time;
634+ if (p->sleep_avg > MAX_SLEEP_AVG)
635+ p->sleep_avg = MAX_SLEEP_AVG;
636+ p->prio = effective_prio(p);
637 }
638+ enqueue_task(p, array);
639+ rq->nr_running++;
640+}
641
642- /*
643- * Realtime process, select the first one on the
644- * runqueue (taking priorities within processes
645- * into account).
646- */
647- weight = 1000 + p->rt_priority;
648-out:
649- return weight;
650+static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
651+{
652+ rq->nr_running--;
653+ dequeue_task(p, p->array);
654+ p->array = NULL;
655+ p->sleep_timestamp = jiffies;
656 }
657
658-/*
659- * the 'goodness value' of replacing a process on a given CPU.
660- * positive value means 'replace', zero or negative means 'dont'.
661- */
662-static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
663+static inline void resched_task(task_t *p)
664 {
665- return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
666+ int need_resched;
667+
668+ need_resched = p->need_resched;
669+ wmb();
670+ p->need_resched = 1;
671+ if (!need_resched && (p->cpu != smp_processor_id()))
672+ smp_send_reschedule(p->cpu);
673 }
674
675+#ifdef CONFIG_SMP
676+
677 /*
678- * This is ugly, but reschedule_idle() is very timing-critical.
679- * We are called with the runqueue spinlock held and we must
680- * not claim the tasklist_lock.
681+ * Wait for a process to unschedule. This is used by the exit() and
682+ * ptrace() code.
683 */
684-static FASTCALL(void reschedule_idle(struct task_struct * p));
685-
686-static void reschedule_idle(struct task_struct * p)
687+void wait_task_inactive(task_t * p)
688 {
689-#ifdef CONFIG_SMP
690- int this_cpu = smp_processor_id();
691- struct task_struct *tsk, *target_tsk;
692- int cpu, best_cpu, i, max_prio;
693- cycles_t oldest_idle;
694+ unsigned long flags;
695+ runqueue_t *rq;
696
697- /*
698- * shortcut if the woken up task's last CPU is
699- * idle now.
700- */
701- best_cpu = p->processor;
702- if (can_schedule(p, best_cpu)) {
703- tsk = idle_task(best_cpu);
704- if (cpu_curr(best_cpu) == tsk) {
705- int need_resched;
706-send_now_idle:
707- /*
708- * If need_resched == -1 then we can skip sending
709- * the IPI altogether, tsk->need_resched is
710- * actively watched by the idle thread.
711- */
712- need_resched = tsk->need_resched;
713- tsk->need_resched = 1;
714- if ((best_cpu != this_cpu) && !need_resched)
715- smp_send_reschedule(best_cpu);
716- return;
717- }
718+repeat:
719+ rq = task_rq(p);
720+ while (unlikely(rq->curr == p)) {
721+ cpu_relax();
722+ barrier();
723+ }
724+ rq = lock_task_rq(p, &flags);
725+ if (unlikely(rq->curr == p)) {
726+ unlock_task_rq(rq, &flags);
727+ goto repeat;
728 }
729-
730- /*
731- * We know that the preferred CPU has a cache-affine current
732- * process, lets try to find a new idle CPU for the woken-up
733- * process. Select the least recently active idle CPU. (that
734- * one will have the least active cache context.) Also find
735- * the executing process which has the least priority.
736- */
737- oldest_idle = (cycles_t) -1;
738- target_tsk = NULL;
739- max_prio = 0;
740-
741- for (i = 0; i < smp_num_cpus; i++) {
742- cpu = cpu_logical_map(i);
743- if (!can_schedule(p, cpu))
744- continue;
745- tsk = cpu_curr(cpu);
746- /*
747- * We use the first available idle CPU. This creates
748- * a priority list between idle CPUs, but this is not
749- * a problem.
750- */
751- if (tsk == idle_task(cpu)) {
752-#if defined(__i386__) && defined(CONFIG_SMP)
753- /*
754- * Check if two siblings are idle in the same
755- * physical package. Use them if found.
756- */
757- if (smp_num_siblings == 2) {
758- if (cpu_curr(cpu_sibling_map[cpu]) ==
759- idle_task(cpu_sibling_map[cpu])) {
760- oldest_idle = last_schedule(cpu);
761- target_tsk = tsk;
762- break;
763- }
764-
765- }
766-#endif
767- if (last_schedule(cpu) < oldest_idle) {
768- oldest_idle = last_schedule(cpu);
769- target_tsk = tsk;
770- }
771- } else {
772- if (oldest_idle == -1ULL) {
773- int prio = preemption_goodness(tsk, p, cpu);
774-
775- if (prio > max_prio) {
776- max_prio = prio;
777- target_tsk = tsk;
778- }
779- }
780- }
781- }
782- tsk = target_tsk;
783- if (tsk) {
784- if (oldest_idle != -1ULL) {
785- best_cpu = tsk->processor;
786- goto send_now_idle;
787- }
788- tsk->need_resched = 1;
789- if (tsk->processor != this_cpu)
790- smp_send_reschedule(tsk->processor);
791- }
792- return;
793-
794-
795-#else /* UP */
796- int this_cpu = smp_processor_id();
797- struct task_struct *tsk;
798-
799- tsk = cpu_curr(this_cpu);
800- if (preemption_goodness(tsk, p, this_cpu) > 0)
801- tsk->need_resched = 1;
802-#endif
803+ unlock_task_rq(rq, &flags);
804 }
805
806 /*
807- * Careful!
808+ * The SMP message passing code calls this function whenever
809+ * the new task has arrived at the target CPU. We move the
810+ * new task into the local runqueue.
811 *
812- * This has to add the process to the _beginning_ of the
813- * run-queue, not the end. See the comment about "This is
814- * subtle" in the scheduler proper..
815+ * This function must be called with interrupts disabled.
816 */
817-static inline void add_to_runqueue(struct task_struct * p)
818+void sched_task_migrated(task_t *new_task)
819 {
820- list_add(&p->run_list, &runqueue_head);
821- nr_running++;
822+ wait_task_inactive(new_task);
823+ new_task->cpu = smp_processor_id();
824+ wake_up_process(new_task);
825 }
826
827-static inline void move_last_runqueue(struct task_struct * p)
828-{
829- list_del(&p->run_list);
830- list_add_tail(&p->run_list, &runqueue_head);
831-}
832-
833-static inline void move_first_runqueue(struct task_struct * p)
834+/*
835+ * Kick the remote CPU if the task is running currently,
836+ * this code is used by the signal code to signal tasks
837+ * which are in user-mode as quickly as possible.
838+ *
839+ * (Note that we do this lockless - if the task does anything
840+ * while the message is in flight then it will notice the
841+ * sigpending condition anyway.)
842+ */
843+void kick_if_running(task_t * p)
844 {
845- list_del(&p->run_list);
846- list_add(&p->run_list, &runqueue_head);
847+ if (p == task_rq(p)->curr)
848+ resched_task(p);
849 }
850+#endif
851
852 /*
853 * Wake up a process. Put it on the run-queue if it's not
854@@ -348,392 +264,472 @@
855 * "current->state = TASK_RUNNING" to mark yourself runnable
856 * without the overhead of this.
857 */
858-static inline int try_to_wake_up(struct task_struct * p, int synchronous)
859+static int try_to_wake_up(task_t * p, int synchronous)
860 {
861 unsigned long flags;
862 int success = 0;
863+ runqueue_t *rq;
864
865- /*
866- * We want the common case fall through straight, thus the goto.
867- */
868- spin_lock_irqsave(&runqueue_lock, flags);
869+ rq = lock_task_rq(p, &flags);
870 p->state = TASK_RUNNING;
871- if (task_on_runqueue(p))
872- goto out;
873- add_to_runqueue(p);
874- if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
875- reschedule_idle(p);
876- success = 1;
877-out:
878- spin_unlock_irqrestore(&runqueue_lock, flags);
879+ if (!p->array) {
880+ activate_task(p, rq);
881+ if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio))
882+ resched_task(rq->curr);
883+ success = 1;
884+ }
885+ unlock_task_rq(rq, &flags);
886 return success;
887 }
888
889-inline int wake_up_process(struct task_struct * p)
890+int wake_up_process(task_t * p)
891 {
892 return try_to_wake_up(p, 0);
893 }
894
895-static void process_timeout(unsigned long __data)
896+void wake_up_forked_process(task_t * p)
897 {
898- struct task_struct * p = (struct task_struct *) __data;
899+ runqueue_t *rq = this_rq();
900
901- wake_up_process(p);
902+ p->state = TASK_RUNNING;
903+ if (!rt_task(p)) {
904+ current->sleep_avg = current->sleep_avg * PARENT_FORK_PENALTY / 100;
905+ p->sleep_avg = p->sleep_avg * CHILD_FORK_PENALTY / 100;
906+ p->prio = effective_prio(p);
907+ }
908+ spin_lock_irq(&rq->lock);
909+ p->cpu = smp_processor_id();
910+ activate_task(p, rq);
911+ spin_unlock_irq(&rq->lock);
912 }
913
914-/**
915- * schedule_timeout - sleep until timeout
916- * @timeout: timeout value in jiffies
917- *
918- * Make the current task sleep until @timeout jiffies have
919- * elapsed. The routine will return immediately unless
920- * the current task state has been set (see set_current_state()).
921- *
922- * You can set the task state as follows -
923- *
924- * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
925- * pass before the routine returns. The routine will return 0
926- *
927- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
928- * delivered to the current task. In this case the remaining time
929- * in jiffies will be returned, or 0 if the timer expired in time
930- *
931- * The current task state is guaranteed to be TASK_RUNNING when this
932- * routine returns.
933- *
934- * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
935- * the CPU away without a bound on the timeout. In this case the return
936- * value will be %MAX_SCHEDULE_TIMEOUT.
937- *
938- * In all cases the return value is guaranteed to be non-negative.
939- */
940-signed long schedule_timeout(signed long timeout)
941+asmlinkage void schedule_tail(task_t *prev)
942 {
943- struct timer_list timer;
944- unsigned long expire;
945-
946- switch (timeout)
947- {
948- case MAX_SCHEDULE_TIMEOUT:
949- /*
950- * These two special cases are useful to be comfortable
951- * in the caller. Nothing more. We could take
952- * MAX_SCHEDULE_TIMEOUT from one of the negative value
953- * but I' d like to return a valid offset (>=0) to allow
954- * the caller to do everything it want with the retval.
955- */
956- schedule();
957- goto out;
958- default:
959- /*
960- * Another bit of PARANOID. Note that the retval will be
961- * 0 since no piece of kernel is supposed to do a check
962- * for a negative retval of schedule_timeout() (since it
963- * should never happens anyway). You just have the printk()
964- * that will tell you if something is gone wrong and where.
965- */
966- if (timeout < 0)
967- {
968- printk(KERN_ERR "schedule_timeout: wrong timeout "
969- "value %lx from %p\n", timeout,
970- __builtin_return_address(0));
971- current->state = TASK_RUNNING;
972- goto out;
973- }
974- }
975+ spin_unlock_irq(&this_rq()->lock);
976+}
977
978- expire = timeout + jiffies;
979+static inline void context_switch(task_t *prev, task_t *next)
980+{
981+ struct mm_struct *mm = next->mm;
982+ struct mm_struct *oldmm = prev->active_mm;
983
984- init_timer(&timer);
985- timer.expires = expire;
986- timer.data = (unsigned long) current;
987- timer.function = process_timeout;
988+ prepare_to_switch();
989
990- add_timer(&timer);
991- schedule();
992- del_timer_sync(&timer);
993+ if (unlikely(!mm)) {
994+ next->active_mm = oldmm;
995+ atomic_inc(&oldmm->mm_count);
996+ enter_lazy_tlb(oldmm, next, smp_processor_id());
997+ } else
998+ switch_mm(oldmm, mm, next, smp_processor_id());
999
1000- timeout = expire - jiffies;
1001+ if (unlikely(!prev->mm)) {
1002+ prev->active_mm = NULL;
1003+ mmdrop(oldmm);
1004+ }
1005
1006- out:
1007- return timeout < 0 ? 0 : timeout;
1008+ /*
1009+ * Here we just switch the register state and the stack. There are
1010+ * 3 processes affected by a context switch:
1011+ *
1012+ * prev ==> .... ==> (last => next)
1013+ *
1014+ * It's the 'much more previous' 'prev' that is on next's stack,
1015+ * but prev is set to (the just run) 'last' process by switch_to().
1016+ * This might sound slightly confusing but makes tons of sense.
1017+ */
1018+ switch_to(prev, next, prev);
1019 }
1020
1021-/*
1022- * schedule_tail() is getting called from the fork return path. This
1023- * cleans up all remaining scheduler things, without impacting the
1024- * common case.
1025- */
1026-static inline void __schedule_tail(struct task_struct *prev)
1027+unsigned long nr_running(void)
1028 {
1029-#ifdef CONFIG_SMP
1030- int policy;
1031-
1032- /*
1033- * prev->policy can be written from here only before `prev'
1034- * can be scheduled (before setting prev->cpus_runnable to ~0UL).
1035- * Of course it must also be read before allowing prev
1036- * to be rescheduled, but since the write depends on the read
1037- * to complete, wmb() is enough. (the spin_lock() acquired
1038- * before setting cpus_runnable is not enough because the spin_lock()
1039- * common code semantics allows code outside the critical section
1040- * to enter inside the critical section)
1041- */
1042- policy = prev->policy;
1043- prev->policy = policy & ~SCHED_YIELD;
1044- wmb();
1045+ unsigned long i, sum = 0;
1046
1047- /*
1048- * fast path falls through. We have to clear cpus_runnable before
1049- * checking prev->state to avoid a wakeup race. Protect against
1050- * the task exiting early.
1051- */
1052- task_lock(prev);
1053- task_release_cpu(prev);
1054- mb();
1055- if (prev->state == TASK_RUNNING)
1056- goto needs_resched;
1057+ for (i = 0; i < smp_num_cpus; i++)
1058+ sum += cpu_rq(cpu_logical_map(i))->nr_running;
1059
1060-out_unlock:
1061- task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
1062- return;
1063+ return sum;
1064+}
1065
1066- /*
1067- * Slow path - we 'push' the previous process and
1068- * reschedule_idle() will attempt to find a new
1069- * processor for it. (but it might preempt the
1070- * current process as well.) We must take the runqueue
1071- * lock and re-check prev->state to be correct. It might
1072- * still happen that this process has a preemption
1073- * 'in progress' already - but this is not a problem and
1074- * might happen in other circumstances as well.
1075- */
1076-needs_resched:
1077- {
1078- unsigned long flags;
1079+unsigned long nr_context_switches(void)
1080+{
1081+ unsigned long i, sum = 0;
1082
1083- /*
1084- * Avoid taking the runqueue lock in cases where
1085- * no preemption-check is necessery:
1086- */
1087- if ((prev == idle_task(smp_processor_id())) ||
1088- (policy & SCHED_YIELD))
1089- goto out_unlock;
1090+ for (i = 0; i < smp_num_cpus; i++)
1091+ sum += cpu_rq(cpu_logical_map(i))->nr_switches;
1092
1093- spin_lock_irqsave(&runqueue_lock, flags);
1094- if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
1095- reschedule_idle(prev);
1096- spin_unlock_irqrestore(&runqueue_lock, flags);
1097- goto out_unlock;
1098- }
1099-#else
1100- prev->policy &= ~SCHED_YIELD;
1101-#endif /* CONFIG_SMP */
1102+ return sum;
1103 }
1104
1105-asmlinkage void schedule_tail(struct task_struct *prev)
1106+#if CONFIG_SMP
1107+/*
1108+ * Lock the busiest runqueue as well, this_rq is locked already.
1109+ * Recalculate nr_running if we have to drop the runqueue lock.
1110+ */
1111+static inline unsigned int double_lock_balance(runqueue_t *this_rq,
1112+ runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
1113 {
1114- __schedule_tail(prev);
1115+ if (unlikely(!spin_trylock(&busiest->lock))) {
1116+ if (busiest < this_rq) {
1117+ spin_unlock(&this_rq->lock);
1118+ spin_lock(&busiest->lock);
1119+ spin_lock(&this_rq->lock);
1120+ /* Need to recalculate nr_running */
1121+ if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1122+ nr_running = this_rq->nr_running;
1123+ else
1124+ nr_running = this_rq->prev_nr_running[this_cpu];
1125+ } else
1126+ spin_lock(&busiest->lock);
1127+ }
1128+ return nr_running;
1129 }
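double_lock_balance() is the "ascending &runqueue" rule from the header comment in action: when the trylock fails and the busiest runqueue sorts below this one, the already-held local lock is dropped and both locks are re-taken in address order, so two CPUs can never end up each holding one runqueue lock while waiting for the other. The same idiom in a generic userspace form (pthread mutexes standing in for runqueue spinlocks, illustration only):

    #include <pthread.h>

    /* Always take two locks in a fixed (address) order so that two threads
     * locking the same pair can never deadlock against each other. */
    static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if (a < b) {
                    pthread_mutex_lock(a);
                    pthread_mutex_lock(b);
            } else {
                    pthread_mutex_lock(b);
                    pthread_mutex_lock(a);
            }
    }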
1130
1131 /*
1132- * 'schedule()' is the scheduler function. It's a very simple and nice
1133- * scheduler: it's not perfect, but certainly works for most things.
1134- *
1135- * The goto is "interesting".
1136+ * Current runqueue is empty, or rebalance tick: if there is an
1137+ * imbalance (current runqueue is too short) then pull from
1138+ * busiest runqueue(s).
1139 *
1140- * NOTE!! Task 0 is the 'idle' task, which gets called when no other
1141- * tasks can run. It can not be killed, and it cannot sleep. The 'state'
1142- * information in task[0] is never used.
1143+ * We call this with the current runqueue locked,
1144+ * irqs disabled.
1145 */
1146-asmlinkage void schedule(void)
1147+static void load_balance(runqueue_t *this_rq, int idle)
1148 {
1149- struct schedule_data * sched_data;
1150- struct task_struct *prev, *next, *p;
1151- struct list_head *tmp;
1152- int this_cpu, c;
1153+ int imbalance, nr_running, load, max_load,
1154+ idx, i, this_cpu = smp_processor_id();
1155+ task_t *next = this_rq->idle, *tmp;
1156+ runqueue_t *busiest, *rq_src;
1157+ prio_array_t *array;
1158+ list_t *head, *curr;
1159
1160+ /*
1161+ * We search all runqueues to find the most busy one.
1162+ * We do this lockless to reduce cache-bouncing overhead,
1163+ * we re-check the 'best' source CPU later on again, with
1164+ * the lock held.
1165+ *
1166+ * We fend off statistical fluctuations in runqueue lengths by
1167+ * saving the runqueue length during the previous load-balancing
1168+ * operation and using the smaller of the current and saved lengths.
1169+ * If a runqueue has been long for a sufficient amount of time then
1170+ * we recognize it and pull tasks from it.
1171+ *
1172+ * The 'current runqueue length' is a statistical maximum variable:
1173+ * for that one we take the longer value - to avoid fluctuations in
1174+ * the other direction. So for a load-balance to happen we need a
1175+ * stable, long runqueue on the busiest CPU and a stable, short
1176+ * runqueue on the local CPU.
1177+ *
1178+ * We make an exception if this CPU is about to become idle - in
1179+ * that case we are less picky about moving a task across CPUs and
1180+ * take what can be taken.
1181+ */
1182+ if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1183+ nr_running = this_rq->nr_running;
1184+ else
1185+ nr_running = this_rq->prev_nr_running[this_cpu];
1186
1187- spin_lock_prefetch(&runqueue_lock);
1188+ busiest = NULL;
1189+ max_load = 1;
1190+ for (i = 0; i < smp_num_cpus; i++) {
1191+ rq_src = cpu_rq(cpu_logical_map(i));
1192+ if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
1193+ load = rq_src->nr_running;
1194+ else
1195+ load = this_rq->prev_nr_running[i];
1196+ this_rq->prev_nr_running[i] = rq_src->nr_running;
1197+
1198+ if ((load > max_load) && (rq_src != this_rq)) {
1199+ busiest = rq_src;
1200+ max_load = load;
1201+ }
1202+ }
1203
1204- if (!current->active_mm) BUG();
1205-need_resched_back:
1206- prev = current;
1207- this_cpu = prev->processor;
1208+ if (likely(!busiest))
1209+ return;
1210
1211- if (unlikely(in_interrupt())) {
1212- printk("Scheduling in interrupt\n");
1213- BUG();
1214- }
1215+ imbalance = (max_load - nr_running) / 2;
1216
1217- release_kernel_lock(prev, this_cpu);
1218+ /* It needs an at least ~25% imbalance to trigger balancing. */
1219+ if (!idle && (imbalance < (max_load + 3)/4))
1220+ return;
1221
1222+ nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
1223 /*
1224- * 'sched_data' is protected by the fact that we can run
1225- * only one process per CPU.
1226+ * Make sure nothing changed since we checked the
1227+ * runqueue length.
1228 */
1229- sched_data = & aligned_data[this_cpu].schedule_data;
1230-
1231- spin_lock_irq(&runqueue_lock);
1232-
1233- /* move an exhausted RR process to be last.. */
1234- if (unlikely(prev->policy == SCHED_RR))
1235- if (!prev->counter) {
1236- prev->counter = NICE_TO_TICKS(prev->nice);
1237- move_last_runqueue(prev);
1238- }
1239-
1240- switch (prev->state) {
1241- case TASK_INTERRUPTIBLE:
1242- if (signal_pending(prev)) {
1243- prev->state = TASK_RUNNING;
1244- break;
1245- }
1246- default:
1247- del_from_runqueue(prev);
1248- case TASK_RUNNING:;
1249- }
1250- prev->need_resched = 0;
1251+ if (busiest->nr_running <= this_rq->nr_running + 1)
1252+ goto out_unlock;
1253
1254 /*
1255- * this is the scheduler proper:
1256+ * We first consider expired tasks. Those will likely not be
1257+ * executed in the near future, and they are most likely to
1258+ * be cache-cold, thus switching CPUs has the least effect
1259+ * on them.
1260 */
1261+ if (busiest->expired->nr_active)
1262+ array = busiest->expired;
1263+ else
1264+ array = busiest->active;
1265
1266-repeat_schedule:
1267+new_array:
1268 /*
1269- * Default process to select..
1270+ * Load-balancing does not affect RT tasks, so we start
1271+ * searching at MAX_RT_PRIO (the first SCHED_OTHER priority).
1272 */
1273- next = idle_task(this_cpu);
1274- c = -1000;
1275- list_for_each(tmp, &runqueue_head) {
1276- p = list_entry(tmp, struct task_struct, run_list);
1277- if (can_schedule(p, this_cpu)) {
1278- int weight = goodness(p, this_cpu, prev->active_mm);
1279- if (weight > c)
1280- c = weight, next = p;
1281+ idx = MAX_RT_PRIO;
1282+skip_bitmap:
1283+ idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1284+ if (idx == MAX_PRIO) {
1285+ if (array == busiest->expired) {
1286+ array = busiest->active;
1287+ goto new_array;
1288 }
1289+ goto out_unlock;
1290 }
1291
1292- /* Do we need to re-calculate counters? */
1293- if (unlikely(!c)) {
1294- struct task_struct *p;
1295-
1296- spin_unlock_irq(&runqueue_lock);
1297- read_lock(&tasklist_lock);
1298- for_each_task(p)
1299- p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
1300- read_unlock(&tasklist_lock);
1301- spin_lock_irq(&runqueue_lock);
1302- goto repeat_schedule;
1303+ head = array->queue + idx;
1304+ curr = head->prev;
1305+skip_queue:
1306+ tmp = list_entry(curr, task_t, run_list);
1307+
1308+ /*
1309+ * We do not migrate tasks that are:
1310+ * 1) running (obviously), or
1311+ * 2) cannot be migrated to this CPU due to cpus_allowed, or
1312+ * 3) are cache-hot on their current CPU.
1313+ */
1314+
1315+#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
1316+ ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
1317+ ((p) != (rq)->curr) && \
1318+ (tmp->cpus_allowed & (1 << (this_cpu))))
1319+
1320+ if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
1321+ curr = curr->next;
1322+ if (curr != head)
1323+ goto skip_queue;
1324+ idx++;
1325+ goto skip_bitmap;
1326+ }
1327+ next = tmp;
1328+ /*
1329+ * take the task out of the other runqueue and
1330+ * put it into this one:
1331+ */
1332+ dequeue_task(next, array);
1333+ busiest->nr_running--;
1334+ next->cpu = this_cpu;
1335+ this_rq->nr_running++;
1336+ enqueue_task(next, this_rq->active);
1337+ if (next->prio < current->prio)
1338+ current->need_resched = 1;
1339+ if (!idle && --imbalance) {
1340+ if (array == busiest->expired) {
1341+ array = busiest->active;
1342+ goto new_array;
1343+ }
1344 }
1345+out_unlock:
1346+ spin_unlock(&busiest->lock);
1347+}
1348+
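The trigger arithmetic in load_balance() above, in numbers: half of the length difference is the imbalance, and on a busy CPU balancing only proceeds when that imbalance is at least roughly a quarter of the busiest queue's length. A minimal check:

    /* Illustration only: the busy-path rebalance trigger used above. */
    #include <stdio.h>

    static int should_balance(int nr_running, int max_load)
    {
            int imbalance = (max_load - nr_running) / 2;

            return imbalance >= (max_load + 3) / 4;     /* ~25% imbalance needed */
    }

    int main(void)
    {
            printf("%d\n", should_balance(3, 9));   /* 1: (9-3)/2 = 3 >= 3 */
            printf("%d\n", should_balance(3, 5));   /* 0: (5-3)/2 = 1 <  2 */
            return 0;
    }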
1349+/*
1350+ * Rebalancing is driven from the timer tick on every CPU: either
1351+ * via idle_tick() or from scheduler_tick() on a busy CPU. Our
1352+ * balancing frequency and aggressiveness depend on whether the
1353+ * CPU is idle or not.
1354+ *
1355+ * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
1356+ * systems with HZ=100, every 10 msecs.)
1357+ */
1358+#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
1359+#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1360+
1361+static inline void idle_tick(void)
1362+{
1363+ if (jiffies % IDLE_REBALANCE_TICK)
1364+ return;
1365+ spin_lock(&this_rq()->lock);
1366+ load_balance(this_rq(), 1);
1367+ spin_unlock(&this_rq()->lock);
1368+}
1369+
1370+#endif
1371+
1372+/*
1373+ * This function gets called by the timer code, with HZ frequency.
1374+ * We call it with interrupts disabled.
1375+ */
1376+void scheduler_tick(task_t *p)
1377+{
1378+ runqueue_t *rq = this_rq();
1379+#if CONFIG_SMP
1380+ unsigned long now = jiffies;
1381
1382+ if (p == rq->idle)
1383+ return idle_tick();
1384+#endif
1385+ /* Task might have expired already, but not scheduled off yet */
1386+ if (p->array != rq->active) {
1387+ p->need_resched = 1;
1388+ return;
1389+ }
1390+ spin_lock(&rq->lock);
1391+ if (unlikely(rt_task(p))) {
1392+ /*
1393+ * RR tasks need a special form of timeslice management.
1394+ * FIFO tasks have no timeslices.
1395+ */
1396+ if ((p->policy == SCHED_RR) && !--p->time_slice) {
1397+ p->time_slice = NICE_TO_TIMESLICE(p->__nice);
1398+ p->need_resched = 1;
1399+
1400+ /* put it at the end of the queue: */
1401+ dequeue_task(p, rq->active);
1402+ enqueue_task(p, rq->active);
1403+ }
1404+ goto out;
1405+ }
1406 /*
1407- * from this point on nothing can prevent us from
1408- * switching to the next task, save this fact in
1409- * sched_data.
1410- */
1411- sched_data->curr = next;
1412- task_set_cpu(next, this_cpu);
1413- spin_unlock_irq(&runqueue_lock);
1414-
1415- if (unlikely(prev == next)) {
1416- /* We won't go through the normal tail, so do this by hand */
1417- prev->policy &= ~SCHED_YIELD;
1418- goto same_process;
1419+ * The task was running during this tick - update the
1420+ * time slice counter and the sleep average. Note: we
1421+ * do not update a process's priority until it either
1422+ * goes to sleep or uses up its timeslice. This makes
1423+ * it possible for interactive tasks to use up their
1424+ * timeslices at their highest priority levels.
1425+ */
1426+ if (p->sleep_avg)
1427+ p->sleep_avg--;
1428+ if (!--p->time_slice) {
1429+ dequeue_task(p, rq->active);
1430+ p->need_resched = 1;
1431+ p->prio = effective_prio(p);
1432+ p->time_slice = NICE_TO_TIMESLICE(p->__nice);
1433+
1434+ if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1435+ if (!rq->expired_timestamp)
1436+ rq->expired_timestamp = jiffies;
1437+ enqueue_task(p, rq->expired);
1438+ } else
1439+ enqueue_task(p, rq->active);
1440 }
1441+out:
1442+#if CONFIG_SMP
1443+ if (!(now % BUSY_REBALANCE_TICK))
1444+ load_balance(rq, 0);
1445+#endif
1446+ spin_unlock(&rq->lock);
1447+}
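The EXPIRED_STARVING() deadline used above scales with load: with nr_running tasks on the queue, the interactive-reinsertion favour is cut off once the first expired task has been waiting longer than STARVATION_LIMIT * nr_running + 1 jiffies (STARVATION_LIMIT comes from the sched.h half of the patch), after which even interactive tasks go to the expired array and the pending array switch in schedule() cannot be delayed indefinitely. With eight runnable tasks, for example, that deadline is roughly eight times longer than with a single competitor, which is the load-dependence mentioned in the comment above EXPIRED_STARVING().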
1448
1449-#ifdef CONFIG_SMP
1450- /*
1451- * maintain the per-process 'last schedule' value.
1452- * (this has to be recalculated even if we reschedule to
1453- * the same process) Currently this is only used on SMP,
1454- * and it's approximate, so we do not have to maintain
1455- * it while holding the runqueue spinlock.
1456- */
1457- sched_data->last_schedule = get_cycles();
1458+void scheduling_functions_start_here(void) { }
1459
1460- /*
1461- * We drop the scheduler lock early (it's a global spinlock),
1462- * thus we have to lock the previous process from getting
1463- * rescheduled during switch_to().
1464- */
1465+/*
1466+ * 'schedule()' is the main scheduler function.
1467+ */
1468+asmlinkage void schedule(void)
1469+{
1470+ task_t *prev = current, *next;
1471+ runqueue_t *rq = this_rq();
1472+ prio_array_t *array;
1473+ list_t *queue;
1474+ int idx;
1475
1476-#endif /* CONFIG_SMP */
1477+ if (unlikely(in_interrupt()))
1478+ BUG();
1479+ release_kernel_lock(prev, smp_processor_id());
1480+ spin_lock_irq(&rq->lock);
1481
1482- kstat.context_swtch++;
1483- /*
1484- * there are 3 processes which are affected by a context switch:
1485- *
1486- * prev == .... ==> (last => next)
1487- *
1488- * It's the 'much more previous' 'prev' that is on next's stack,
1489- * but prev is set to (the just run) 'last' process by switch_to().
1490- * This might sound slightly confusing but makes tons of sense.
1491- */
1492- prepare_to_switch();
1493- {
1494- struct mm_struct *mm = next->mm;
1495- struct mm_struct *oldmm = prev->active_mm;
1496- if (!mm) {
1497- if (next->active_mm) BUG();
1498- next->active_mm = oldmm;
1499- atomic_inc(&oldmm->mm_count);
1500- enter_lazy_tlb(oldmm, next, this_cpu);
1501- } else {
1502- if (next->active_mm != mm) BUG();
1503- switch_mm(oldmm, mm, next, this_cpu);
1504+ switch (prev->state) {
1505+ case TASK_RUNNING:
1506+ prev->sleep_timestamp = jiffies;
1507+ break;
1508+ case TASK_INTERRUPTIBLE:
1509+ if (unlikely(signal_pending(prev))) {
1510+ prev->state = TASK_RUNNING;
1511+ prev->sleep_timestamp = jiffies;
1512+ break;
1513 }
1514+ default:
1515+ deactivate_task(prev, rq);
1516+ }
1517+#if CONFIG_SMP
1518+pick_next_task:
1519+#endif
1520+ if (unlikely(!rq->nr_running)) {
1521+#if CONFIG_SMP
1522+ load_balance(rq, 1);
1523+ if (rq->nr_running)
1524+ goto pick_next_task;
1525+#endif
1526+ next = rq->idle;
1527+ rq->expired_timestamp = 0;
1528+ goto switch_tasks;
1529+ }
1530
1531- if (!prev->mm) {
1532- prev->active_mm = NULL;
1533- mmdrop(oldmm);
1534- }
1535+ array = rq->active;
1536+ if (unlikely(!array->nr_active)) {
1537+ /*
1538+ * Switch the active and expired arrays.
1539+ */
1540+ rq->active = rq->expired;
1541+ rq->expired = array;
1542+ array = rq->active;
1543+ rq->expired_timestamp = 0;
1544 }
1545
1546- /*
1547- * This just switches the register state and the
1548- * stack.
1549- */
1550- switch_to(prev, next, prev);
1551- __schedule_tail(prev);
1552+ idx = sched_find_first_bit(array->bitmap);
1553+ queue = array->queue + idx;
1554+ next = list_entry(queue->next, task_t, run_list);
1555+
1556+switch_tasks:
1557+ prefetch(next);
1558+ prev->need_resched = 0;
1559+
1560+ if (likely(prev != next)) {
1561+ rq->nr_switches++;
1562+ rq->curr = next;
1563+ context_switch(prev, next);
1564+ /*
1565+ * The runqueue pointer might be from another CPU
1566+ * if the new task was last running on a different
1567+ * CPU - thus re-load it.
1568+ */
1569+ barrier();
1570+ rq = this_rq();
1571+ }
1572+ spin_unlock_irq(&rq->lock);
1573
1574-same_process:
1575 reacquire_kernel_lock(current);
1576- if (current->need_resched)
1577- goto need_resched_back;
1578 return;
1579 }
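Note what replaced the old counter-recalculation loop deleted above: when the active array runs dry, schedule() simply swaps the active and expired pointers - every expired task already received a fresh timeslice and a recomputed priority in scheduler_tick(), so the switch is O(1) regardless of the number of tasks. A sketch of just that swap, for illustration:

    struct prio_array;
    struct rq_like { struct prio_array *active, *expired; };

    static void array_switch(struct rq_like *rq)
    {
            struct prio_array *tmp = rq->active;

            rq->active  = rq->expired;  /* expired tasks all hold fresh timeslices */
            rq->expired = tmp;          /* old active array becomes the empty expired one */
    }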
1580
1581 /*
1582- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything
1583- * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
1584- * non-exclusive tasks and one exclusive task.
1585+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
1586+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
1587+ * number) then we wake all the non-exclusive tasks and one exclusive task.
1588 *
1589 * There are circumstances in which we can try to wake a task which has already
1590- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
1591- * in this (rare) case, and we handle it by contonuing to scan the queue.
1592+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
1593+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
1594 */
1595 static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
1596 int nr_exclusive, const int sync)
1597 {
1598 struct list_head *tmp;
1599- struct task_struct *p;
1600+ task_t *p;
1601
1602- CHECK_MAGIC_WQHEAD(q);
1603- WQ_CHECK_LIST_HEAD(&q->task_list);
1604-
1605 list_for_each(tmp,&q->task_list) {
1606 unsigned int state;
1607- wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1608+ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1609
1610- CHECK_MAGIC(curr->__magic);
1611 p = curr->task;
1612 state = p->state;
1613- if (state & mode) {
1614- WQ_NOTE_WAKER(curr);
1615- if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
1616- break;
1617- }
1618+ if ((state & mode) &&
1619+ try_to_wake_up(p, sync) &&
1620+ ((curr->flags & WQ_FLAG_EXCLUSIVE) &&
1621+ !--nr_exclusive))
1622+ break;
1623 }
1624 }
1625
1626@@ -850,8 +846,70 @@
1627 return timeout;
1628 }
1629
1630+/*
1631+ * Change the current task's CPU affinity. Migrate the process to a
1632+ * proper CPU and schedule away if the current CPU is removed from
1633+ * the allowed bitmask.
1634+ */
1635+void set_cpus_allowed(task_t *p, unsigned long new_mask)
1636+{
1637+ new_mask &= cpu_online_map;
1638+ if (!new_mask)
1639+ BUG();
1640+
1641+ p->cpus_allowed = new_mask;
1642+ /*
1643+ * Can the task run on the current CPU? If not then
1644+ * migrate the process off to a proper CPU.
1645+ */
1646+ if (new_mask & (1UL << smp_processor_id()))
1647+ return;
1648+#if CONFIG_SMP
1649+ current->state = TASK_UNINTERRUPTIBLE;
1650+ smp_migrate_task(__ffs(new_mask), current);
1651+
1652+ schedule();
1653+#endif
1654+}
1655+
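set_cpus_allowed() is a new interface in this patch: the mask is ANDed with cpu_online_map (an empty result is a BUG), and if the calling CPU is excluded the task marks itself TASK_UNINTERRUPTIBLE and hands itself to smp_migrate_task(), after which sched_task_migrated() wakes it on a permitted CPU. A hedged usage sketch, kernel context and both CPUs online assumed:

    /* Illustration only: pin the current task to CPUs 0 and 2. */
    set_cpus_allowed(current, (1UL << 0) | (1UL << 2));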
1656 void scheduling_functions_end_here(void) { }
1657
1658+void set_user_nice(task_t *p, long nice)
1659+{
1660+ unsigned long flags;
1661+ prio_array_t *array;
1662+ runqueue_t *rq;
1663+
1664+ if (p->__nice == nice)
1665+ return;
1666+ /*
1667+ * We have to be careful, if called from sys_setpriority(),
1668+ * the task might be in the middle of scheduling on another CPU.
1669+ */
1670+ rq = lock_task_rq(p, &flags);
1671+ if (rt_task(p)) {
1672+ p->__nice = nice;
1673+ goto out_unlock;
1674+ }
1675+ array = p->array;
1676+ if (array)
1677+ dequeue_task(p, array);
1678+ p->__nice = nice;
1679+ p->prio = NICE_TO_PRIO(nice);
1680+ if (array) {
1681+ enqueue_task(p, array);
1682+ /*
1683+ * If the task is running and lowered its priority,
1684+ * or increased its priority then reschedule its CPU:
1685+ */
1686+ if ((nice < p->__nice) ||
1687+ ((p->__nice < nice) && (p == rq->curr)))
1688+ resched_task(rq->curr);
1689+ }
1690+out_unlock:
1691+ unlock_task_rq(rq, &flags);
1692+}
1693+
1694 #ifndef __alpha__
1695
1696 /*
1697@@ -862,7 +920,7 @@
1698
1699 asmlinkage long sys_nice(int increment)
1700 {
1701- long newprio;
1702+ long nice;
1703
1704 /*
1705 * Setpriority might change our priority at the same moment.
1706@@ -878,32 +936,30 @@
1707 if (increment > 40)
1708 increment = 40;
1709
1710- newprio = current->nice + increment;
1711- if (newprio < -20)
1712- newprio = -20;
1713- if (newprio > 19)
1714- newprio = 19;
1715- current->nice = newprio;
1716+ nice = current->__nice + increment;
1717+ if (nice < -20)
1718+ nice = -20;
1719+ if (nice > 19)
1720+ nice = 19;
1721+ set_user_nice(current, nice);
1722 return 0;
1723 }
1724
1725 #endif
1726
1727-static inline struct task_struct *find_process_by_pid(pid_t pid)
1728+static inline task_t *find_process_by_pid(pid_t pid)
1729 {
1730- struct task_struct *tsk = current;
1731-
1732- if (pid)
1733- tsk = find_task_by_pid(pid);
1734- return tsk;
1735+ return pid ? find_task_by_pid(pid) : current;
1736 }
1737
1738-static int setscheduler(pid_t pid, int policy,
1739- struct sched_param *param)
1740+static int setscheduler(pid_t pid, int policy, struct sched_param *param)
1741 {
1742 struct sched_param lp;
1743- struct task_struct *p;
1744+ prio_array_t *array;
1745+ unsigned long flags;
1746+ runqueue_t *rq;
1747 int retval;
1748+ task_t *p;
1749
1750 retval = -EINVAL;
1751 if (!param || pid < 0)
1752@@ -917,14 +973,19 @@
1753 * We play safe to avoid deadlocks.
1754 */
1755 read_lock_irq(&tasklist_lock);
1756- spin_lock(&runqueue_lock);
1757
1758 p = find_process_by_pid(pid);
1759
1760 retval = -ESRCH;
1761 if (!p)
1762- goto out_unlock;
1763-
1764+ goto out_unlock_tasklist;
1765+
1766+ /*
1767+ * To be able to change p->policy safely, the appropriate
1768+ * runqueue lock must be held.
1769+ */
1770+ rq = lock_task_rq(p, &flags);
1771+
1772 if (policy < 0)
1773 policy = p->policy;
1774 else {
1775@@ -945,30 +1006,36 @@
1776 goto out_unlock;
1777
1778 retval = -EPERM;
1779- if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1780+ if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1781 !capable(CAP_SYS_NICE))
1782 goto out_unlock;
1783 if ((current->euid != p->euid) && (current->euid != p->uid) &&
1784 !capable(CAP_SYS_NICE))
1785 goto out_unlock;
1786
1787+ array = p->array;
1788+ if (array)
1789+ deactivate_task(p, task_rq(p));
1790 retval = 0;
1791 p->policy = policy;
1792 p->rt_priority = lp.sched_priority;
1793- if (task_on_runqueue(p))
1794- move_first_runqueue(p);
1795-
1796- current->need_resched = 1;
1797+ if (rt_task(p))
1798+ p->prio = 99-p->rt_priority;
1799+ else
1800+ p->prio = NICE_TO_PRIO(p->__nice);
1801+ if (array)
1802+ activate_task(p, task_rq(p));
1803
1804 out_unlock:
1805- spin_unlock(&runqueue_lock);
1806+ unlock_task_rq(rq, &flags);
1807+out_unlock_tasklist:
1808 read_unlock_irq(&tasklist_lock);
1809
1810 out_nounlock:
1811 return retval;
1812 }
1813
1814-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1815+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1816 struct sched_param *param)
1817 {
1818 return setscheduler(pid, policy, param);
1819@@ -981,7 +1048,7 @@
1820
1821 asmlinkage long sys_sched_getscheduler(pid_t pid)
1822 {
1823- struct task_struct *p;
1824+ task_t *p;
1825 int retval;
1826
1827 retval = -EINVAL;
1828@@ -992,7 +1059,7 @@
1829 read_lock(&tasklist_lock);
1830 p = find_process_by_pid(pid);
1831 if (p)
1832- retval = p->policy & ~SCHED_YIELD;
1833+ retval = p->policy;
1834 read_unlock(&tasklist_lock);
1835
1836 out_nounlock:
1837@@ -1001,7 +1068,7 @@
1838
1839 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
1840 {
1841- struct task_struct *p;
1842+ task_t *p;
1843 struct sched_param lp;
1844 int retval;
1845
1846@@ -1032,42 +1099,38 @@
1847
1848 asmlinkage long sys_sched_yield(void)
1849 {
1850+ runqueue_t *rq = this_rq();
1851+ prio_array_t *array;
1852+
1853 /*
1854- * Trick. sched_yield() first counts the number of truly
1855- * 'pending' runnable processes, then returns if it's
1856- * only the current processes. (This test does not have
1857- * to be atomic.) In threaded applications this optimization
1858- * gets triggered quite often.
1859+ * Decrease the yielding task's priority by one, to avoid
1860+ * livelocks. This priority loss is temporary, it's recovered
1861+ * once the current timeslice expires.
1862+ *
1863+ * If priority is already MAX_PRIO-1 then we still
1864+ * roundrobin the task within the runlist.
1865 */
1866-
1867- int nr_pending = nr_running;
1868-
1869-#if CONFIG_SMP
1870- int i;
1871-
1872- // Subtract non-idle processes running on other CPUs.
1873- for (i = 0; i < smp_num_cpus; i++) {
1874- int cpu = cpu_logical_map(i);
1875- if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
1876- nr_pending--;
1877+ spin_lock_irq(&rq->lock);
1878+ array = current->array;
1879+ /*
1880+ * If the task has reached maximum priority (or is a RT task)
1881+ * then just requeue the task to the end of the runqueue:
1882+ */
1883+ if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
1884+ list_del(&current->run_list);
1885+ list_add_tail(&current->run_list, array->queue + current->prio);
1886+ } else {
1887+ list_del(&current->run_list);
1888+ if (list_empty(array->queue + current->prio))
1889+ __clear_bit(current->prio, array->bitmap);
1890+ current->prio++;
1891+ list_add_tail(&current->run_list, array->queue + current->prio);
1892+ __set_bit(current->prio, array->bitmap);
1893 }
1894-#else
1895- // on UP this process is on the runqueue as well
1896- nr_pending--;
1897-#endif
1898- if (nr_pending) {
1899- /*
1900- * This process can only be rescheduled by us,
1901- * so this is safe without any locking.
1902- */
1903- if (current->policy == SCHED_OTHER)
1904- current->policy |= SCHED_YIELD;
1905- current->need_resched = 1;
1906+ spin_unlock(&rq->lock);
1907+
1908+ schedule();
1909
1910- spin_lock_irq(&runqueue_lock);
1911- move_last_runqueue(current);
1912- spin_unlock_irq(&runqueue_lock);
1913- }
1914 return 0;
1915 }
1916
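The rewritten sys_sched_yield() above no longer counts runnable tasks on other CPUs; it simply requeues the caller within its own priority array, demoting it by one level unless it already sits at MAX_PRIO-1 or is an RT task. A user-space model of that requeue (the toy_* names and the tiny 8-level array are illustrative only, not the kernel's task_t/prio_array_t):

    #include <stdio.h>
    #include <string.h>

    #define TOY_MAX_PRIO 8                  /* stands in for MAX_PRIO */

    struct toy_task {
        int prio;
        const char *name;
        struct toy_task *next;              /* simplistic singly linked queue */
    };

    struct toy_array {
        unsigned long bitmap;               /* bit p set => queue[p] non-empty */
        struct toy_task *queue[TOY_MAX_PRIO];
    };

    static void enqueue_tail(struct toy_array *a, struct toy_task *t)
    {
        struct toy_task **pp = &a->queue[t->prio];

        while (*pp)
            pp = &(*pp)->next;
        *pp = t;
        t->next = NULL;
        a->bitmap |= 1UL << t->prio;
    }

    static void dequeue(struct toy_array *a, struct toy_task *t)
    {
        struct toy_task **pp = &a->queue[t->prio];

        while (*pp != t)
            pp = &(*pp)->next;
        *pp = t->next;
        if (!a->queue[t->prio])
            a->bitmap &= ~(1UL << t->prio);
    }

    /* Same policy as the patch: round-robin at the last level, else demote
     * by one level; the loss is recovered when the timeslice expires. */
    static void toy_yield(struct toy_array *a, struct toy_task *t)
    {
        dequeue(a, t);
        if (t->prio < TOY_MAX_PRIO - 1)
            t->prio++;
        enqueue_tail(a, t);
    }

    int main(void)
    {
        struct toy_array rq;
        struct toy_task t = { .prio = 3, .name = "yielder" };

        memset(&rq, 0, sizeof(rq));
        enqueue_tail(&rq, &t);
        toy_yield(&rq, &t);
        printf("%s now at prio %d, bitmap %#lx\n", t.name, t.prio, rq.bitmap);
        return 0;
    }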
1917@@ -1105,7 +1168,7 @@
1918 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1919 {
1920 struct timespec t;
1921- struct task_struct *p;
1922+ task_t *p;
1923 int retval = -EINVAL;
1924
1925 if (pid < 0)
1926@@ -1115,8 +1178,8 @@
1927 read_lock(&tasklist_lock);
1928 p = find_process_by_pid(pid);
1929 if (p)
1930- jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
1931- &t);
1932+ jiffies_to_timespec(p->policy & SCHED_FIFO ?
1933+ 0 : NICE_TO_TIMESLICE(p->__nice), &t);
1934 read_unlock(&tasklist_lock);
1935 if (p)
1936 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1937@@ -1124,14 +1187,14 @@
1938 return retval;
1939 }
1940
1941-static void show_task(struct task_struct * p)
1942+static void show_task(task_t * p)
1943 {
1944 unsigned long free = 0;
1945 int state;
1946 static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1947
1948 printk("%-13.13s ", p->comm);
1949- state = p->state ? ffz(~p->state) + 1 : 0;
1950+ state = p->state ? __ffs(p->state) + 1 : 0;
1951 if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
1952 printk(stat_nam[state]);
1953 else
1954@@ -1172,7 +1235,7 @@
1955 printk(" (NOTLB)\n");
1956
1957 {
1958- extern void show_trace_task(struct task_struct *tsk);
1959+ extern void show_trace_task(task_t *tsk);
1960 show_trace_task(p);
1961 }
1962 }
1963@@ -1194,7 +1257,7 @@
1964
1965 void show_state(void)
1966 {
1967- struct task_struct *p;
1968+ task_t *p;
1969
1970 #if (BITS_PER_LONG == 32)
1971 printk("\n"
1972@@ -1217,121 +1280,88 @@
1973 read_unlock(&tasklist_lock);
1974 }
1975
1976-/**
1977- * reparent_to_init() - Reparent the calling kernel thread to the init task.
1978- *
1979- * If a kernel thread is launched as a result of a system call, or if
1980- * it ever exits, it should generally reparent itself to init so that
1981- * it is correctly cleaned up on exit.
1982- *
1983- * The various task state such as scheduling policy and priority may have
1984- * been inherited fro a user process, so we reset them to sane values here.
1985- *
1986- * NOTE that reparent_to_init() gives the caller full capabilities.
1987- */
1988-void reparent_to_init(void)
1989+static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1990 {
1991- struct task_struct *this_task = current;
1992-
1993- write_lock_irq(&tasklist_lock);
1994-
1995- /* Reparent to init */
1996- REMOVE_LINKS(this_task);
1997- this_task->p_pptr = child_reaper;
1998- this_task->p_opptr = child_reaper;
1999- SET_LINKS(this_task);
2000-
2001- /* Set the exit signal to SIGCHLD so we signal init on exit */
2002- this_task->exit_signal = SIGCHLD;
2003-
2004- /* We also take the runqueue_lock while altering task fields
2005- * which affect scheduling decisions */
2006- spin_lock(&runqueue_lock);
2007-
2008- this_task->ptrace = 0;
2009- this_task->nice = DEF_NICE;
2010- this_task->policy = SCHED_OTHER;
2011- /* cpus_allowed? */
2012- /* rt_priority? */
2013- /* signals? */
2014- this_task->cap_effective = CAP_INIT_EFF_SET;
2015- this_task->cap_inheritable = CAP_INIT_INH_SET;
2016- this_task->cap_permitted = CAP_FULL_SET;
2017- this_task->keep_capabilities = 0;
2018- memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
2019- this_task->user = INIT_USER;
2020-
2021- spin_unlock(&runqueue_lock);
2022- write_unlock_irq(&tasklist_lock);
2023+ if (rq1 == rq2)
2024+ spin_lock(&rq1->lock);
2025+ else {
2026+ if (rq1 < rq2) {
2027+ spin_lock(&rq1->lock);
2028+ spin_lock(&rq2->lock);
2029+ } else {
2030+ spin_lock(&rq2->lock);
2031+ spin_lock(&rq1->lock);
2032+ }
2033+ }
2034 }
2035
2036-/*
2037- * Put all the gunge required to become a kernel thread without
2038- * attached user resources in one place where it belongs.
2039- */
2040-
2041-void daemonize(void)
2042+static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
2043 {
2044- struct fs_struct *fs;
2045-
2046-
2047- /*
2048- * If we were started as result of loading a module, close all of the
2049- * user space pages. We don't need them, and if we didn't close them
2050- * they would be locked into memory.
2051- */
2052- exit_mm(current);
2053-
2054- current->session = 1;
2055- current->pgrp = 1;
2056- current->tty = NULL;
2057-
2058- /* Become as one with the init task */
2059-
2060- exit_fs(current); /* current->fs->count--; */
2061- fs = init_task.fs;
2062- current->fs = fs;
2063- atomic_inc(&fs->count);
2064- exit_files(current);
2065- current->files = init_task.files;
2066- atomic_inc(&current->files->count);
2067+ spin_unlock(&rq1->lock);
2068+ if (rq1 != rq2)
2069+ spin_unlock(&rq2->lock);
2070 }
2071
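double_rq_lock() above avoids the classic ABBA deadlock by always taking the lower-addressed runqueue lock first when two different runqueues are involved. The same idea in a self-contained user-space form (pthread mutexes standing in for rq->lock; not kernel code):

    #include <pthread.h>
    #include <stdio.h>

    /* Lock two mutexes in a global order (by address) so that two threads
     * locking the same pair in opposite argument order cannot deadlock. */
    static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if (a == b) {
            pthread_mutex_lock(a);
            return;
        }
        if (a < b) {
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
        } else {
            pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        pthread_mutex_unlock(a);
        if (a != b)
            pthread_mutex_unlock(b);
    }

    int main(void)
    {
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        double_lock(&m1, &m2);      /* same order as double_lock(&m2, &m1) */
        double_unlock(&m1, &m2);
        puts("locked and unlocked both runqueues");
        return 0;
    }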
2072-extern unsigned long wait_init_idle;
2073-
2074-void __init init_idle(void)
2075+void __init init_idle(task_t *idle, int cpu)
2076 {
2077- struct schedule_data * sched_data;
2078- sched_data = &aligned_data[smp_processor_id()].schedule_data;
2079+ runqueue_t *idle_rq = cpu_rq(cpu), *rq = idle->array->rq;
2080+ unsigned long flags;
2081
2082- if (current != &init_task && task_on_runqueue(current)) {
2083- printk("UGH! (%d:%d) was on the runqueue, removing.\n",
2084- smp_processor_id(), current->pid);
2085- del_from_runqueue(current);
2086- }
2087- sched_data->curr = current;
2088- sched_data->last_schedule = get_cycles();
2089- clear_bit(current->processor, &wait_init_idle);
2090+ __save_flags(flags);
2091+ __cli();
2092+ double_rq_lock(idle_rq, rq);
2093+
2094+ idle_rq->curr = idle_rq->idle = idle;
2095+ deactivate_task(idle, rq);
2096+ idle->array = NULL;
2097+ idle->prio = MAX_PRIO;
2098+ idle->state = TASK_RUNNING;
2099+ idle->cpu = cpu;
2100+ double_rq_unlock(idle_rq, rq);
2101+ idle->need_resched = 1;
2102+ __restore_flags(flags);
2103 }
2104
2105-extern void init_timervecs (void);
2106+extern void init_timervecs(void);
2107+extern void timer_bh(void);
2108+extern void tqueue_bh(void);
2109+extern void immediate_bh(void);
2110
2111 void __init sched_init(void)
2112 {
2113+ runqueue_t *rq;
2114+ int i, j, k;
2115+
2116+ for (i = 0; i < NR_CPUS; i++) {
2117+ runqueue_t *rq = cpu_rq(i);
2118+ prio_array_t *array;
2119+
2120+ rq->active = rq->arrays + 0;
2121+ rq->expired = rq->arrays + 1;
2122+ spin_lock_init(&rq->lock);
2123+
2124+ for (j = 0; j < 2; j++) {
2125+ array = rq->arrays + j;
2126+ array->rq = rq;
2127+ array->lock = &rq->lock;
2128+ for (k = 0; k < MAX_PRIO; k++) {
2129+ INIT_LIST_HEAD(array->queue + k);
2130+ __clear_bit(k, array->bitmap);
2131+ }
2132+ // delimiter for bitsearch
2133+ __set_bit(MAX_PRIO, array->bitmap);
2134+ }
2135+ }
2136 /*
2137 * We have to do a little magic to get the first
2138 * process right in SMP mode.
2139 */
2140- int cpu = smp_processor_id();
2141- int nr;
2142-
2143- init_task.processor = cpu;
2144-
2145- for(nr = 0; nr < PIDHASH_SZ; nr++)
2146- pidhash[nr] = NULL;
2147+ rq = this_rq();
2148+ rq->curr = current;
2149+ rq->idle = current;
2150+ wake_up_process(current);
2151
2152 init_timervecs();
2153-
2154 init_bh(TIMER_BH, timer_bh);
2155 init_bh(TQUEUE_BH, tqueue_bh);
2156 init_bh(IMMEDIATE_BH, immediate_bh);
2157@@ -1340,5 +1370,5 @@
2158 * The boot idle thread does lazy MMU switching as well:
2159 */
2160 atomic_inc(&init_mm.mm_count);
2161- enter_lazy_tlb(&init_mm, current, cpu);
2162+ enter_lazy_tlb(&init_mm, current, smp_processor_id());
2163 }
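sched_init() above gives every runqueue two priority arrays, each with MAX_PRIO list heads plus a permanently set delimiter bit at MAX_PRIO so that bit searches always terminate. How the bitmap is then consulted to pick the next task, in miniature (toy types, 16 levels instead of 168, ffs() from <strings.h> standing in for sched_find_first_bit()):

    #include <stdio.h>
    #include <strings.h>

    #define NPRIO 16                        /* toy stand-in for MAX_PRIO */

    struct toy_array {
        unsigned int bitmap;                /* bit p set => queue[p] non-empty */
        const char *queue[NPRIO + 1];       /* queue head name, +1 delimiter slot */
    };

    static const char *pick_next(const struct toy_array *a)
    {
        int prio = ffs(a->bitmap) - 1;      /* index of first set bit */

        if (prio < 0 || prio >= NPRIO)      /* only the delimiter bit is set */
            return "idle";
        return a->queue[prio];
    }

    int main(void)
    {
        struct toy_array a = { 0 };

        a.bitmap |= 1u << NPRIO;            /* delimiter, as in sched_init() */
        printf("empty: %s\n", pick_next(&a));
        a.queue[5] = "interactive task";
        a.bitmap |= 1u << 5;
        printf("ready: %s\n", pick_next(&a));
        return 0;
    }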
2164--- linux/kernel/exit.c.orig Sun Jan 6 13:55:56 2002
2165+++ linux/kernel/exit.c Mon Jan 28 18:01:36 2002
2166@@ -27,49 +27,42 @@
2167
2168 static void release_task(struct task_struct * p)
2169 {
2170- if (p != current) {
2171+ unsigned long flags;
2172+
2173+ if (p == current)
2174+ BUG();
2175 #ifdef CONFIG_SMP
2176- /*
2177- * Wait to make sure the process isn't on the
2178- * runqueue (active on some other CPU still)
2179- */
2180- for (;;) {
2181- task_lock(p);
2182- if (!task_has_cpu(p))
2183- break;
2184- task_unlock(p);
2185- do {
2186- cpu_relax();
2187- barrier();
2188- } while (task_has_cpu(p));
2189- }
2190- task_unlock(p);
2191+ wait_task_inactive(p);
2192 #endif
2193- atomic_dec(&p->user->processes);
2194- free_uid(p->user);
2195- unhash_process(p);
2196-
2197- release_thread(p);
2198- current->cmin_flt += p->min_flt + p->cmin_flt;
2199- current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2200- current->cnswap += p->nswap + p->cnswap;
2201- /*
2202- * Potentially available timeslices are retrieved
2203- * here - this way the parent does not get penalized
2204- * for creating too many processes.
2205- *
2206- * (this cannot be used to artificially 'generate'
2207- * timeslices, because any timeslice recovered here
2208- * was given away by the parent in the first place.)
2209- */
2210- current->counter += p->counter;
2211- if (current->counter >= MAX_COUNTER)
2212- current->counter = MAX_COUNTER;
2213- p->pid = 0;
2214- free_task_struct(p);
2215- } else {
2216- printk("task releasing itself\n");
2217- }
2218+ atomic_dec(&p->user->processes);
2219+ free_uid(p->user);
2220+ unhash_process(p);
2221+
2222+ release_thread(p);
2223+ current->cmin_flt += p->min_flt + p->cmin_flt;
2224+ current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2225+ current->cnswap += p->nswap + p->cnswap;
2226+ /*
2227+ * Potentially available timeslices are retrieved
2228+ * here - this way the parent does not get penalized
2229+ * for creating too many processes.
2230+ *
2231+ * (this cannot be used to artificially 'generate'
2232+ * timeslices, because any timeslice recovered here
2233+ * was given away by the parent in the first place.)
2234+ */
2235+ __save_flags(flags);
2236+ __cli();
2237+ current->time_slice += p->time_slice;
2238+ if (current->time_slice > MAX_TIMESLICE)
2239+ current->time_slice = MAX_TIMESLICE;
2240+ if (p->sleep_avg < current->sleep_avg)
2241+ current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
2242+ p->sleep_avg) / (EXIT_WEIGHT + 1);
2243+ __restore_flags(flags);
2244+
2245+ p->pid = 0;
2246+ free_task_struct(p);
2247 }
2248
2249 /*
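The release_task() changes above hand the child's leftover timeslice back to the parent and pull the parent's sleep_avg towards the child's, weighted by EXIT_WEIGHT (3), so a parent that keeps spawning CPU hogs gradually loses its interactivity bonus. A worked example of that average (the numbers are made up for illustration):

    #include <stdio.h>

    #define EXIT_WEIGHT 3        /* same value the patch adds to sched.h */

    int main(void)
    {
        unsigned long parent_sleep_avg = 200;   /* fairly interactive parent */
        unsigned long child_sleep_avg = 40;     /* CPU-hog child being reaped */

        if (child_sleep_avg < parent_sleep_avg)
            parent_sleep_avg = (parent_sleep_avg * EXIT_WEIGHT +
                                child_sleep_avg) / (EXIT_WEIGHT + 1);
        /* (200*3 + 40) / 4 == 160 */
        printf("parent sleep_avg after reaping: %lu\n", parent_sleep_avg);
        return 0;
    }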
2250@@ -147,6 +140,79 @@
2251 }
2252 read_unlock(&tasklist_lock);
2253 return retval;
2254+}
2255+
2256+/**
2257+ * reparent_to_init() - Reparent the calling kernel thread to the init task.
2258+ *
2259+ * If a kernel thread is launched as a result of a system call, or if
2260+ * it ever exits, it should generally reparent itself to init so that
2261+ * it is correctly cleaned up on exit.
2262+ *
2263+ * The various task state fields such as scheduling policy and priority may have
2264+ * been inherited from a user process, so we reset them to sane values here.

2265+ *
2266+ * NOTE that reparent_to_init() gives the caller full capabilities.
2267+ */
2268+void reparent_to_init(void)
2269+{
2270+ write_lock_irq(&tasklist_lock);
2271+
2272+ /* Reparent to init */
2273+ REMOVE_LINKS(current);
2274+ current->p_pptr = child_reaper;
2275+ current->p_opptr = child_reaper;
2276+ SET_LINKS(current);
2277+
2278+ /* Set the exit signal to SIGCHLD so we signal init on exit */
2279+ current->exit_signal = SIGCHLD;
2280+
2281+ current->ptrace = 0;
2282+ if ((current->policy == SCHED_OTHER) && (current->__nice < DEF_USER_NICE))
2283+ set_user_nice(current, DEF_USER_NICE);
2284+ /* cpus_allowed? */
2285+ /* rt_priority? */
2286+ /* signals? */
2287+ current->cap_effective = CAP_INIT_EFF_SET;
2288+ current->cap_inheritable = CAP_INIT_INH_SET;
2289+ current->cap_permitted = CAP_FULL_SET;
2290+ current->keep_capabilities = 0;
2291+ memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
2292+ current->user = INIT_USER;
2293+
2294+ write_unlock_irq(&tasklist_lock);
2295+}
2296+
2297+/*
2298+ * Put all the gunge required to become a kernel thread without
2299+ * attached user resources in one place where it belongs.
2300+ */
2301+
2302+void daemonize(void)
2303+{
2304+ struct fs_struct *fs;
2305+
2306+
2307+ /*
2308+ * If we were started as result of loading a module, close all of the
2309+ * user space pages. We don't need them, and if we didn't close them
2310+ * they would be locked into memory.
2311+ */
2312+ exit_mm(current);
2313+
2314+ current->session = 1;
2315+ current->pgrp = 1;
2316+ current->tty = NULL;
2317+
2318+ /* Become as one with the init task */
2319+
2320+ exit_fs(current); /* current->fs->count--; */
2321+ fs = init_task.fs;
2322+ current->fs = fs;
2323+ atomic_inc(&fs->count);
2324+ exit_files(current);
2325+ current->files = init_task.files;
2326+ atomic_inc(&current->files->count);
2327 }
2328
2329 /*
2330--- linux/kernel/capability.c.orig Sat Jun 24 06:06:37 2000
2331+++ linux/kernel/capability.c Sun Jan 6 13:56:25 2002
2332@@ -8,6 +8,8 @@
2333 #include <linux/mm.h>
2334 #include <asm/uaccess.h>
2335
2336+unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
2337+
2338 kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
2339
2340 /* Note: never hold tasklist_lock while spinning for this one */
2341--- linux/kernel/timer.c.orig Sun Jan 6 13:55:49 2002
2342+++ linux/kernel/timer.c Mon Jan 21 12:53:05 2002
2343@@ -25,6 +25,8 @@
2344
2345 #include <asm/uaccess.h>
2346
2347+struct kernel_stat kstat;
2348+
2349 /*
2350 * Timekeeping variables
2351 */
2352@@ -583,17 +585,16 @@
2353
2354 update_one_process(p, user_tick, system, cpu);
2355 if (p->pid) {
2356- if (--p->counter <= 0) {
2357- p->counter = 0;
2358- p->need_resched = 1;
2359- }
2360- if (p->nice > 0)
2361+ if (p->__nice > 0)
2362 kstat.per_cpu_nice[cpu] += user_tick;
2363 else
2364 kstat.per_cpu_user[cpu] += user_tick;
2365 kstat.per_cpu_system[cpu] += system;
2366- } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
2367- kstat.per_cpu_system[cpu] += system;
2368+ } else {
2369+ if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
2370+ kstat.per_cpu_system[cpu] += system;
2371+ }
2372+ scheduler_tick(p);
2373 }
2374
2375 /*
2376@@ -794,6 +795,89 @@
2377
2378 #endif
2379
2380+static void process_timeout(unsigned long __data)
2381+{
2382+ wake_up_process((task_t *)__data);
2383+}
2384+
2385+/**
2386+ * schedule_timeout - sleep until timeout
2387+ * @timeout: timeout value in jiffies
2388+ *
2389+ * Make the current task sleep until @timeout jiffies have
2390+ * elapsed. The routine will return immediately unless
2391+ * the current task state has been set (see set_current_state()).
2392+ *
2393+ * You can set the task state as follows -
2394+ *
2395+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
2396+ * pass before the routine returns. The routine will return 0
2397+ *
2398+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2399+ * delivered to the current task. In this case the remaining time
2400+ * in jiffies will be returned, or 0 if the timer expired in time
2401+ *
2402+ * The current task state is guaranteed to be TASK_RUNNING when this
2403+ * routine returns.
2404+ *
2405+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
2406+ * the CPU away without a bound on the timeout. In this case the return
2407+ * value will be %MAX_SCHEDULE_TIMEOUT.
2408+ *
2409+ * In all cases the return value is guaranteed to be non-negative.
2410+ */
2411+signed long schedule_timeout(signed long timeout)
2412+{
2413+ struct timer_list timer;
2414+ unsigned long expire;
2415+
2416+ switch (timeout)
2417+ {
2418+ case MAX_SCHEDULE_TIMEOUT:
2419+ /*
2420+ * These two special cases are useful to be comfortable
2421+ * in the caller. Nothing more. We could take
2422+ * MAX_SCHEDULE_TIMEOUT from one of the negative values,
2423+ * but I'd like to return a valid offset (>=0) to allow
2424+ * the caller to do everything it wants with the retval.
2425+ */
2426+ schedule();
2427+ goto out;
2428+ default:
2429+ /*
2430+ * Another bit of PARANOID. Note that the retval will be
2431+ * 0 since no piece of kernel is supposed to do a check
2432+ * for a negative retval of schedule_timeout() (since it
2433+ * should never happen anyway). You just have the printk()
2434+ * that will tell you if something has gone wrong and where.
2435+ */
2436+ if (timeout < 0)
2437+ {
2438+ printk(KERN_ERR "schedule_timeout: wrong timeout "
2439+ "value %lx from %p\n", timeout,
2440+ __builtin_return_address(0));
2441+ current->state = TASK_RUNNING;
2442+ goto out;
2443+ }
2444+ }
2445+
2446+ expire = timeout + jiffies;
2447+
2448+ init_timer(&timer);
2449+ timer.expires = expire;
2450+ timer.data = (unsigned long) current;
2451+ timer.function = process_timeout;
2452+
2453+ add_timer(&timer);
2454+ schedule();
2455+ del_timer_sync(&timer);
2456+
2457+ timeout = expire - jiffies;
2458+
2459+ out:
2460+ return timeout < 0 ? 0 : timeout;
2461+}
2462+
2463 /* Thread ID - the internal kernel "pid" */
2464 asmlinkage long sys_gettid(void)
2465 {
2466@@ -840,4 +924,3 @@
2467 }
2468 return 0;
2469 }
2470-
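The kernel-doc added above spells out the calling convention for the relocated schedule_timeout(): the caller sets the task state first, then sleeps. A minimal sketch of that pattern (kernel context, so not a standalone program; example_nap is a made-up name):

    /* Sleep for roughly 100 ms, or until a signal arrives. */
    static signed long example_nap(void)
    {
        signed long left;

        set_current_state(TASK_INTERRUPTIBLE);
        left = schedule_timeout(HZ / 10);

        /* left == 0: the full timeout elapsed.
         * left  > 0: a signal woke us early; that many jiffies remained. */
        return left;
    }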
2471--- linux/kernel/fork.c.orig Sun Jan 6 13:55:56 2002
2472+++ linux/kernel/fork.c Thu Jan 24 13:45:09 2002
2473@@ -28,7 +28,6 @@
2474
2475 /* The idle threads do not count.. */
2476 int nr_threads;
2477-int nr_running;
2478
2479 int max_threads;
2480 unsigned long total_forks; /* Handle normal Linux uptimes. */
2481@@ -36,6 +35,8 @@
2482
2483 struct task_struct *pidhash[PIDHASH_SZ];
2484
2485+rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
2486+
2487 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
2488 {
2489 unsigned long flags;
2490@@ -563,6 +564,7 @@
2491 struct pt_regs *regs, unsigned long stack_size)
2492 {
2493 int retval;
2494+ unsigned long flags;
2495 struct task_struct *p;
2496 struct completion vfork;
2497
2498@@ -611,8 +613,7 @@
2499 copy_flags(clone_flags, p);
2500 p->pid = get_pid(clone_flags);
2501
2502- p->run_list.next = NULL;
2503- p->run_list.prev = NULL;
2504+ INIT_LIST_HEAD(&p->run_list);
2505
2506 p->p_cptr = NULL;
2507 init_waitqueue_head(&p->wait_chldexit);
2508@@ -638,14 +639,15 @@
2509 #ifdef CONFIG_SMP
2510 {
2511 int i;
2512- p->cpus_runnable = ~0UL;
2513- p->processor = current->processor;
2514+
2515 /* ?? should we just memset this ?? */
2516 for(i = 0; i < smp_num_cpus; i++)
2517- p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
2518+ p->per_cpu_utime[cpu_logical_map(i)] =
2519+ p->per_cpu_stime[cpu_logical_map(i)] = 0;
2520 spin_lock_init(&p->sigmask_lock);
2521 }
2522 #endif
2523+ p->array = NULL;
2524 p->lock_depth = -1; /* -1 = no lock */
2525 p->start_time = jiffies;
2526
2527@@ -677,15 +679,27 @@
2528 p->pdeath_signal = 0;
2529
2530 /*
2531- * "share" dynamic priority between parent and child, thus the
2532- * total amount of dynamic priorities in the system doesnt change,
2533- * more scheduling fairness. This is only important in the first
2534- * timeslice, on the long run the scheduling behaviour is unchanged.
2535+ * Share the timeslice between parent and child, thus the
2536+ * total amount of pending timeslices in the system doesn't change,
2537+ * resulting in more scheduling fairness.
2538 */
2539- p->counter = (current->counter + 1) >> 1;
2540- current->counter >>= 1;
2541- if (!current->counter)
2542- current->need_resched = 1;
2543+ __save_flags(flags);
2544+ __cli();
2545+ if (!current->time_slice)
2546+ BUG();
2547+ p->time_slice = (current->time_slice + 1) >> 1;
2548+ current->time_slice >>= 1;
2549+ if (!current->time_slice) {
2550+ /*
2551+ * This case is rare, it happens when the parent has only
2552+ * a single jiffy left from its timeslice. Taking the
2553+ * runqueue lock is not a problem.
2554+ */
2555+ current->time_slice = 1;
2556+ scheduler_tick(current);
2557+ }
2558+ p->sleep_timestamp = jiffies;
2559+ __restore_flags(flags);
2560
2561 /*
2562 * Ok, add it to the run-queues and make it
2563@@ -722,10 +736,23 @@
2564 if (p->ptrace & PT_PTRACED)
2565 send_sig(SIGSTOP, p, 1);
2566
2567+#define RUN_CHILD_FIRST 1
2568+#if RUN_CHILD_FIRST
2569+ wake_up_forked_process(p); /* do this last */
2570+#else
2571 wake_up_process(p); /* do this last */
2572+#endif
2573 ++total_forks;
2574 if (clone_flags & CLONE_VFORK)
2575 wait_for_completion(&vfork);
2576+#if RUN_CHILD_FIRST
2577+ else
2578+ /*
2579+ * Let the child process run first, to avoid most of the
2580+ * COW overhead when the child exec()s afterwards.
2581+ */
2582+ current->need_resched = 1;
2583+#endif
2584
2585 fork_out:
2586 return retval;
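The do_fork() change above splits the parent's remaining timeslice with the child instead of handing out a fresh one, so fork loops cannot mint CPU time; when the split leaves the parent with nothing, it keeps one jiffy and immediately takes a scheduler_tick(). The arithmetic, on made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned int parent = 11, child;    /* jiffies left, odd on purpose */

        child = (parent + 1) >> 1;          /* child gets the larger half: 6 */
        parent >>= 1;                       /* parent keeps 5 */
        printf("child %u + parent %u = 11, no timeslice created\n",
               child, parent);
        return 0;
    }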
2587--- linux/kernel/softirq.c.orig Sun Jan 6 13:55:53 2002
2588+++ linux/kernel/softirq.c Wed Jan 16 00:52:11 2002
2589@@ -259,10 +259,9 @@
2590
2591 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
2592 current->state = TASK_RUNNING;
2593- do {
2594- current->policy |= SCHED_YIELD;
2595- schedule();
2596- } while (test_bit(TASKLET_STATE_SCHED, &t->state));
2597+ do
2598+ sys_sched_yield();
2599+ while (test_bit(TASKLET_STATE_SCHED, &t->state));
2600 }
2601 tasklet_unlock_wait(t);
2602 clear_bit(TASKLET_STATE_SCHED, &t->state);
2603@@ -365,13 +364,13 @@
2604 int cpu = cpu_logical_map(bind_cpu);
2605
2606 daemonize();
2607- current->nice = 19;
2608+ set_user_nice(current, 19);
2609 sigfillset(&current->blocked);
2610
2611 /* Migrate to the right CPU */
2612- current->cpus_allowed = 1UL << cpu;
2613- while (smp_processor_id() != cpu)
2614- schedule();
2615+ set_cpus_allowed(current, 1UL << cpu);
2616+ if (cpu() != cpu)
2617+ BUG();
2618
2619 sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
2620
2621@@ -396,7 +395,7 @@
2622 }
2623 }
2624
2625-static __init int spawn_ksoftirqd(void)
2626+__init int spawn_ksoftirqd(void)
2627 {
2628 int cpu;
2629
2630@@ -405,14 +404,12 @@
2631 CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
2632 printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
2633 else {
2634- while (!ksoftirqd_task(cpu_logical_map(cpu))) {
2635- current->policy |= SCHED_YIELD;
2636- schedule();
2637- }
2638+ while (!ksoftirqd_task(cpu_logical_map(cpu)))
2639+ sys_sched_yield();
2640 }
2641 }
2642
2643 return 0;
2644 }
2645
2646-__initcall(spawn_ksoftirqd);
2647+__initcall(spawn_ksoftirqd);
2648--- linux/kernel/ptrace.c.orig Sun Jan 6 13:55:57 2002
2649+++ linux/kernel/ptrace.c Sun Jan 6 13:56:25 2002
2650@@ -31,20 +31,7 @@
2651 if (child->state != TASK_STOPPED)
2652 return -ESRCH;
2653 #ifdef CONFIG_SMP
2654- /* Make sure the child gets off its CPU.. */
2655- for (;;) {
2656- task_lock(child);
2657- if (!task_has_cpu(child))
2658- break;
2659- task_unlock(child);
2660- do {
2661- if (child->state != TASK_STOPPED)
2662- return -ESRCH;
2663- barrier();
2664- cpu_relax();
2665- } while (task_has_cpu(child));
2666- }
2667- task_unlock(child);
2668+ wait_task_inactive(child);
2669 #endif
2670 }
2671
2672--- linux/kernel/sys.c.orig Sun Jan 6 13:55:47 2002
2673+++ linux/kernel/sys.c Sun Jan 6 13:56:25 2002
2674@@ -220,10 +220,10 @@
2675 }
2676 if (error == -ESRCH)
2677 error = 0;
2678- if (niceval < p->nice && !capable(CAP_SYS_NICE))
2679+ if (niceval < p->__nice && !capable(CAP_SYS_NICE))
2680 error = -EACCES;
2681 else
2682- p->nice = niceval;
2683+ set_user_nice(p, niceval);
2684 }
2685 read_unlock(&tasklist_lock);
2686
2687@@ -249,7 +249,7 @@
2688 long niceval;
2689 if (!proc_sel(p, which, who))
2690 continue;
2691- niceval = 20 - p->nice;
2692+ niceval = 20 - p->__nice;
2693 if (niceval > retval)
2694 retval = niceval;
2695 }
2696--- linux/kernel/signal.c.orig Sun Jan 6 13:55:56 2002
2697+++ linux/kernel/signal.c Sun Jan 6 13:56:25 2002
2698@@ -478,12 +478,9 @@
2699 * process of changing - but no harm is done by that
2700 * other than doing an extra (lightweight) IPI interrupt.
2701 */
2702- spin_lock(&runqueue_lock);
2703- if (task_has_cpu(t) && t->processor != smp_processor_id())
2704- smp_send_reschedule(t->processor);
2705- spin_unlock(&runqueue_lock);
2706-#endif /* CONFIG_SMP */
2707-
2708+ if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
2709+ kick_if_running(t);
2710+#endif
2711 if (t->state & TASK_INTERRUPTIBLE) {
2712 wake_up_process(t);
2713 return;
2714--- linux/kernel/printk.c.orig Sun Jan 6 13:55:57 2002
2715+++ linux/kernel/printk.c Sun Jan 6 13:56:25 2002
2716@@ -25,6 +25,7 @@
2717 #include <linux/module.h>
2718 #include <linux/interrupt.h> /* For in_interrupt() */
2719 #include <linux/config.h>
2720+#include <linux/delay.h>
2721
2722 #include <asm/uaccess.h>
2723
2724--- linux/kernel/ksyms.c.orig Sun Jan 6 13:55:57 2002
2725+++ linux/kernel/ksyms.c Thu Jan 10 22:55:43 2002
2726@@ -437,6 +437,9 @@
2727 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2728 EXPORT_SYMBOL(schedule);
2729 EXPORT_SYMBOL(schedule_timeout);
2730+EXPORT_SYMBOL(sys_sched_yield);
2731+EXPORT_SYMBOL(set_user_nice);
2732+EXPORT_SYMBOL(set_cpus_allowed);
2733 EXPORT_SYMBOL(jiffies);
2734 EXPORT_SYMBOL(xtime);
2735 EXPORT_SYMBOL(do_gettimeofday);
2736@@ -448,6 +451,7 @@
2737
2738 EXPORT_SYMBOL(kstat);
2739 EXPORT_SYMBOL(nr_running);
2740+EXPORT_SYMBOL(nr_context_switches);
2741
2742 /* misc */
2743 EXPORT_SYMBOL(panic);
2744--- linux/mm/oom_kill.c.orig Sun Jan 6 13:55:53 2002
2745+++ linux/mm/oom_kill.c Sun Jan 6 13:56:25 2002
2746@@ -82,7 +82,7 @@
2747 * Niced processes are most likely less important, so double
2748 * their badness points.
2749 */
2750- if (p->nice > 0)
2751+ if (p->__nice > 0)
2752 points *= 2;
2753
2754 /*
2755@@ -149,7 +149,7 @@
2756 * all the memory it needs. That way it should be able to
2757 * exit() and clear out its resources quickly...
2758 */
2759- p->counter = 5 * HZ;
2760+ p->time_slice = 2 * MAX_TIMESLICE;
2761 p->flags |= PF_MEMALLOC | PF_MEMDIE;
2762
2763 /* This process has hardware access, be more careful. */
2764@@ -188,8 +188,7 @@
2765 * killing itself before someone else gets the chance to ask
2766 * for more memory.
2767 */
2768- current->policy |= SCHED_YIELD;
2769- schedule();
2770+ yield();
2771 return;
2772 }
2773
2774--- linux/mm/page_alloc.c.orig Sun Jan 6 13:55:56 2002
2775+++ linux/mm/page_alloc.c Fri Jan 25 14:26:36 2002
2776@@ -394,9 +394,8 @@
2777 return NULL;
2778
2779 /* Yield for kswapd, and try again */
2780- current->policy |= SCHED_YIELD;
2781 __set_current_state(TASK_RUNNING);
2782- schedule();
2783+ yield();
2784 goto rebalance;
2785 }
2786
2787--- linux/mm/highmem.c.orig Sun Jan 6 13:55:57 2002
2788+++ linux/mm/highmem.c Fri Jan 25 14:26:56 2002
2789@@ -354,9 +354,8 @@
2790 /* we need to wait I/O completion */
2791 run_task_queue(&tq_disk);
2792
2793- current->policy |= SCHED_YIELD;
2794 __set_current_state(TASK_RUNNING);
2795- schedule();
2796+ yield();
2797 goto repeat_alloc;
2798 }
2799
2800@@ -392,9 +391,8 @@
2801 /* we need to wait I/O completion */
2802 run_task_queue(&tq_disk);
2803
2804- current->policy |= SCHED_YIELD;
2805 __set_current_state(TASK_RUNNING);
2806- schedule();
2807+ yield();
2808 goto repeat_alloc;
2809 }
2810
2811--- linux/include/linux/sched.h.orig Sun Jan 6 13:55:57 2002
2812+++ linux/include/linux/sched.h Mon Jan 28 18:48:01 2002
2813@@ -6,6 +6,7 @@
2814 extern unsigned long event;
2815
2816 #include <linux/config.h>
2817+#include <linux/compiler.h>
2818 #include <linux/binfmts.h>
2819 #include <linux/threads.h>
2820 #include <linux/kernel.h>
2821@@ -42,6 +43,7 @@
2822 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
2823 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
2824 #define CLONE_THREAD 0x00010000 /* Same thread group? */
2825+#define CLONE_NEWNS 0x00020000 /* New namespace group? */
2826
2827 #define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD)
2828
2829@@ -72,8 +74,9 @@
2830 #define CT_TO_SECS(x) ((x) / HZ)
2831 #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
2832
2833-extern int nr_running, nr_threads;
2834+extern int nr_threads;
2835 extern int last_pid;
2836+extern unsigned long nr_running(void);
2837
2838 #include <linux/fs.h>
2839 #include <linux/time.h>
2840@@ -116,12 +119,6 @@
2841 #define SCHED_FIFO 1
2842 #define SCHED_RR 2
2843
2844-/*
2845- * This is an additional bit set when we want to
2846- * yield the CPU for one re-schedule..
2847- */
2848-#define SCHED_YIELD 0x10
2849-
2850 struct sched_param {
2851 int sched_priority;
2852 };
2853@@ -139,17 +136,22 @@
2854 * a separate lock).
2855 */
2856 extern rwlock_t tasklist_lock;
2857-extern spinlock_t runqueue_lock;
2858 extern spinlock_t mmlist_lock;
2859
2860+typedef struct task_struct task_t;
2861+
2862 extern void sched_init(void);
2863-extern void init_idle(void);
2864+extern void init_idle(task_t *idle, int cpu);
2865 extern void show_state(void);
2866 extern void cpu_init (void);
2867 extern void trap_init(void);
2868 extern void update_process_times(int user);
2869-extern void update_one_process(struct task_struct *p, unsigned long user,
2870+extern void update_one_process(task_t *p, unsigned long user,
2871 unsigned long system, int cpu);
2872+extern void scheduler_tick(task_t *p);
2873+extern void sched_task_migrated(task_t *p);
2874+extern void smp_migrate_task(int cpu, task_t *task);
2875+extern unsigned long cache_decay_ticks;
2876
2877 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
2878 extern signed long FASTCALL(schedule_timeout(signed long timeout));
2879@@ -166,6 +168,7 @@
2880 */
2881 #define NR_OPEN_DEFAULT BITS_PER_LONG
2882
2883+struct namespace;
2884 /*
2885 * Open file table structure
2886 */
2887@@ -278,6 +281,8 @@
2888 extern struct user_struct root_user;
2889 #define INIT_USER (&root_user)
2890
2891+typedef struct prio_array prio_array_t;
2892+
2893 struct task_struct {
2894 /*
2895 * offsets of these are hardcoded elsewhere - touch with care
2896@@ -295,35 +300,28 @@
2897
2898 int lock_depth; /* Lock depth */
2899
2900-/*
2901- * offset 32 begins here on 32-bit platforms. We keep
2902- * all fields in a single cacheline that are needed for
2903- * the goodness() loop in schedule().
2904- */
2905- long counter;
2906- long nice;
2907- unsigned long policy;
2908- struct mm_struct *mm;
2909- int processor;
2910 /*
2911- * cpus_runnable is ~0 if the process is not running on any
2912- * CPU. It's (1 << cpu) if it's running on a CPU. This mask
2913- * is updated under the runqueue lock.
2914- *
2915- * To determine whether a process might run on a CPU, this
2916- * mask is AND-ed with cpus_allowed.
2917+ * offset 32 begins here on 32-bit platforms.
2918 */
2919- unsigned long cpus_runnable, cpus_allowed;
2920- /*
2921- * (only the 'next' pointer fits into the cacheline, but
2922- * that's just fine.)
2923- */
2924- struct list_head run_list;
2925- unsigned long sleep_time;
2926+ unsigned int cpu;
2927+ int prio;
2928+ long __nice;
2929+ list_t run_list;
2930+ prio_array_t *array;
2931+
2932+ unsigned int time_slice;
2933
2934- struct task_struct *next_task, *prev_task;
2935- struct mm_struct *active_mm;
2936+ unsigned long sleep_avg;
2937+ unsigned long sleep_timestamp;
2938+
2939+ unsigned long policy;
2940+ unsigned long cpus_allowed;
2941+
2942+ task_t *next_task, *prev_task;
2943+
2944+ struct mm_struct *mm, *active_mm;
2945 struct list_head local_pages;
2946+
2947 unsigned int allocation_order, nr_local_pages;
2948
2949 /* task state */
2950@@ -345,12 +343,12 @@
2951 * older sibling, respectively. (p->father can be replaced with
2952 * p->p_pptr->pid)
2953 */
2954- struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
2955+ task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
2956 struct list_head thread_group;
2957
2958 /* PID hash table linkage. */
2959- struct task_struct *pidhash_next;
2960- struct task_struct **pidhash_pprev;
2961+ task_t *pidhash_next;
2962+ task_t **pidhash_pprev;
2963
2964 wait_queue_head_t wait_chldexit; /* for wait4() */
2965 struct completion *vfork_done; /* for vfork() */
2966@@ -389,6 +387,8 @@
2967 struct fs_struct *fs;
2968 /* open file information */
2969 struct files_struct *files;
2970+/* namespace */
2971+ struct namespace *namespace;
2972 /* signal handlers */
2973 spinlock_t sigmask_lock; /* Protects signal and blocked */
2974 struct signal_struct *sig;
2975@@ -446,10 +446,66 @@
2976 */
2977 #define _STK_LIM (8*1024*1024)
2978
2979-#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */
2980-#define MAX_COUNTER (20*HZ/100)
2981-#define DEF_NICE (0)
2982+/*
2983+ * RT priorities go from 0 to 99, but internally we max
2984+ * them out at 128 to make it easier to search the
2985+ * scheduler bitmap.
2986+ */
2987+#define MAX_RT_PRIO 128
2988+/*
2989+ * The lower the priority of a process, the more likely it is
2990+ * to run. Priority of a process goes from 0 to 167. The 0-99
2991+ * priority range is allocated to RT tasks, the 128-167 range
2992+ * is for SCHED_OTHER tasks.
2993+ */
2994+#define MAX_PRIO (MAX_RT_PRIO + 40)
2995+
2996+/*
2997+ * Scales user-nice values [ -20 ... 0 ... 19 ]
2998+ * to static priority [ 128 ... 167 (MAX_PRIO-1) ]
2999+ *
3000+ * User-nice value of -20 == static priority 128, and
3001+ * user-nice value 19 == static priority 167. The lower
3002+ * the priority value, the higher the task's priority.
3003+ */
3004+#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
3005+#define DEF_USER_NICE 0
3006+
3007+/*
3008+ * Minimum timeslice is 10 msecs, maximum is 300 msecs;
3009+ * a default (nice 0) task gets roughly 150 msecs.
3010+ */
3011+#define MIN_TIMESLICE ( 10 * HZ / 1000)
3012+#define MAX_TIMESLICE (300 * HZ / 1000)
3013+#define CHILD_FORK_PENALTY 95
3014+#define PARENT_FORK_PENALTY 100
3015+#define EXIT_WEIGHT 3
3016+#define PRIO_INTERACTIVE_RATIO 20
3017+#define PRIO_CPU_HOG_RATIO 60
3018+#define PRIO_BONUS_RATIO 70
3019+#define INTERACTIVE_DELTA 3
3020+#define MAX_SLEEP_AVG (2*HZ)
3021+#define STARVATION_LIMIT (2*HZ)
3022+
3023+#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
3024+#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
3025+
3026+/*
3027+ * NICE_TO_TIMESLICE scales nice values [ -20 ... 19 ]
3028+ * to time slice values.
3029+ *
3030+ * The higher a process's priority, the bigger timeslices
3031+ * it gets during one round of execution. But even the lowest
3032+ * priority process gets MIN_TIMESLICE worth of execution time.
3033+ */
3034
3035+#define NICE_TO_TIMESLICE(n) (MIN_TIMESLICE + \
3036+ ((MAX_TIMESLICE - MIN_TIMESLICE) * (19-(n))) / 39)
3037+
3038+extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
3039+extern void set_user_nice(task_t *p, long nice);
3040+asmlinkage long sys_sched_yield(void);
3041+#define yield() sys_sched_yield()
3042
3043 /*
3044 * The default (Linux) execution domain.
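The macros added in the hunk above replace the old counter recalculation: static priority comes straight from the nice value, and the timeslice is interpolated linearly between MIN_TIMESLICE and MAX_TIMESLICE. Worked values, assuming HZ == 100 (so the two constants are 1 and 30 jiffies):

    #include <stdio.h>

    #define HZ 100                          /* assumption for this example */
    #define MAX_RT_PRIO 128
    #define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
    #define MIN_TIMESLICE ( 10 * HZ / 1000)
    #define MAX_TIMESLICE (300 * HZ / 1000)
    #define NICE_TO_TIMESLICE(n) (MIN_TIMESLICE + \
            ((MAX_TIMESLICE - MIN_TIMESLICE) * (19-(n))) / 39)

    int main(void)
    {
        int nice;

        for (nice = -20; nice <= 19; nice += 13)
            printf("nice %3d -> prio %3d, timeslice %2d jiffies (%d ms)\n",
                   nice, NICE_TO_PRIO(nice), NICE_TO_TIMESLICE(nice),
                   NICE_TO_TIMESLICE(nice) * 1000 / HZ);
        return 0;
    }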
3045@@ -468,14 +524,13 @@
3046 addr_limit: KERNEL_DS, \
3047 exec_domain: &default_exec_domain, \
3048 lock_depth: -1, \
3049- counter: DEF_COUNTER, \
3050- nice: DEF_NICE, \
3051+ __nice: DEF_USER_NICE, \
3052 policy: SCHED_OTHER, \
3053+ cpus_allowed: -1, \
3054 mm: NULL, \
3055 active_mm: &init_mm, \
3056- cpus_runnable: -1, \
3057- cpus_allowed: -1, \
3058 run_list: LIST_HEAD_INIT(tsk.run_list), \
3059+ time_slice: HZ, \
3060 next_task: &tsk, \
3061 prev_task: &tsk, \
3062 p_opptr: &tsk, \
3063@@ -509,24 +564,24 @@
3064 #endif
3065
3066 union task_union {
3067- struct task_struct task;
3068+ task_t task;
3069 unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
3070 };
3071
3072 extern union task_union init_task_union;
3073
3074 extern struct mm_struct init_mm;
3075-extern struct task_struct *init_tasks[NR_CPUS];
3076+extern task_t *init_tasks[NR_CPUS];
3077
3078 /* PID hashing. (shouldnt this be dynamic?) */
3079 #define PIDHASH_SZ (4096 >> 2)
3080-extern struct task_struct *pidhash[PIDHASH_SZ];
3081+extern task_t *pidhash[PIDHASH_SZ];
3082
3083 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
3084
3085-static inline void hash_pid(struct task_struct *p)
3086+static inline void hash_pid(task_t *p)
3087 {
3088- struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
3089+ task_t **htable = &pidhash[pid_hashfn(p->pid)];
3090
3091 if((p->pidhash_next = *htable) != NULL)
3092 (*htable)->pidhash_pprev = &p->pidhash_next;
3093@@ -534,16 +589,16 @@
3094 p->pidhash_pprev = htable;
3095 }
3096
3097-static inline void unhash_pid(struct task_struct *p)
3098+static inline void unhash_pid(task_t *p)
3099 {
3100 if(p->pidhash_next)
3101 p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
3102 *p->pidhash_pprev = p->pidhash_next;
3103 }
3104
3105-static inline struct task_struct *find_task_by_pid(int pid)
3106+static inline task_t *find_task_by_pid(int pid)
3107 {
3108- struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
3109+ task_t *p, **htable = &pidhash[pid_hashfn(pid)];
3110
3111 for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
3112 ;
3113@@ -551,19 +606,6 @@
3114 return p;
3115 }
3116
3117-#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
3118-
3119-static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
3120-{
3121- tsk->processor = cpu;
3122- tsk->cpus_runnable = 1UL << cpu;
3123-}
3124-
3125-static inline void task_release_cpu(struct task_struct *tsk)
3126-{
3127- tsk->cpus_runnable = ~0UL;
3128-}
3129-
3130 /* per-UID process charging. */
3131 extern struct user_struct * alloc_uid(uid_t);
3132 extern void free_uid(struct user_struct *);
3133@@ -590,7 +632,8 @@
3134 extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
3135 extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
3136 signed long timeout));
3137-extern int FASTCALL(wake_up_process(struct task_struct * tsk));
3138+extern int FASTCALL(wake_up_process(task_t * tsk));
3139+extern void FASTCALL(wake_up_forked_process(task_t * tsk));
3140
3141 #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
3142 #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
3143@@ -608,28 +651,28 @@
3144 extern int in_egroup_p(gid_t);
3145
3146 extern void proc_caches_init(void);
3147-extern void flush_signals(struct task_struct *);
3148-extern void flush_signal_handlers(struct task_struct *);
3149+extern void flush_signals(task_t *);
3150+extern void flush_signal_handlers(task_t *);
3151 extern int dequeue_signal(sigset_t *, siginfo_t *);
3152 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
3153 sigset_t *mask);
3154 extern void unblock_all_signals(void);
3155-extern int send_sig_info(int, struct siginfo *, struct task_struct *);
3156-extern int force_sig_info(int, struct siginfo *, struct task_struct *);
3157+extern int send_sig_info(int, struct siginfo *, task_t *);
3158+extern int force_sig_info(int, struct siginfo *, task_t *);
3159 extern int kill_pg_info(int, struct siginfo *, pid_t);
3160 extern int kill_sl_info(int, struct siginfo *, pid_t);
3161 extern int kill_proc_info(int, struct siginfo *, pid_t);
3162-extern void notify_parent(struct task_struct *, int);
3163-extern void do_notify_parent(struct task_struct *, int);
3164-extern void force_sig(int, struct task_struct *);
3165-extern int send_sig(int, struct task_struct *, int);
3166+extern void notify_parent(task_t *, int);
3167+extern void do_notify_parent(task_t *, int);
3168+extern void force_sig(int, task_t *);
3169+extern int send_sig(int, task_t *, int);
3170 extern int kill_pg(pid_t, int, int);
3171 extern int kill_sl(pid_t, int, int);
3172 extern int kill_proc(pid_t, int, int);
3173 extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
3174 extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
3175
3176-static inline int signal_pending(struct task_struct *p)
3177+static inline int signal_pending(task_t *p)
3178 {
3179 return (p->sigpending != 0);
3180 }
3181@@ -668,7 +711,7 @@
3182 This is required every time the blocked sigset_t changes.
3183 All callers should have t->sigmask_lock. */
3184
3185-static inline void recalc_sigpending(struct task_struct *t)
3186+static inline void recalc_sigpending(task_t *t)
3187 {
3188 t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
3189 }
3190@@ -775,16 +818,17 @@
3191 extern int expand_fdset(struct files_struct *, int nr);
3192 extern void free_fdset(fd_set *, int);
3193
3194-extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
3195+extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *);
3196 extern void flush_thread(void);
3197 extern void exit_thread(void);
3198
3199-extern void exit_mm(struct task_struct *);
3200-extern void exit_files(struct task_struct *);
3201-extern void exit_sighand(struct task_struct *);
3202+extern void exit_mm(task_t *);
3203+extern void exit_files(task_t *);
3204+extern void exit_sighand(task_t *);
3205
3206 extern void reparent_to_init(void);
3207 extern void daemonize(void);
3208+extern task_t *child_reaper;
3209
3210 extern int do_execve(char *, char **, char **, struct pt_regs *);
3211 extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
3212@@ -793,6 +837,9 @@
3213 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
3214 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
3215
3216+extern void wait_task_inactive(task_t * p);
3217+extern void kick_if_running(task_t * p);
3218+
3219 #define __wait_event(wq, condition) \
3220 do { \
3221 wait_queue_t __wait; \
3222@@ -871,24 +918,10 @@
3223 for (p = &init_task ; (p = p->next_task) != &init_task ; )
3224
3225 #define next_thread(p) \
3226- list_entry((p)->thread_group.next, struct task_struct, thread_group)
3227-
3228-static inline void del_from_runqueue(struct task_struct * p)
3229-{
3230- nr_running--;
3231- p->sleep_time = jiffies;
3232- list_del(&p->run_list);
3233- p->run_list.next = NULL;
3234-}
3235-
3236-static inline int task_on_runqueue(struct task_struct *p)
3237-{
3238- return (p->run_list.next != NULL);
3239-}
3240+ list_entry((p)->thread_group.next, task_t, thread_group)
3241
3242-static inline void unhash_process(struct task_struct *p)
3243+static inline void unhash_process(task_t *p)
3244 {
3245- if (task_on_runqueue(p)) BUG();
3246 write_lock_irq(&tasklist_lock);
3247 nr_threads--;
3248 unhash_pid(p);
3249@@ -898,12 +931,12 @@
3250 }
3251
3252 /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
3253-static inline void task_lock(struct task_struct *p)
3254+static inline void task_lock(task_t *p)
3255 {
3256 spin_lock(&p->alloc_lock);
3257 }
3258
3259-static inline void task_unlock(struct task_struct *p)
3260+static inline void task_unlock(task_t *p)
3261 {
3262 spin_unlock(&p->alloc_lock);
3263 }
3264--- linux/include/linux/list.h.orig Sun Jan 6 13:55:57 2002
3265+++ linux/include/linux/list.h Mon Jan 28 18:48:00 2002
3266@@ -19,6 +19,8 @@
3267 struct list_head *next, *prev;
3268 };
3269
3270+typedef struct list_head list_t;
3271+
3272 #define LIST_HEAD_INIT(name) { &(name), &(name) }
3273
3274 #define LIST_HEAD(name) \
3275--- linux/include/linux/kernel_stat.h.orig Tue Aug 21 14:26:23 2001
3276+++ linux/include/linux/kernel_stat.h Mon Jan 28 18:48:00 2002
3277@@ -32,10 +32,11 @@
3278 unsigned int ipackets, opackets;
3279 unsigned int ierrors, oerrors;
3280 unsigned int collisions;
3281- unsigned int context_swtch;
3282 };
3283
3284 extern struct kernel_stat kstat;
3285+
3286+extern unsigned long nr_context_switches(void);
3287
3288 #if !defined(CONFIG_ARCH_S390)
3289 /*
3290--- linux/include/linux/smp.h.orig Sun Dec 31 20:10:17 2000
3291+++ linux/include/linux/smp.h Mon Jan 28 18:48:00 2002
3292@@ -86,6 +86,14 @@
3293 #define cpu_number_map(cpu) 0
3294 #define smp_call_function(func,info,retry,wait) ({ 0; })
3295 #define cpu_online_map 1
3296+static inline void smp_send_reschedule(int cpu) { }
3297+static inline void smp_send_reschedule_all(void) { }
3298
3299 #endif
3300+
3301+/*
3302+ * Common definitions:
3303+ */
3304+#define cpu() smp_processor_id()
3305+
3306 #endif
3307--- linux/include/asm-i386/smp.h.orig Sun Jan 6 13:55:57 2002
3308+++ linux/include/asm-i386/smp.h Mon Jan 28 18:48:00 2002
3309@@ -63,6 +63,7 @@
3310 extern void smp_flush_tlb(void);
3311 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
3312 extern void smp_send_reschedule(int cpu);
3313+extern void smp_send_reschedule_all(void);
3314 extern void smp_invalidate_rcv(void); /* Process an NMI */
3315 extern void (*mtrr_hook) (void);
3316 extern void zap_low_mappings (void);
3317@@ -104,7 +105,7 @@
3318 * so this is correct in the x86 case.
3319 */
3320
3321-#define smp_processor_id() (current->processor)
3322+#define smp_processor_id() (current->cpu)
3323
3324 static __inline int hard_smp_processor_id(void)
3325 {
3326@@ -121,18 +122,6 @@
3327 #endif /* !__ASSEMBLY__ */
3328
3329 #define NO_PROC_ID 0xFF /* No processor magic marker */
3330-
3331-/*
3332- * This magic constant controls our willingness to transfer
3333- * a process across CPUs. Such a transfer incurs misses on the L1
3334- * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
3335- * gut feeling is this will vary by board in value. For a board
3336- * with separate L2 cache it probably depends also on the RSS, and
3337- * for a board with shared L2 cache it ought to decay fast as other
3338- * processes are run.
3339- */
3340-
3341-#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
3342
3343 #endif
3344 #endif
3345--- linux/include/asm-i386/bitops.h.orig Tue Aug 21 14:26:16 2001
3346+++ linux/include/asm-i386/bitops.h Mon Jan 28 18:48:00 2002
3347@@ -75,6 +75,14 @@
3348 :"=m" (ADDR)
3349 :"Ir" (nr));
3350 }
3351+
3352+static __inline__ void __clear_bit(int nr, volatile void * addr)
3353+{
3354+ __asm__ __volatile__(
3355+ "btrl %1,%0"
3356+ :"=m" (ADDR)
3357+ :"Ir" (nr));
3358+}
3359 #define smp_mb__before_clear_bit() barrier()
3360 #define smp_mb__after_clear_bit() barrier()
3361
3362@@ -284,6 +292,34 @@
3363 }
3364
3365 /**
3366+ * find_first_bit - find the first set bit in a memory region
3367+ * @addr: The address to start the search at
3368+ * @size: The maximum size to search
3369+ *
3370+ * Returns the bit-number of the first set bit, not the number of the byte
3371+ * containing a bit.
3372+ */
3373+static __inline__ int find_first_bit(void * addr, unsigned size)
3374+{
3375+ int d0, d1;
3376+ int res;
3377+
3378+ /* This looks at memory. Mark it volatile to tell gcc not to move it around */
3379+ __asm__ __volatile__(
3380+ "xorl %%eax,%%eax\n\t"
3381+ "repe; scasl\n\t"
3382+ "jz 1f\n\t"
3383+ "leal -4(%%edi),%%edi\n\t"
3384+ "bsfl (%%edi),%%eax\n"
3385+ "1:\tsubl %%ebx,%%edi\n\t"
3386+ "shll $3,%%edi\n\t"
3387+ "addl %%edi,%%eax"
3388+ :"=a" (res), "=&c" (d0), "=&D" (d1)
3389+ :"1" ((size + 31) >> 5), "2" (addr), "b" (addr));
3390+ return res;
3391+}
3392+
3393+/**
3394 * find_next_zero_bit - find the first zero bit in a memory region
3395 * @addr: The address to base the search on
3396 * @offset: The bitnumber to start searching at
3397@@ -296,7 +332,7 @@
3398
3399 if (bit) {
3400 /*
3401- * Look for zero in first byte
3402+ * Look for zero in the first 32 bits.
3403 */
3404 __asm__("bsfl %1,%0\n\t"
3405 "jne 1f\n\t"
3406@@ -317,6 +353,39 @@
3407 }
3408
3409 /**
3410+ * find_next_bit - find the first set bit in a memory region
3411+ * @addr: The address to base the search on
3412+ * @offset: The bitnumber to start searching at
3413+ * @size: The maximum size to search
3414+ */
3415+static __inline__ int find_next_bit (void * addr, int size, int offset)
3416+{
3417+ unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
3418+ int set = 0, bit = offset & 31, res;
3419+
3420+ if (bit) {
3421+ /*
3422+ * Look for nonzero in the first 32 bits:
3423+ */
3424+ __asm__("bsfl %1,%0\n\t"
3425+ "jne 1f\n\t"
3426+ "movl $32, %0\n"
3427+ "1:"
3428+ : "=r" (set)
3429+ : "r" (*p >> bit));
3430+ if (set < (32 - bit))
3431+ return set + offset;
3432+ set = 32 - bit;
3433+ p++;
3434+ }
3435+ /*
3436+ * No set bit yet, search remaining full words for a bit
3437+ */
3438+ res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
3439+ return (offset + set + res);
3440+}
3441+
3442+/**
3443 * ffz - find first zero in word.
3444 * @word: The word to search
3445 *
3446@@ -327,6 +396,20 @@
3447 __asm__("bsfl %1,%0"
3448 :"=r" (word)
3449 :"r" (~word));
3450+ return word;
3451+}
3452+
3453+/**
3454+ * __ffs - find first bit in word.
3455+ * @word: The word to search
3456+ *
3457+ * Undefined if no bit exists, so code should check against 0 first.
3458+ */
3459+static __inline__ unsigned long __ffs(unsigned long word)
3460+{
3461+ __asm__("bsfl %1,%0"
3462+ :"=r" (word)
3463+ :"rm" (word));
3464 return word;
3465 }
3466
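__ffs() above is the bsfl-based helper the scheduler bitmap search relies on, and as its comment says the result is undefined for 0, so callers must test the word first. A portable stand-in showing the same contract (toy_ffs is illustrative, not the kernel helper):

    #include <stdio.h>

    /* Index of the least significant set bit; caller guarantees word != 0. */
    static unsigned long toy_ffs(unsigned long word)
    {
        unsigned long bit = 0;

        while (!(word & 1UL)) {
            word >>= 1;
            bit++;
        }
        return bit;
    }

    int main(void)
    {
        unsigned long mask = 0x48;      /* bits 3 and 6 set */

        if (mask)                       /* the mandatory zero check */
            printf("first set bit: %lu\n", toy_ffs(mask));   /* prints 3 */
        return 0;
    }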
3467--- linux/include/asm-i386/pgalloc.h.orig Sun Jan 6 13:55:57 2002
3468+++ linux/include/asm-i386/pgalloc.h Mon Jan 28 18:48:00 2002
3469@@ -224,6 +224,7 @@
3470 {
3471 struct mm_struct *active_mm;
3472 int state;
3473+ char __cacheline_padding[24];
3474 };
3475 extern struct tlb_state cpu_tlbstate[NR_CPUS];
3476
3477--- linux/include/asm-i386/mmu_context.h.orig Tue Aug 21 14:26:23 2001
3478+++ linux/include/asm-i386/mmu_context.h Mon Jan 28 18:48:00 2002
3479@@ -7,6 +7,31 @@
3480 #include <asm/pgalloc.h>
3481
3482 /*
3483+ * Every architecture must define this function. It's the fastest
3484+ * way of searching a 168-bit bitmap where the first 128 bits are
3485+ * unlikely to be set. It's guaranteed that at least one bit is
3486+ * set (the delimiter bit at MAX_PRIO), so the search terminates.
3487+ */
3488+#if MAX_RT_PRIO != 128 || MAX_PRIO != 168
3489+# error update this function.
3490+#endif
3491+
3492+static inline int sched_find_first_bit(unsigned long *b)
3493+{
3494+ if (unlikely(b[0]))
3495+ return __ffs(b[0]);
3496+ if (unlikely(b[1]))
3497+ return __ffs(b[1]) + 32;
3498+ if (unlikely(b[2]))
3499+ return __ffs(b[2]) + 64;
3500+ if (unlikely(b[3]))
3501+ return __ffs(b[3]) + 96;
3502+ if (b[4])
3503+ return __ffs(b[4]) + 128;
3504+ return __ffs(b[5]) + 32 + 128;
3505+}
3506+
3507+/*
3508 * possibly do the LDT unload here?
3509 */
3510 #define destroy_context(mm) do { } while(0)
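sched_find_first_bit() above unrolls the search over the six 32-bit words holding priorities 0-167 plus the delimiter bit; because sched_init() keeps the delimiter bit at MAX_PRIO permanently set, the search always finds something, and an otherwise empty runqueue simply yields MAX_PRIO (the idle priority). A portable, unoptimised model of the same search (toy code, not the asm version):

    #include <stdio.h>

    #define MAX_RT_PRIO 128
    #define MAX_PRIO    (MAX_RT_PRIO + 40)
    #define NWORDS      ((MAX_PRIO + 32) / 32)      /* 6 words of 32 used bits */

    static int toy_find_first_bit(const unsigned long *b)
    {
        int word, bit;

        for (word = 0; word < NWORDS; word++)
            for (bit = 0; bit < 32; bit++)
                if (b[word] & (1UL << bit))
                    return word * 32 + bit;
        return -1;      /* unreachable while the delimiter bit stays set */
    }

    int main(void)
    {
        unsigned long bitmap[NWORDS] = { 0 };

        bitmap[MAX_PRIO / 32] |= 1UL << (MAX_PRIO % 32);    /* delimiter */
        printf("empty runqueue  -> %d\n", toy_find_first_bit(bitmap)); /* 168 */

        bitmap[140 / 32] |= 1UL << (140 % 32);              /* a nice -8 task */
        printf("one task queued -> %d\n", toy_find_first_bit(bitmap)); /* 140 */
        return 0;
    }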
3511@@ -27,13 +52,13 @@
3512
3513 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
3514 {
3515- if (prev != next) {
3516+ if (likely(prev != next)) {
3517 /* stop flush ipis for the previous mm */
3518 clear_bit(cpu, &prev->cpu_vm_mask);
3519 /*
3520 * Re-load LDT if necessary
3521 */
3522- if (prev->context.segments != next->context.segments)
3523+ if (unlikely(prev->context.segments != next->context.segments))
3524 load_LDT(next);
3525 #ifdef CONFIG_SMP
3526 cpu_tlbstate[cpu].state = TLBSTATE_OK;
3527--- linux/include/asm-i386/hw_irq.h.orig Wed Jan 16 21:44:01 2002
3528+++ linux/include/asm-i386/hw_irq.h Mon Jan 28 18:48:00 2002
3529@@ -41,8 +41,9 @@
3530 #define ERROR_APIC_VECTOR 0xfe
3531 #define INVALIDATE_TLB_VECTOR 0xfd
3532 #define RESCHEDULE_VECTOR 0xfc
3533-#define CALL_FUNCTION_VECTOR 0xfb
3534-#define KDB_VECTOR 0xfa
3535+#define TASK_MIGRATION_VECTOR 0xfb
3536+#define CALL_FUNCTION_VECTOR 0xfa
3537+#define KDB_VECTOR 0xf9
3538
3539 /*
3540 * Local APIC timer IRQ vector is on a different priority level,
3541--- linux/include/asm-i386/apic.h.orig Mon Jan 28 18:05:10 2002
3542+++ linux/include/asm-i386/apic.h Mon Jan 28 18:48:00 2002
3543@@ -79,6 +79,8 @@
3544 extern void setup_apic_nmi_watchdog (void);
3545 extern inline void nmi_watchdog_tick (struct pt_regs * regs);
3546 extern int APIC_init_uniprocessor (void);
3547+extern void disable_APIC_timer(void);
3548+extern void enable_APIC_timer(void);
3549
3550 extern struct pm_dev *apic_pm_register(pm_dev_t, unsigned long, pm_callback);
3551 extern void apic_pm_unregister(struct pm_dev*);
3552--- linux/net/unix/af_unix.c.orig Sun Jan 6 13:55:58 2002
3553+++ linux/net/unix/af_unix.c Sun Jan 6 13:56:25 2002
3554@@ -564,10 +564,8 @@
3555 addr->hash)) {
3556 write_unlock(&unix_table_lock);
3557 /* Sanity yield. It is unusual case, but yet... */
3558- if (!(ordernum&0xFF)) {
3559- current->policy |= SCHED_YIELD;
3560- schedule();
3561- }
3562+ if (!(ordernum&0xFF))
3563+ yield();
3564 goto retry;
3565 }
3566 addr->hash ^= sk->type;
3567--- linux/net/ipv4/tcp_output.c.orig Sun Jan 6 13:55:57 2002
3568+++ linux/net/ipv4/tcp_output.c Sun Jan 6 13:56:25 2002
3569@@ -1009,8 +1009,7 @@
3570 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
3571 if (skb)
3572 break;
3573- current->policy |= SCHED_YIELD;
3574- schedule();
3575+ yield();
3576 }
3577
3578 /* Reserve space for headers and prepare control bits. */
3579--- linux/net/sunrpc/sched.c.orig Sun Jan 6 13:55:52 2002
3580+++ linux/net/sunrpc/sched.c Sun Jan 6 13:56:25 2002
3581@@ -772,8 +772,7 @@
3582 }
3583 if (flags & RPC_TASK_ASYNC)
3584 return NULL;
3585- current->policy |= SCHED_YIELD;
3586- schedule();
3587+ yield();
3588 } while (!signalled());
3589
3590 return NULL;
3591@@ -1114,8 +1113,7 @@
3592 __rpc_schedule();
3593 if (all_tasks) {
3594 dprintk("rpciod_killall: waiting for tasks to exit\n");
3595- current->policy |= SCHED_YIELD;
3596- schedule();
3597+ yield();
3598 }
3599 }
3600
3601@@ -1185,8 +1183,7 @@
3602 * wait briefly before checking the process id.
3603 */
3604 current->sigpending = 0;
3605- current->policy |= SCHED_YIELD;
3606- schedule();
3607+ yield();
3608 /*
3609 * Display a message if we're going to wait longer.
3610 */
3611--- linux/net/sched/sch_generic.c.orig Fri Aug 18 19:26:25 2000
3612+++ linux/net/sched/sch_generic.c Sun Jan 6 13:56:25 2002
3613@@ -475,10 +475,8 @@
3614
3615 dev_watchdog_down(dev);
3616
3617- while (test_bit(__LINK_STATE_SCHED, &dev->state)) {
3618- current->policy |= SCHED_YIELD;
3619- schedule();
3620- }
3621+ while (test_bit(__LINK_STATE_SCHED, &dev->state))
3622+ yield();
3623
3624 spin_unlock_wait(&dev->xmit_lock);
3625 }
3626--- linux/net/socket.c.orig Sun Jan 6 13:55:58 2002
3627+++ linux/net/socket.c Sun Jan 6 13:56:25 2002
3628@@ -148,8 +148,7 @@
3629 while (atomic_read(&net_family_lockct) != 0) {
3630 spin_unlock(&net_family_lock);
3631
3632- current->policy |= SCHED_YIELD;
3633- schedule();
3634+ yield();
3635
3636 spin_lock(&net_family_lock);
3637 }
3638--- linux/drivers/net/slip.c.orig Sun Jan 6 13:55:48 2002
3639+++ linux/drivers/net/slip.c Sun Jan 6 13:56:25 2002
3640@@ -1393,10 +1393,8 @@
3641 /* First of all: check for active disciplines and hangup them.
3642 */
3643 do {
3644- if (busy) {
3645- current->counter = 0;
3646- schedule();
3647- }
3648+ if (busy)
3649+ sys_sched_yield();
3650
3651 busy = 0;
3652 local_bh_disable();
3653--- linux/drivers/block/loop.c.orig Sun Jan 6 13:55:56 2002
3654+++ linux/drivers/block/loop.c Sun Jan 6 13:56:25 2002
3655@@ -570,9 +570,6 @@
3656 flush_signals(current);
3657 spin_unlock_irq(&current->sigmask_lock);
3658
3659- current->policy = SCHED_OTHER;
3660- current->nice = -20;
3661-
3662 spin_lock_irq(&lo->lo_lock);
3663 lo->lo_state = Lo_bound;
3664 atomic_inc(&lo->lo_pending);
3665--- linux/drivers/char/mwave/mwavedd.c.orig Sun Jan 13 16:27:41 2002
3666+++ linux/drivers/char/mwave/mwavedd.c Sun Jan 13 16:28:05 2002
3667@@ -279,7 +279,6 @@
3668 pDrvData->IPCs[ipcnum].bIsHere = FALSE;
3669 pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
3670 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
3671- current->nice = -20; /* boost to provide priority timing */
3672 #else
3673 current->priority = 0x28; /* boost to provide priority timing */
3674 #endif
3675--- linux/drivers/ide/ataraid.c.orig Sun Jan 6 13:55:52 2002
3676+++ linux/drivers/ide/ataraid.c Fri Jan 25 14:27:38 2002
3677@@ -123,8 +123,7 @@
3678 ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO);
3679 if (!ptr) {
3680 __set_current_state(TASK_RUNNING);
3681- current->policy |= SCHED_YIELD;
3682- schedule();
3683+ yield();
3684 }
3685 }
3686 return ptr;
3687@@ -139,8 +138,7 @@
3688 ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO);
3689 if (!ptr) {
3690 __set_current_state(TASK_RUNNING);
3691- current->policy |= SCHED_YIELD;
3692- schedule();
3693+ yield();
3694 }
3695 }
3696 return ptr;
3697--- linux/drivers/md/md.c.orig Sun Jan 6 13:55:56 2002
3698+++ linux/drivers/md/md.c Sun Jan 6 13:56:25 2002
3699@@ -2930,8 +2930,6 @@
3700 * bdflush, otherwise bdflush will deadlock if there are too
3701 * many dirty RAID5 blocks.
3702 */
3703- current->policy = SCHED_OTHER;
3704- current->nice = -20;
3705 md_unlock_kernel();
3706
3707 complete(thread->event);
3708@@ -3381,11 +3379,6 @@
3709 "(but not more than %d KB/sec) for reconstruction.\n",
3710 sysctl_speed_limit_max);
3711
3712- /*
3713- * Resync has low priority.
3714- */
3715- current->nice = 19;
3716-
3717 is_mddev_idle(mddev); /* this also initializes IO event counters */
3718 for (m = 0; m < SYNC_MARKS; m++) {
3719 mark[m] = jiffies;
3720@@ -3463,16 +3456,13 @@
3721 currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
3722
3723 if (currspeed > sysctl_speed_limit_min) {
3724- current->nice = 19;
3725-
3726 if ((currspeed > sysctl_speed_limit_max) ||
3727 !is_mddev_idle(mddev)) {
3728 current->state = TASK_INTERRUPTIBLE;
3729 md_schedule_timeout(HZ/4);
3730 goto repeat;
3731 }
3732- } else
3733- current->nice = -20;
3734+ }
3735 }
3736 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3737 err = 0;
3738--- linux/arch/i386/mm/fault.c.orig Sun Jan 6 13:55:47 2002
3739+++ linux/arch/i386/mm/fault.c Sun Jan 6 13:56:25 2002
3740@@ -88,8 +88,7 @@
3741
3742 out_of_memory:
3743 if (current->pid == 1) {
3744- current->policy |= SCHED_YIELD;
3745- schedule();
3746+ yield();
3747 goto survive;
3748 }
3749 goto bad_area;
3750@@ -344,8 +343,7 @@
3751 out_of_memory:
3752 up_read(&mm->mmap_sem);
3753 if (tsk->pid == 1) {
3754- tsk->policy |= SCHED_YIELD;
3755- schedule();
3756+ yield();
3757 down_read(&mm->mmap_sem);
3758 goto survive;
3759 }
3760--- linux/arch/i386/kernel/smpboot.c.orig Sun Jan 6 13:55:56 2002
3761+++ linux/arch/i386/kernel/smpboot.c Mon Jan 28 18:12:21 2002
3762@@ -308,14 +308,14 @@
3763 if (tsc_values[i] < avg)
3764 realdelta = -realdelta;
3765
3766- printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
3767- i, realdelta);
3768+ printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
3769 }
3770
3771 sum += delta;
3772 }
3773 if (!buggy)
3774 printk("passed.\n");
3775+ ;
3776 }
3777
3778 static void __init synchronize_tsc_ap (void)
3779@@ -365,7 +365,7 @@
3780 * (This works even if the APIC is not enabled.)
3781 */
3782 phys_id = GET_APIC_ID(apic_read(APIC_ID));
3783- cpuid = current->processor;
3784+ cpuid = cpu();
3785 if (test_and_set_bit(cpuid, &cpu_online_map)) {
3786 printk("huh, phys CPU#%d, CPU#%d already present??\n",
3787 phys_id, cpuid);
3788@@ -435,6 +435,7 @@
3789 */
3790 smp_store_cpu_info(cpuid);
3791
3792+ disable_APIC_timer();
3793 /*
3794 * Allow the master to continue.
3795 */
3796@@ -465,6 +466,7 @@
3797 smp_callin();
3798 while (!atomic_read(&smp_commenced))
3799 rep_nop();
3800+ enable_APIC_timer();
3801 /*
3802 * low-memory mappings have been cleared, flush them from
3803 * the local TLBs too.
3804@@ -803,16 +805,13 @@
3805 if (!idle)
3806 panic("No idle process for CPU %d", cpu);
3807
3808- idle->processor = cpu;
3809- idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
3810+ init_idle(idle, cpu);
3811
3812 map_cpu_to_boot_apicid(cpu, apicid);
3813
3814 idle->thread.eip = (unsigned long) start_secondary;
3815
3816- del_from_runqueue(idle);
3817 unhash_process(idle);
3818- init_tasks[cpu] = idle;
3819
3820 /* start_eip had better be page-aligned! */
3821 start_eip = setup_trampoline();
3822@@ -925,6 +924,7 @@
3823 }
3824
3825 cycles_t cacheflush_time;
3826+unsigned long cache_decay_ticks;
3827
3828 static void smp_tune_scheduling (void)
3829 {
3830@@ -958,9 +958,13 @@
3831 cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
3832 }
3833
3834+ cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
3835+
3836 printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
3837 (long)cacheflush_time/(cpu_khz/1000),
3838 ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
3839+ printk("task migration cache decay timeout: %ld msecs.\n",
3840+ (cache_decay_ticks + 1) * 1000 / HZ);
3841 }
3842
3843 /*
3844@@ -1020,8 +1024,7 @@
3845 map_cpu_to_boot_apicid(0, boot_cpu_apicid);
3846
3847 global_irq_holder = 0;
3848- current->processor = 0;
3849- init_idle();
3850+ current->cpu = 0;
3851 smp_tune_scheduling();
3852
3853 /*
3854--- linux/arch/i386/kernel/process.c.orig Mon Jan 28 18:09:58 2002
3855+++ linux/arch/i386/kernel/process.c Mon Jan 28 18:09:53 2002
3856@@ -123,15 +123,12 @@
3857 void cpu_idle (void)
3858 {
3859 /* endless idle loop with no priority at all */
3860- init_idle();
3861- current->nice = 20;
3862- current->counter = -100;
3863
3864 while (1) {
3865 void (*idle)(void) = pm_idle;
3866 if (!idle)
3867 idle = default_idle;
3868- while (!current->need_resched)
3869+ if (!current->need_resched)
3870 idle();
3871 schedule();
3872 check_pgt_cache();
3873@@ -694,15 +691,17 @@
3874 asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
3875
3876 /*
3877- * Restore %fs and %gs.
3878+ * Restore %fs and %gs if needed.
3879 */
3880- loadsegment(fs, next->fs);
3881- loadsegment(gs, next->gs);
3882+ if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
3883+ loadsegment(fs, next->fs);
3884+ loadsegment(gs, next->gs);
3885+ }
3886
3887 /*
3888 * Now maybe reload the debug registers
3889 */
3890- if (next->debugreg[7]){
3891+ if (unlikely(next->debugreg[7])) {
3892 loaddebug(next, 0);
3893 loaddebug(next, 1);
3894 loaddebug(next, 2);
3895@@ -712,7 +711,7 @@
3896 loaddebug(next, 7);
3897 }
3898
3899- if (prev->ioperm || next->ioperm) {
3900+ if (unlikely(prev->ioperm || next->ioperm)) {
3901 if (next->ioperm) {
3902 /*
3903 * 4 cachelines copy ... not good, but not that
3904--- linux/arch/i386/kernel/apic.c.orig Sun Jan 6 13:55:54 2002
3905+++ linux/arch/i386/kernel/apic.c Mon Jan 28 18:07:11 2002
3906@@ -785,8 +785,7 @@
3907 */
3908
3909 slice = clocks / (smp_num_cpus+1);
3910- printk("cpu: %d, clocks: %d, slice: %d\n",
3911- smp_processor_id(), clocks, slice);
3912+ printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice);
3913
3914 /*
3915 * Wait for IRQ0's slice:
3916@@ -809,8 +808,7 @@
3917
3918 __setup_APIC_LVTT(clocks);
3919
3920- printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n",
3921- smp_processor_id(), t0, t1, delta, slice, clocks);
3922+ printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", smp_processor_id(), t0, t1, delta, slice, clocks);
3923
3924 __restore_flags(flags);
3925 }
3926@@ -911,6 +909,26 @@
3927
3928 /* and update all other cpus */
3929 smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1);
3930+}
3931+
3932+void __init disable_APIC_timer(void)
3933+{
3934+ if (using_apic_timer) {
3935+ unsigned long v;
3936+
3937+ v = apic_read(APIC_LVTT);
3938+ apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
3939+ }
3940+}
3941+
3942+void enable_APIC_timer(void)
3943+{
3944+ if (using_apic_timer) {
3945+ unsigned long v;
3946+
3947+ v = apic_read(APIC_LVTT);
3948+ apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
3949+ }
3950 }
3951
3952 /*
3953--- linux/arch/i386/kernel/nmi.c.orig Sun Jan 6 13:55:43 2002
3954+++ linux/arch/i386/kernel/nmi.c Sun Jan 6 13:56:25 2002
3955@@ -283,7 +283,7 @@
3956 * to get a message out.
3957 */
3958 bust_spinlocks(1);
3959- printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
3960+ printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
3961 show_registers(regs);
3962 printk("console shuts up ...\n");
3963 console_silent();
3964--- linux/arch/i386/kernel/smp.c.orig Sun Jan 6 13:55:56 2002
3965+++ linux/arch/i386/kernel/smp.c Wed Jan 16 21:42:45 2002
3966@@ -105,7 +105,7 @@
3967 /* The 'big kernel lock' */
3968 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
3969
3970-struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }};
3971+struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }};
3972
3973 /*
3974 * the following functions deal with sending IPIs between CPUs.
3975@@ -485,15 +485,54 @@
3976 do_flush_tlb_all_local();
3977 }
3978
3979+static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED;
3980+static task_t *new_task;
3981+
3982+/*
3983+ * This function sends a 'task migration' IPI to another CPU.
3984+ * Must be called from syscall contexts, with interrupts *enabled*.
3985+ */
3986+void smp_migrate_task(int cpu, task_t *p)
3987+{
3988+ /*
3989+ * The target CPU will unlock the migration spinlock:
3990+ */
3991+ spin_lock(&migration_lock);
3992+ new_task = p;
3993+ send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR);
3994+}
3995+
3996+/*
3997+ * Task migration callback.
3998+ */
3999+asmlinkage void smp_task_migration_interrupt(void)
4000+{
4001+ task_t *p;
4002+
4003+ ack_APIC_irq();
4004+ p = new_task;
4005+ spin_unlock(&migration_lock);
4006+ sched_task_migrated(p);
4007+}
4008 /*
4009 * this function sends a 'reschedule' IPI to another CPU.
4010 * it goes straight through and wastes no time serializing
4011 * anything. Worst case is that we lose a reschedule ...
4012 */
4013-
4014 void smp_send_reschedule(int cpu)
4015 {
4016 send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
4017+}
4018+
4019+/*
4020+ * this function sends a reschedule IPI to all (other) CPUs.
4021+ * This should only be used if some 'global' task became runnable,
4022+ * such as a RT task, that must be handled now. The first CPU
4023+ * that manages to grab the task will run it.
4024+ */
4025+void smp_send_reschedule_all(void)
4026+{
4027+ send_IPI_allbutself(RESCHEDULE_VECTOR);
4028 }
4029
4030 /*
4031--- linux/arch/i386/kernel/i8259.c.orig Wed Jan 16 21:43:09 2002
4032+++ linux/arch/i386/kernel/i8259.c Fri Jan 18 15:36:35 2002
4033@@ -79,6 +79,7 @@
4034 * through the ICC by us (IPIs)
4035 */
4036 #ifdef CONFIG_SMP
4037+BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR)
4038 BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
4039 BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
4040 BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
4041@@ -472,6 +473,9 @@
4042 * IPI, driven by wakeup.
4043 */
4044 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
4045+
4046+ /* IPI for task migration */
4047+ set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt);
4048
4049 /* IPI for invalidation */
4050 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
4051--- linux/arch/i386/kernel/entry.S.orig Fri Jan 25 14:30:36 2002
4052+++ linux/arch/i386/kernel/entry.S Fri Jan 25 14:30:50 2002
4053@@ -77,7 +77,7 @@
4054 exec_domain = 16
4055 need_resched = 20
4056 tsk_ptrace = 24
4057-processor = 52
4058+cpu = 32
4059
4060 ENOSYS = 38
4061
4062--- linux/arch/i386/kernel/setup.c.orig Mon Jan 28 18:10:23 2002
4063+++ linux/arch/i386/kernel/setup.c Mon Jan 28 18:10:48 2002
4064@@ -2922,9 +2922,10 @@
4065 load_TR(nr);
4066 load_LDT(&init_mm);
4067
4068- /*
4069- * Clear all 6 debug registers:
4070- */
4071+ /* Clear %fs and %gs. */
4072+ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
4073+
4074+ /* Clear all 6 debug registers: */
4075
4076 #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
4077