sched-O1-2.4.18-pre8-K3.patch
109f8d43 1--- linux/fs/proc/proc_misc.c.orig Tue Feb 5 13:51:49 2002
2+++ linux/fs/proc/proc_misc.c Tue Feb 5 13:52:12 2002
3@@ -85,11 +85,11 @@
4 a = avenrun[0] + (FIXED_1/200);
5 b = avenrun[1] + (FIXED_1/200);
6 c = avenrun[2] + (FIXED_1/200);
7- len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
8+ len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
9 LOAD_INT(a), LOAD_FRAC(a),
10 LOAD_INT(b), LOAD_FRAC(b),
11 LOAD_INT(c), LOAD_FRAC(c),
12- nr_running, nr_threads, last_pid);
13+ nr_running(), nr_threads, last_pid);
14 return proc_calc_metrics(page, start, off, count, eof, len);
15 }
16
17@@ -101,7 +101,7 @@
18 int len;
19
20 uptime = jiffies;
21- idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
22+ idle = init_task.times.tms_utime + init_task.times.tms_stime;
23
24 /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
25 that would overflow about every five days at HZ == 100.
26@@ -303,10 +303,10 @@
27 }
28
29 len += sprintf(page + len,
30- "\nctxt %u\n"
31+ "\nctxt %lu\n"
32 "btime %lu\n"
33 "processes %lu\n",
34- kstat.context_swtch,
35+ nr_context_switches(),
36 xtime.tv_sec - jif / HZ,
37 total_forks);
38
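
The format-string changes above follow from a type change: nr_running and kstat.context_swtch, formerly plain global counters, become functions returning an unsigned long sum over the new per-CPU runqueues. The patch defines them further down, in the kernel/sched.c hunks of this same file:

unsigned long nr_running(void)
{
        unsigned long i, sum = 0;

        for (i = 0; i < smp_num_cpus; i++)
                sum += cpu_rq(cpu_logical_map(i))->nr_running;

        return sum;
}

nr_context_switches() is the analogous sum of the per-runqueue nr_switches counters, which is why the "ctxt" line of /proc/stat switches to %lu.
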
39--- linux/fs/proc/array.c.orig Tue Feb 5 13:51:45 2002
40+++ linux/fs/proc/array.c Tue Feb 5 13:52:12 2002
41@@ -335,9 +335,8 @@
42
43 /* scale priority and nice values from timeslices to -20..20 */
44 /* to make it look like a "normal" Unix priority/nice value */
45- priority = task->counter;
46- priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
47- nice = task->nice;
48+ priority = task_prio(task);
49+ nice = task_nice(task);
50
51 read_lock(&tasklist_lock);
52 ppid = task->pid ? task->p_opptr->pid : 0;
53@@ -387,7 +386,7 @@
54 task->nswap,
55 task->cnswap,
56 task->exit_signal,
57- task->processor);
58+ task->cpu);
59 if(mm)
60 mmput(mm);
61 return res;
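
The /proc/<pid>/stat code above stops open-coding the priority/nice math on task->counter and task->nice and asks the scheduler instead, via the new task_prio() and task_nice() helpers (added in the kernel/sched.c hunk below); task->processor also becomes task->cpu. The nice value now round-trips through the static priority. A minimal userspace sketch of that mapping, mirroring the MAX_RT_PRIO/MAX_PRIO constants the patch introduces (illustrative only, not part of the patch):

#include <stdio.h>

/* Mirrors NICE_TO_PRIO()/PRIO_TO_NICE() from the kernel/sched.c hunk below. */
#define MAX_RT_PRIO             100
#define MAX_PRIO                (MAX_RT_PRIO + 40)
#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)

int main(void)
{
        int nice;

        /* nice -20..19 maps onto static_prio 100..139 and back again */
        for (nice = -20; nice <= 19; nice += 13)
                printf("nice %3d -> static_prio %3d -> nice %3d\n",
                       nice, NICE_TO_PRIO(nice),
                       PRIO_TO_NICE(NICE_TO_PRIO(nice)));
        return 0;
}
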
62--- linux/fs/nfs/pagelist.c.orig Tue Feb 5 13:51:50 2002
63+++ linux/fs/nfs/pagelist.c Tue Feb 5 13:52:12 2002
64@@ -96,8 +96,7 @@
65 continue;
66 if (signalled() && (server->flags & NFS_MOUNT_INTR))
67 return ERR_PTR(-ERESTARTSYS);
68- current->policy = SCHED_YIELD;
69- schedule();
70+ yield();
71 }
72
73 /* Initialize the request struct. Initially, we assume a
74--- linux/fs/ufs/truncate.c.orig Tue Feb 5 13:51:53 2002
75+++ linux/fs/ufs/truncate.c Tue Feb 5 13:52:12 2002
76@@ -448,10 +448,7 @@
77 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
78 ufs_sync_inode (inode);
79 run_task_queue(&tq_disk);
80- current->policy |= SCHED_YIELD;
81- schedule ();
82-
83-
84+ yield();
85 }
86 offset = inode->i_size & uspi->s_fshift;
87 if (offset) {
88--- linux/fs/reiserfs/buffer2.c.orig Tue Feb 5 13:51:51 2002
89+++ linux/fs/reiserfs/buffer2.c Tue Feb 5 13:52:12 2002
90@@ -33,8 +33,7 @@
91 buffer_journal_dirty(bh) ? ' ' : '!');
92 }
93 run_task_queue(&tq_disk);
94- current->policy |= SCHED_YIELD;
95- schedule();
96+ yield();
97 }
98 if (repeat_counter > 30000000) {
99 reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ;
100@@ -52,11 +51,11 @@
101 struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size)
102 {
103 struct buffer_head *result;
104- PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
105+ PROC_EXP( unsigned int ctx_switches = nr_context_switches(); );
106
107 result = bread (super -> s_dev, n_block, n_size);
108 PROC_INFO_INC( super, breads );
109- PROC_EXP( if( kstat.context_swtch != ctx_switches )
110+ PROC_EXP( if( nr_context_switches() != ctx_switches )
111 PROC_INFO_INC( super, bread_miss ) );
112 return result;
113 }
114--- linux/fs/reiserfs/journal.c.orig Tue Feb 5 13:51:53 2002
115+++ linux/fs/reiserfs/journal.c Tue Feb 5 13:52:12 2002
116@@ -149,8 +149,7 @@
117 }
118 bn = allocate_bitmap_node(p_s_sb) ;
119 if (!bn) {
120- current->policy |= SCHED_YIELD ;
121- schedule() ;
122+ yield();
123 goto repeat ;
124 }
125 return bn ;
126--- linux/fs/jffs2/background.c.orig Tue Feb 5 13:51:47 2002
127+++ linux/fs/jffs2/background.c Tue Feb 5 13:52:12 2002
128@@ -106,9 +106,6 @@
129
130 sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
131
132- /* FIXME in the 2.2 backport */
133- current->nice = 10;
134-
135 for (;;) {
136 spin_lock_irq(&current->sigmask_lock);
137 siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
138--- linux/fs/jbd/journal.c.orig Tue Feb 5 13:51:53 2002
139+++ linux/fs/jbd/journal.c Tue Feb 5 13:52:12 2002
140@@ -460,8 +460,7 @@
141 printk (KERN_NOTICE __FUNCTION__
142 ": ENOMEM at get_unused_buffer_head, "
143 "trying again.\n");
144- current->policy |= SCHED_YIELD;
145- schedule();
146+ yield();
147 }
148 } while (!new_bh);
149 /* keep subsequent assertions sane */
150@@ -1541,8 +1540,7 @@
151 last_warning = jiffies;
152 }
153
154- current->policy |= SCHED_YIELD;
155- schedule();
156+ yield();
157 }
158 }
159
160@@ -1600,8 +1598,7 @@
161 last_warning = jiffies;
162 }
163 while (ret == 0) {
164- current->policy |= SCHED_YIELD;
165- schedule();
166+ yield();
167 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
168 }
169 }
170--- linux/fs/jbd/revoke.c.orig Tue Feb 5 13:51:53 2002
171+++ linux/fs/jbd/revoke.c Tue Feb 5 13:52:12 2002
172@@ -137,8 +137,7 @@
173 if (!journal_oom_retry)
174 return -ENOMEM;
175 jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
176- current->policy |= SCHED_YIELD;
177- schedule();
178+ yield();
179 goto repeat;
180 }
181
182--- linux/fs/jbd/transaction.c.orig Tue Feb 5 13:51:53 2002
183+++ linux/fs/jbd/transaction.c Tue Feb 5 13:52:12 2002
184@@ -1379,8 +1379,7 @@
185 do {
186 old_handle_count = transaction->t_handle_count;
187 set_current_state(TASK_RUNNING);
188- current->policy |= SCHED_YIELD;
189- schedule();
190+ yield();
191 } while (old_handle_count != transaction->t_handle_count);
192 }
193
194--- linux/fs/binfmt_elf.c.orig Tue Feb 5 13:51:53 2002
195+++ linux/fs/binfmt_elf.c Tue Feb 5 13:52:12 2002
196@@ -1135,7 +1135,7 @@
197 psinfo.pr_state = i;
198 psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
199 psinfo.pr_zomb = psinfo.pr_sname == 'Z';
200- psinfo.pr_nice = current->nice;
201+ psinfo.pr_nice = task_nice(current);
202 psinfo.pr_flag = current->flags;
203 psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
204 psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
205--- linux/fs/buffer.c.orig Tue Feb 5 13:51:53 2002
206+++ linux/fs/buffer.c Tue Feb 5 13:52:12 2002
207@@ -735,9 +735,8 @@
208 wakeup_bdflush();
209 try_to_free_pages(zone, GFP_NOFS, 0);
210 run_task_queue(&tq_disk);
211- current->policy |= SCHED_YIELD;
212 __set_current_state(TASK_RUNNING);
213- schedule();
214+ sys_sched_yield();
215 }
216
217 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
218--- linux/fs/locks.c.orig Tue Feb 5 13:51:45 2002
219+++ linux/fs/locks.c Tue Feb 5 13:52:12 2002
220@@ -445,8 +445,7 @@
221 /* Let the blocked process remove waiter from the
222 * block list when it gets scheduled.
223 */
224- current->policy |= SCHED_YIELD;
225- schedule();
226+ yield();
227 } else {
228 /* Remove waiter from the block list, because by the
229 * time it wakes up blocker won't exist any more.
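
Every hunk above replaces the old "current->policy |= SCHED_YIELD; schedule();" idiom with a single yield() call. The helper itself is defined by parts of the patch outside this excerpt; judging from the fs/buffer.c hunk above, which open-codes the same thing with sys_sched_yield(), it presumably boils down to a sketch like the following (an assumption, not a quote from the patch):

/* Sketch of the new helper: mark the task runnable and give up the CPU
 * through the O(1) scheduler's sched_yield() path, which requeues the
 * task instead of fiddling with a SCHED_YIELD policy bit. */
void yield(void)
{
        __set_current_state(TASK_RUNNING);
        sys_sched_yield();
}
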
230--- linux/init/main.c.orig Tue Feb 5 13:51:53 2002
231+++ linux/init/main.c Tue Feb 5 13:52:12 2002
232@@ -485,8 +485,6 @@
233 extern void setup_arch(char **);
234 extern void cpu_idle(void);
235
236-unsigned long wait_init_idle;
237-
238 #ifndef CONFIG_SMP
239
240 #ifdef CONFIG_X86_LOCAL_APIC
241@@ -495,34 +493,24 @@
242 APIC_init_uniprocessor();
243 }
244 #else
245-#define smp_init() do { } while (0)
246+#define smp_init() do { } while (0)
247 #endif
248
249 #else
250
251-
252 /* Called by boot processor to activate the rest. */
253 static void __init smp_init(void)
254 {
255 /* Get other processors into their bootup holding patterns. */
256 smp_boot_cpus();
257- wait_init_idle = cpu_online_map;
258- clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
259
260 smp_threads_ready=1;
261 smp_commence();
262-
263- /* Wait for the other cpus to set up their idle processes */
264- printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
265- while (wait_init_idle) {
266- cpu_relax();
267- barrier();
268- }
269- printk("All processors have done init_idle\n");
270 }
271
272 #endif
273
274+
275 /*
276 * We need to finalize in a non-__init function or else race conditions
277 * between the root thread and the init thread may cause start_kernel to
278@@ -534,9 +522,8 @@
279 {
280 kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
281 unlock_kernel();
282- current->need_resched = 1;
283- cpu_idle();
284-}
285+ cpu_idle();
286+}
287
288 /*
289 * Activate the first processor.
290@@ -617,14 +604,18 @@
291 ipc_init();
292 #endif
293 check_bugs();
294+
295 printk("POSIX conformance testing by UNIFIX\n");
296
297- /*
298- * We count on the initial thread going ok
299- * Like idlers init is an unlocked kernel thread, which will
300- * make syscalls (and thus be locked).
301+ init_idle(current, smp_processor_id());
302+ /*
303+ * We count on the initial thread going ok
304+ * Like idlers init is an unlocked kernel thread, which will
305+ * make syscalls (and thus be locked).
306 */
307 smp_init();
308+
309+ /* Do the rest non-__init'ed, we're now alive */
310 rest_init();
311 }
312
313@@ -785,12 +776,9 @@
314 int i, pid;
315
316 pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
317- if (pid > 0) {
318- while (pid != wait(&i)) {
319- current->policy |= SCHED_YIELD;
320- schedule();
321- }
322- }
323+ if (pid > 0)
324+ while (pid != wait(&i))
325+ yield();
326 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
327 || MINOR(real_root_dev) != 0) {
328 error = change_root(real_root_dev,"/initrd");
329--- linux/kernel/sched.c.orig Tue Feb 5 13:51:51 2002
330+++ linux/kernel/sched.c Tue Feb 5 13:52:12 2002
331@@ -12,333 +12,306 @@
332 * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
333 */
334
335-/*
336- * 'sched.c' is the main kernel file. It contains scheduling primitives
337- * (sleep_on, wakeup, schedule etc) as well as a number of simple system
338- * call functions (type getpid()), which just extract a field from
339- * current-task
340- */
341-
342-#include <linux/config.h>
343 #include <linux/mm.h>
344+#include <linux/nmi.h>
345 #include <linux/init.h>
346+#include <asm/uaccess.h>
347 #include <linux/smp_lock.h>
348-#include <linux/nmi.h>
349 #include <linux/interrupt.h>
350-#include <linux/kernel_stat.h>
351-#include <linux/completion.h>
352-#include <linux/prefetch.h>
353-#include <linux/compiler.h>
354-
355-#include <asm/uaccess.h>
356 #include <asm/mmu_context.h>
357-
358-extern void timer_bh(void);
359-extern void tqueue_bh(void);
360-extern void immediate_bh(void);
361+#include <linux/kernel_stat.h>
362
363 /*
364- * scheduler variables
365+ * Priority of a process goes from 0 to 139. The 0-99
366+ * priority range is allocated to RT tasks, the 100-139
367+ * range is for SCHED_OTHER tasks. Priority values are
368+ * inverted: lower p->prio value means higher priority.
369 */
370-
371-unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
372-
373-extern void mem_use(void);
374+#define MAX_RT_PRIO 100
375+#define MAX_PRIO (MAX_RT_PRIO + 40)
376
377 /*
378- * Scheduling quanta.
379- *
380- * NOTE! The unix "nice" value influences how long a process
381- * gets. The nice value ranges from -20 to +19, where a -20
382- * is a "high-priority" task, and a "+10" is a low-priority
383- * task.
384- *
385- * We want the time-slice to be around 50ms or so, so this
386- * calculation depends on the value of HZ.
387+ * Convert user-nice values [ -20 ... 0 ... 19 ]
388+ * to static priority [ 100 ... 139 (MAX_PRIO-1) ],
389+ * and back.
390 */
391-#if HZ < 200
392-#define TICK_SCALE(x) ((x) >> 2)
393-#elif HZ < 400
394-#define TICK_SCALE(x) ((x) >> 1)
395-#elif HZ < 800
396-#define TICK_SCALE(x) (x)
397-#elif HZ < 1600
398-#define TICK_SCALE(x) ((x) << 1)
399-#else
400-#define TICK_SCALE(x) ((x) << 2)
401-#endif
402-
403-#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
404+#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
405+#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
406+#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
407
408+/*
409+ * 'User priority' is the nice value converted to something we
410+ * can work with better when scaling various scheduler parameters,
411+ * it's a [ 0 ... 39 ] range.
412+ */
413+#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
414+#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
415+#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
416
417 /*
418- * Init task must be ok at boot for the ix86 as we will check its signals
419- * via the SMP irq return path.
420+ * These are the 'tuning knobs' of the scheduler:
421+ *
422+ * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
423+ * maximum timeslice is 300 msecs. Timeslices get refilled after
424+ * they expire.
425 */
426-
427-struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
428+#define MIN_TIMESLICE ( 10 * HZ / 1000)
429+#define MAX_TIMESLICE (300 * HZ / 1000)
430+#define CHILD_PENALTY 95
431+#define PARENT_PENALTY 100
432+#define EXIT_WEIGHT 3
433+#define PRIO_BONUS_RATIO 25
434+#define INTERACTIVE_DELTA 2
435+#define MAX_SLEEP_AVG (2*HZ)
436+#define STARVATION_LIMIT (2*HZ)
437
438 /*
439- * The tasklist_lock protects the linked list of processes.
440+ * If a task is 'interactive' then we reinsert it in the active
441+ * array after it has expired its current timeslice. (it will not
442+ * continue to run immediately, it will still roundrobin with
443+ * other interactive tasks.)
444 *
445- * The runqueue_lock locks the parts that actually access
446- * and change the run-queues, and have to be interrupt-safe.
447+ * This part scales the interactivity limit depending on niceness.
448 *
449- * If both locks are to be concurrently held, the runqueue_lock
450- * nests inside the tasklist_lock.
451+ * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
452+ * Here are a few examples of different nice levels:
453 *
454- * task->alloc_lock nests inside tasklist_lock.
455+ * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
456+ * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
457+ * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
458+ * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
459+ * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
460+ *
461+ * (the X axis represents the possible -5 ... 0 ... +5 dynamic
462+ * priority range a task can explore, a value of '1' means the
463+ * task is rated interactive.)
464+ *
465+ * Ie. nice +19 tasks can never get 'interactive' enough to be
466+ * reinserted into the active array. And only heavily CPU-hog nice -20
467+ * tasks will be expired. Default nice 0 tasks are somewhere between,
468+ * it takes some effort for them to get interactive, but it's not
469+ * too hard.
470 */
471-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
472-rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
473
474-static LIST_HEAD(runqueue_head);
475+#define SCALE(v1,v1_max,v2_max) \
476+ (v1) * (v2_max) / (v1_max)
477
478-/*
479- * We align per-CPU scheduling data on cacheline boundaries,
480- * to prevent cacheline ping-pong.
481- */
482-static union {
483- struct schedule_data {
484- struct task_struct * curr;
485- cycles_t last_schedule;
486- } schedule_data;
487- char __pad [SMP_CACHE_BYTES];
488-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
489+#define DELTA(p) \
490+ (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
491+ INTERACTIVE_DELTA)
492
493-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
494-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
495+#define TASK_INTERACTIVE(p) \
496+ ((p)->prio <= (p)->static_prio - DELTA(p))
497
498-struct kernel_stat kstat;
499-extern struct task_struct *child_reaper;
500+/*
501+ * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
502+ * to time slice values.
503+ *
504+ * The higher a process's priority, the bigger timeslices
505+ * it gets during one round of execution. But even the lowest
506+ * priority process gets MIN_TIMESLICE worth of execution time.
507+ */
508
509-#ifdef CONFIG_SMP
510+#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
511+ ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
512
513-#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
514-#define can_schedule(p,cpu) \
515- ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
516+/*
517+ * These are the runqueue data structures:
518+ */
519
520-#else
521+#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
522
523-#define idle_task(cpu) (&init_task)
524-#define can_schedule(p,cpu) (1)
525+typedef struct runqueue runqueue_t;
526
527-#endif
528-
529-void scheduling_functions_start_here(void) { }
530+struct prio_array {
531+ int nr_active;
532+ spinlock_t *lock;
533+ runqueue_t *rq;
534+ unsigned long bitmap[BITMAP_SIZE];
535+ list_t queue[MAX_PRIO];
536+};
537
538 /*
539- * This is the function that decides how desirable a process is..
540- * You can weigh different processes against each other depending
541- * on what CPU they've run on lately etc to try to handle cache
542- * and TLB miss penalties.
543+ * This is the main, per-CPU runqueue data structure.
544 *
545- * Return values:
546- * -1000: never select this
547- * 0: out of time, recalculate counters (but it might still be
548- * selected)
549- * +ve: "goodness" value (the larger, the better)
550- * +1000: realtime process, select this.
551+ * Locking rule: those places that want to lock multiple runqueues
552+ * (such as the load balancing or the process migration code), lock
553+ * acquire operations must be ordered by ascending &runqueue.
554 */
555+struct runqueue {
556+ spinlock_t lock;
557+ unsigned long nr_running, nr_switches, expired_timestamp;
558+ task_t *curr, *idle;
559+ prio_array_t *active, *expired, arrays[2];
560+ int prev_nr_running[NR_CPUS];
561+} ____cacheline_aligned;
562
563-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
564-{
565- int weight;
566+static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
567
568- /*
569- * select the current process after every other
570- * runnable process, but before the idle thread.
571- * Also, dont trigger a counter recalculation.
572- */
573- weight = -1;
574- if (p->policy & SCHED_YIELD)
575- goto out;
576+#define cpu_rq(cpu) (runqueues + (cpu))
577+#define this_rq() cpu_rq(smp_processor_id())
578+#define task_rq(p) cpu_rq((p)->cpu)
579+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
580+#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
581
582- /*
583- * Non-RT process - normal case first.
584- */
585- if (p->policy == SCHED_OTHER) {
586- /*
587- * Give the process a first-approximation goodness value
588- * according to the number of clock-ticks it has left.
589- *
590- * Don't do any other calculations if the time slice is
591- * over..
592- */
593- weight = p->counter;
594- if (!weight)
595- goto out;
596-
597-#ifdef CONFIG_SMP
598- /* Give a largish advantage to the same processor... */
599- /* (this is equivalent to penalizing other processors) */
600- if (p->processor == this_cpu)
601- weight += PROC_CHANGE_PENALTY;
602-#endif
603+static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags)
604+{
605+ struct runqueue *__rq;
606
607- /* .. and a slight advantage to the current MM */
608- if (p->mm == this_mm || !p->mm)
609- weight += 1;
610- weight += 20 - p->nice;
611- goto out;
612+repeat_lock_task:
613+ __rq = task_rq(p);
614+ spin_lock_irqsave(&__rq->lock, *flags);
615+ if (unlikely(__rq != task_rq(p))) {
616+ spin_unlock_irqrestore(&__rq->lock, *flags);
617+ goto repeat_lock_task;
618 }
619+ return __rq;
620+}
621
622- /*
623- * Realtime process, select the first one on the
624- * runqueue (taking priorities within processes
625- * into account).
626- */
627- weight = 1000 + p->rt_priority;
628-out:
629- return weight;
630+static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags)
631+{
632+ spin_unlock_irqrestore(&rq->lock, *flags);
633 }
634
635 /*
636- * the 'goodness value' of replacing a process on a given CPU.
637- * positive value means 'replace', zero or negative means 'dont'.
638+ * Adding/removing a task to/from a priority array:
639 */
640-static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
641+static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
642 {
643- return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
644+ array->nr_active--;
645+ list_del_init(&p->run_list);
646+ if (list_empty(array->queue + p->prio))
647+ __clear_bit(p->prio, array->bitmap);
648 }
649
650-/*
651- * This is ugly, but reschedule_idle() is very timing-critical.
652- * We are called with the runqueue spinlock held and we must
653- * not claim the tasklist_lock.
654- */
655-static FASTCALL(void reschedule_idle(struct task_struct * p));
656+static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
657+{
658+ list_add_tail(&p->run_list, array->queue + p->prio);
659+ __set_bit(p->prio, array->bitmap);
660+ array->nr_active++;
661+ p->array = array;
662+}
663
664-static void reschedule_idle(struct task_struct * p)
665+static inline int effective_prio(task_t *p)
666 {
667-#ifdef CONFIG_SMP
668- int this_cpu = smp_processor_id();
669- struct task_struct *tsk, *target_tsk;
670- int cpu, best_cpu, i, max_prio;
671- cycles_t oldest_idle;
672-
673- /*
674- * shortcut if the woken up task's last CPU is
675- * idle now.
676- */
677- best_cpu = p->processor;
678- if (can_schedule(p, best_cpu)) {
679- tsk = idle_task(best_cpu);
680- if (cpu_curr(best_cpu) == tsk) {
681- int need_resched;
682-send_now_idle:
683- /*
684- * If need_resched == -1 then we can skip sending
685- * the IPI altogether, tsk->need_resched is
686- * actively watched by the idle thread.
687- */
688- need_resched = tsk->need_resched;
689- tsk->need_resched = 1;
690- if ((best_cpu != this_cpu) && !need_resched)
691- smp_send_reschedule(best_cpu);
692- return;
693- }
694- }
695+ int bonus, prio;
696
697 /*
698- * We know that the preferred CPU has a cache-affine current
699- * process, lets try to find a new idle CPU for the woken-up
700- * process. Select the least recently active idle CPU. (that
701- * one will have the least active cache context.) Also find
702- * the executing process which has the least priority.
703- */
704- oldest_idle = (cycles_t) -1;
705- target_tsk = NULL;
706- max_prio = 0;
707+ * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
708+ * into the -5 ... 0 ... +5 bonus/penalty range.
709+ *
710+ * We use 25% of the full 0...39 priority range so that:
711+ *
712+ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
713+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
714+ *
715+ * Both properties are important to certain workloads.
716+ */
717+ bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
718+ MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
719
720- for (i = 0; i < smp_num_cpus; i++) {
721- cpu = cpu_logical_map(i);
722- if (!can_schedule(p, cpu))
723- continue;
724- tsk = cpu_curr(cpu);
725+ prio = p->static_prio - bonus;
726+ if (prio < MAX_RT_PRIO)
727+ prio = MAX_RT_PRIO;
728+ if (prio > MAX_PRIO-1)
729+ prio = MAX_PRIO-1;
730+ return prio;
731+}
732+
733+static inline void activate_task(task_t *p, runqueue_t *rq)
734+{
735+ unsigned long sleep_time = jiffies - p->sleep_timestamp;
736+ prio_array_t *array = rq->active;
737+
738+ if (!rt_task(p) && sleep_time) {
739 /*
740- * We use the first available idle CPU. This creates
741- * a priority list between idle CPUs, but this is not
742- * a problem.
743+ * This code gives a bonus to interactive tasks. We update
744+ * an 'average sleep time' value here, based on
745+ * sleep_timestamp. The more time a task spends sleeping,
746+ * the higher the average gets - and the higher the priority
747+ * boost gets as well.
748 */
749- if (tsk == idle_task(cpu)) {
750-#if defined(__i386__) && defined(CONFIG_SMP)
751- /*
752- * Check if two siblings are idle in the same
753- * physical package. Use them if found.
754- */
755- if (smp_num_siblings == 2) {
756- if (cpu_curr(cpu_sibling_map[cpu]) ==
757- idle_task(cpu_sibling_map[cpu])) {
758- oldest_idle = last_schedule(cpu);
759- target_tsk = tsk;
760- break;
761- }
762-
763- }
764-#endif
765- if (last_schedule(cpu) < oldest_idle) {
766- oldest_idle = last_schedule(cpu);
767- target_tsk = tsk;
768- }
769- } else {
770- if (oldest_idle == -1ULL) {
771- int prio = preemption_goodness(tsk, p, cpu);
772-
773- if (prio > max_prio) {
774- max_prio = prio;
775- target_tsk = tsk;
776- }
777- }
778- }
779- }
780- tsk = target_tsk;
781- if (tsk) {
782- if (oldest_idle != -1ULL) {
783- best_cpu = tsk->processor;
784- goto send_now_idle;
785- }
786- tsk->need_resched = 1;
787- if (tsk->processor != this_cpu)
788- smp_send_reschedule(tsk->processor);
789+ p->sleep_avg += sleep_time;
790+ if (p->sleep_avg > MAX_SLEEP_AVG)
791+ p->sleep_avg = MAX_SLEEP_AVG;
792+ p->prio = effective_prio(p);
793 }
794- return;
795-
796+ enqueue_task(p, array);
797+ rq->nr_running++;
798+}
799
800-#else /* UP */
801- int this_cpu = smp_processor_id();
802- struct task_struct *tsk;
803-
804- tsk = cpu_curr(this_cpu);
805- if (preemption_goodness(tsk, p, this_cpu) > 0)
806- tsk->need_resched = 1;
807-#endif
808+static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
809+{
810+ rq->nr_running--;
811+ dequeue_task(p, p->array);
812+ p->array = NULL;
813 }
814
815+static inline void resched_task(task_t *p)
816+{
817+ int need_resched;
818+
819+ need_resched = p->need_resched;
820+ wmb();
821+ p->need_resched = 1;
822+ if (!need_resched && (p->cpu != smp_processor_id()))
823+ smp_send_reschedule(p->cpu);
824+}
825+
826+#ifdef CONFIG_SMP
827+
828 /*
829- * Careful!
830- *
831- * This has to add the process to the _beginning_ of the
832- * run-queue, not the end. See the comment about "This is
833- * subtle" in the scheduler proper..
834+ * Wait for a process to unschedule. This is used by the exit() and
835+ * ptrace() code.
836 */
837-static inline void add_to_runqueue(struct task_struct * p)
838+void wait_task_inactive(task_t * p)
839 {
840- list_add(&p->run_list, &runqueue_head);
841- nr_running++;
842+ unsigned long flags;
843+ runqueue_t *rq;
844+
845+repeat:
846+ rq = task_rq(p);
847+ while (unlikely(rq->curr == p)) {
848+ cpu_relax();
849+ barrier();
850+ }
851+ rq = lock_task_rq(p, &flags);
852+ if (unlikely(rq->curr == p)) {
853+ unlock_task_rq(rq, &flags);
854+ goto repeat;
855+ }
856+ unlock_task_rq(rq, &flags);
857 }
858
859-static inline void move_last_runqueue(struct task_struct * p)
860+/*
861+ * The SMP message passing code calls this function whenever
862+ * the new task has arrived at the target CPU. We move the
863+ * new task into the local runqueue.
864+ *
865+ * This function must be called with interrupts disabled.
866+ */
867+void sched_task_migrated(task_t *new_task)
868 {
869- list_del(&p->run_list);
870- list_add_tail(&p->run_list, &runqueue_head);
871+ wait_task_inactive(new_task);
872+ new_task->cpu = smp_processor_id();
873+ wake_up_process(new_task);
874 }
875
876-static inline void move_first_runqueue(struct task_struct * p)
877+/*
878+ * Kick the remote CPU if the task is running currently,
879+ * this code is used by the signal code to signal tasks
880+ * which are in user-mode as quickly as possible.
881+ *
882+ * (Note that we do this lockless - if the task does anything
883+ * while the message is in flight then it will notice the
884+ * sigpending condition anyway.)
885+ */
886+void kick_if_running(task_t * p)
887 {
888- list_del(&p->run_list);
889- list_add(&p->run_list, &runqueue_head);
890+ if (p == task_rq(p)->curr)
891+ resched_task(p);
892 }
893+#endif
894
895 /*
896 * Wake up a process. Put it on the run-queue if it's not
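
The tuning knobs in the hunk above fully determine timeslices and the interactivity bonus, so the numbers in the comments can be checked directly. Assuming HZ=100 (the usual i386 value at the time): MIN_TIMESLICE is 1 tick (10 ms), MAX_TIMESLICE is 30 ticks (300 ms), and a nice-0 task (static_prio 120) gets 15 ticks, i.e. the advertised ~150 ms default. The effective_prio() bonus spans exactly -5..+5, and with DELTA(nice 0) = 2, TASK_INTERACTIVE() requires a bonus of at least +2, which matches the [1,1,1,1,0,...] row of the nice-0 table. A standalone program mirroring those macros (illustrative only, not patch code):

#include <stdio.h>

/* Mirrors the tuning knobs from the kernel/sched.c hunk above; HZ=100 is
 * an assumption used for the worked numbers. */
#define HZ                      100
#define MAX_RT_PRIO             100
#define MAX_PRIO                (MAX_RT_PRIO + 40)
#define MAX_USER_PRIO           (MAX_PRIO - MAX_RT_PRIO)
#define MIN_TIMESLICE           ( 10 * HZ / 1000)
#define MAX_TIMESLICE           (300 * HZ / 1000)
#define PRIO_BONUS_RATIO        25
#define MAX_SLEEP_AVG           (2*HZ)

static int task_timeslice(int static_prio)
{
        return MIN_TIMESLICE + (MAX_TIMESLICE - MIN_TIMESLICE)
                        * (MAX_PRIO - 1 - static_prio) / 39;
}

static int sleep_bonus(int sleep_avg)
{
        return MAX_USER_PRIO*PRIO_BONUS_RATIO*sleep_avg/MAX_SLEEP_AVG/100
                        - MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
}

int main(void)
{
        /* nice -20, 0 and +19 correspond to static_prio 100, 120 and 139 */
        printf("timeslice(nice -20) = %2d ticks\n", task_timeslice(100));
        printf("timeslice(nice   0) = %2d ticks\n", task_timeslice(120));
        printf("timeslice(nice +19) = %2d ticks\n", task_timeslice(139));
        printf("bonus(sleep_avg = 0)             = %d\n", sleep_bonus(0));
        printf("bonus(sleep_avg = MAX_SLEEP_AVG) = %d\n", sleep_bonus(MAX_SLEEP_AVG));
        return 0;
}
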
897@@ -348,392 +321,528 @@
898 * "current->state = TASK_RUNNING" to mark yourself runnable
899 * without the overhead of this.
900 */
901-static inline int try_to_wake_up(struct task_struct * p, int synchronous)
902+static int try_to_wake_up(task_t * p, int synchronous)
903 {
904 unsigned long flags;
905 int success = 0;
906+ runqueue_t *rq;
907
908- /*
909- * We want the common case fall through straight, thus the goto.
910- */
911- spin_lock_irqsave(&runqueue_lock, flags);
912+ rq = lock_task_rq(p, &flags);
913 p->state = TASK_RUNNING;
914- if (task_on_runqueue(p))
915- goto out;
916- add_to_runqueue(p);
917- if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
918- reschedule_idle(p);
919- success = 1;
920-out:
921- spin_unlock_irqrestore(&runqueue_lock, flags);
922+ if (!p->array) {
923+ activate_task(p, rq);
924+ if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio))
925+ resched_task(rq->curr);
926+ success = 1;
927+ }
928+ unlock_task_rq(rq, &flags);
929 return success;
930 }
931
932-inline int wake_up_process(struct task_struct * p)
933+int wake_up_process(task_t * p)
934 {
935 return try_to_wake_up(p, 0);
936 }
937
938-static void process_timeout(unsigned long __data)
939+void wake_up_forked_process(task_t * p)
940 {
941- struct task_struct * p = (struct task_struct *) __data;
942+ runqueue_t *rq = this_rq();
943
944- wake_up_process(p);
945+ p->state = TASK_RUNNING;
946+ if (!rt_task(p)) {
947+ /*
948+ * We decrease the sleep average of forking parents
949+ * and children as well, to keep max-interactive tasks
950+ * from forking tasks that are max-interactive.
951+ */
952+ current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
953+ p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
954+ p->prio = effective_prio(p);
955+ }
956+ spin_lock_irq(&rq->lock);
957+ p->cpu = smp_processor_id();
958+ activate_task(p, rq);
959+ spin_unlock_irq(&rq->lock);
960 }
961
962-/**
963- * schedule_timeout - sleep until timeout
964- * @timeout: timeout value in jiffies
965- *
966- * Make the current task sleep until @timeout jiffies have
967- * elapsed. The routine will return immediately unless
968- * the current task state has been set (see set_current_state()).
969- *
970- * You can set the task state as follows -
971- *
972- * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
973- * pass before the routine returns. The routine will return 0
974- *
975- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
976- * delivered to the current task. In this case the remaining time
977- * in jiffies will be returned, or 0 if the timer expired in time
978- *
979- * The current task state is guaranteed to be TASK_RUNNING when this
980- * routine returns.
981- *
982- * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
983- * the CPU away without a bound on the timeout. In this case the return
984- * value will be %MAX_SCHEDULE_TIMEOUT.
985+/*
986+ * Potentially available exiting-child timeslices are
987+ * retrieved here - this way the parent does not get
988+ * penalized for creating too many processes.
989 *
990- * In all cases the return value is guaranteed to be non-negative.
991+ * (this cannot be used to 'generate' timeslices
992+ * artificially, because any timeslice recovered here
993+ * was given away by the parent in the first place.)
994 */
995-signed long schedule_timeout(signed long timeout)
996+void sched_exit(task_t * p)
997 {
998- struct timer_list timer;
999- unsigned long expire;
1000+ __cli();
1001+ current->time_slice += p->time_slice;
1002+ if (unlikely(current->time_slice > MAX_TIMESLICE))
1003+ current->time_slice = MAX_TIMESLICE;
1004+ __sti();
1005+ /*
1006+ * If the child was a (relative-) CPU hog then decrease
1007+ * the sleep_avg of the parent as well.
1008+ */
1009+ if (p->sleep_avg < current->sleep_avg)
1010+ current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
1011+ p->sleep_avg) / (EXIT_WEIGHT + 1);
1012+}
1013
1014- switch (timeout)
1015- {
1016- case MAX_SCHEDULE_TIMEOUT:
1017- /*
1018- * These two special cases are useful to be comfortable
1019- * in the caller. Nothing more. We could take
1020- * MAX_SCHEDULE_TIMEOUT from one of the negative value
1021- * but I' d like to return a valid offset (>=0) to allow
1022- * the caller to do everything it want with the retval.
1023- */
1024- schedule();
1025- goto out;
1026- default:
1027- /*
1028- * Another bit of PARANOID. Note that the retval will be
1029- * 0 since no piece of kernel is supposed to do a check
1030- * for a negative retval of schedule_timeout() (since it
1031- * should never happens anyway). You just have the printk()
1032- * that will tell you if something is gone wrong and where.
1033- */
1034- if (timeout < 0)
1035- {
1036- printk(KERN_ERR "schedule_timeout: wrong timeout "
1037- "value %lx from %p\n", timeout,
1038- __builtin_return_address(0));
1039- current->state = TASK_RUNNING;
1040- goto out;
1041- }
1042- }
1043+#if CONFIG_SMP
1044+asmlinkage void schedule_tail(task_t *prev)
1045+{
1046+ spin_unlock_irq(&this_rq()->lock);
1047+}
1048+#endif
1049
1050- expire = timeout + jiffies;
1051+static inline void context_switch(task_t *prev, task_t *next)
1052+{
1053+ struct mm_struct *mm = next->mm;
1054+ struct mm_struct *oldmm = prev->active_mm;
1055
1056- init_timer(&timer);
1057- timer.expires = expire;
1058- timer.data = (unsigned long) current;
1059- timer.function = process_timeout;
1060+ prepare_to_switch();
1061
1062- add_timer(&timer);
1063- schedule();
1064- del_timer_sync(&timer);
1065+ if (unlikely(!mm)) {
1066+ next->active_mm = oldmm;
1067+ atomic_inc(&oldmm->mm_count);
1068+ enter_lazy_tlb(oldmm, next, smp_processor_id());
1069+ } else
1070+ switch_mm(oldmm, mm, next, smp_processor_id());
1071
1072- timeout = expire - jiffies;
1073+ if (unlikely(!prev->mm)) {
1074+ prev->active_mm = NULL;
1075+ mmdrop(oldmm);
1076+ }
1077
1078- out:
1079- return timeout < 0 ? 0 : timeout;
1080+ /*
1081+ * Here we just switch the register state and the stack. There are
1082+ * 3 processes affected by a context switch:
1083+ *
1084+ * prev ==> .... ==> (last => next)
1085+ *
1086+ * It's the 'much more previous' 'prev' that is on next's stack,
1087+ * but prev is set to (the just run) 'last' process by switch_to().
1088+ * This might sound slightly confusing but makes tons of sense.
1089+ */
1090+ switch_to(prev, next, prev);
1091 }
1092
1093-/*
1094- * schedule_tail() is getting called from the fork return path. This
1095- * cleans up all remaining scheduler things, without impacting the
1096- * common case.
1097- */
1098-static inline void __schedule_tail(struct task_struct *prev)
1099+unsigned long nr_running(void)
1100 {
1101-#ifdef CONFIG_SMP
1102- int policy;
1103-
1104- /*
1105- * prev->policy can be written from here only before `prev'
1106- * can be scheduled (before setting prev->cpus_runnable to ~0UL).
1107- * Of course it must also be read before allowing prev
1108- * to be rescheduled, but since the write depends on the read
1109- * to complete, wmb() is enough. (the spin_lock() acquired
1110- * before setting cpus_runnable is not enough because the spin_lock()
1111- * common code semantics allows code outside the critical section
1112- * to enter inside the critical section)
1113- */
1114- policy = prev->policy;
1115- prev->policy = policy & ~SCHED_YIELD;
1116- wmb();
1117+ unsigned long i, sum = 0;
1118
1119- /*
1120- * fast path falls through. We have to clear cpus_runnable before
1121- * checking prev->state to avoid a wakeup race. Protect against
1122- * the task exiting early.
1123- */
1124- task_lock(prev);
1125- task_release_cpu(prev);
1126- mb();
1127- if (prev->state == TASK_RUNNING)
1128- goto needs_resched;
1129+ for (i = 0; i < smp_num_cpus; i++)
1130+ sum += cpu_rq(cpu_logical_map(i))->nr_running;
1131
1132-out_unlock:
1133- task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
1134- return;
1135+ return sum;
1136+}
1137
1138- /*
1139- * Slow path - we 'push' the previous process and
1140- * reschedule_idle() will attempt to find a new
1141- * processor for it. (but it might preempt the
1142- * current process as well.) We must take the runqueue
1143- * lock and re-check prev->state to be correct. It might
1144- * still happen that this process has a preemption
1145- * 'in progress' already - but this is not a problem and
1146- * might happen in other circumstances as well.
1147- */
1148-needs_resched:
1149- {
1150- unsigned long flags;
1151+unsigned long nr_context_switches(void)
1152+{
1153+ unsigned long i, sum = 0;
1154
1155- /*
1156- * Avoid taking the runqueue lock in cases where
1157- * no preemption-check is necessery:
1158- */
1159- if ((prev == idle_task(smp_processor_id())) ||
1160- (policy & SCHED_YIELD))
1161- goto out_unlock;
1162+ for (i = 0; i < smp_num_cpus; i++)
1163+ sum += cpu_rq(cpu_logical_map(i))->nr_switches;
1164
1165- spin_lock_irqsave(&runqueue_lock, flags);
1166- if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
1167- reschedule_idle(prev);
1168- spin_unlock_irqrestore(&runqueue_lock, flags);
1169- goto out_unlock;
1170- }
1171-#else
1172- prev->policy &= ~SCHED_YIELD;
1173-#endif /* CONFIG_SMP */
1174+ return sum;
1175 }
1176
1177-asmlinkage void schedule_tail(struct task_struct *prev)
1178+#if CONFIG_SMP
1179+/*
1180+ * Lock the busiest runqueue as well, this_rq is locked already.
1181+ * Recalculate nr_running if we have to drop the runqueue lock.
1182+ */
1183+static inline unsigned int double_lock_balance(runqueue_t *this_rq,
1184+ runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
1185 {
1186- __schedule_tail(prev);
1187+ if (unlikely(!spin_trylock(&busiest->lock))) {
1188+ if (busiest < this_rq) {
1189+ spin_unlock(&this_rq->lock);
1190+ spin_lock(&busiest->lock);
1191+ spin_lock(&this_rq->lock);
1192+ /* Need to recalculate nr_running */
1193+ if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1194+ nr_running = this_rq->nr_running;
1195+ else
1196+ nr_running = this_rq->prev_nr_running[this_cpu];
1197+ } else
1198+ spin_lock(&busiest->lock);
1199+ }
1200+ return nr_running;
1201 }
1202
1203 /*
1204- * 'schedule()' is the scheduler function. It's a very simple and nice
1205- * scheduler: it's not perfect, but certainly works for most things.
1206- *
1207- * The goto is "interesting".
1208+ * Current runqueue is empty, or rebalance tick: if there is an
1209+ * imbalance (current runqueue is too short) then pull from
1210+ * busiest runqueue(s).
1211 *
1212- * NOTE!! Task 0 is the 'idle' task, which gets called when no other
1213- * tasks can run. It can not be killed, and it cannot sleep. The 'state'
1214- * information in task[0] is never used.
1215+ * We call this with the current runqueue locked,
1216+ * irqs disabled.
1217 */
1218-asmlinkage void schedule(void)
1219+static void load_balance(runqueue_t *this_rq, int idle)
1220 {
1221- struct schedule_data * sched_data;
1222- struct task_struct *prev, *next, *p;
1223- struct list_head *tmp;
1224- int this_cpu, c;
1225+ int imbalance, nr_running, load, max_load,
1226+ idx, i, this_cpu = smp_processor_id();
1227+ task_t *next = this_rq->idle, *tmp;
1228+ runqueue_t *busiest, *rq_src;
1229+ prio_array_t *array;
1230+ list_t *head, *curr;
1231
1232+ /*
1233+ * We search all runqueues to find the most busy one.
1234+ * We do this lockless to reduce cache-bouncing overhead,
1235+ * we re-check the 'best' source CPU later on again, with
1236+ * the lock held.
1237+ *
1238+ * We fend off statistical fluctuations in runqueue lengths by
1239+ * saving the runqueue length during the previous load-balancing
1240+ * operation and using the smaller one of the current and saved lengths.
1241+ * If a runqueue is long enough for a longer amount of time then
1242+ * we recognize it and pull tasks from it.
1243+ *
1244+ * The 'current runqueue length' is a statistical maximum variable,
1245+ * for that one we take the longer one - to avoid fluctuations in
1246+ * the other direction. So for a load-balance to happen it needs
1247+ * stable long runqueue on the target CPU and stable short runqueue
1248+ * on the local runqueue.
1249+ *
1250+ * We make an exception if this CPU is about to become idle - in
1251+ * that case we are less picky about moving a task across CPUs and
1252+ * take what can be taken.
1253+ */
1254+ if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
1255+ nr_running = this_rq->nr_running;
1256+ else
1257+ nr_running = this_rq->prev_nr_running[this_cpu];
1258
1259- spin_lock_prefetch(&runqueue_lock);
1260+ busiest = NULL;
1261+ max_load = 1;
1262+ for (i = 0; i < smp_num_cpus; i++) {
1263+ rq_src = cpu_rq(cpu_logical_map(i));
1264+ if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
1265+ load = rq_src->nr_running;
1266+ else
1267+ load = this_rq->prev_nr_running[i];
1268+ this_rq->prev_nr_running[i] = rq_src->nr_running;
1269+
1270+ if ((load > max_load) && (rq_src != this_rq)) {
1271+ busiest = rq_src;
1272+ max_load = load;
1273+ }
1274+ }
1275
1276- if (!current->active_mm) BUG();
1277-need_resched_back:
1278- prev = current;
1279- this_cpu = prev->processor;
1280+ if (likely(!busiest))
1281+ return;
1282
1283- if (unlikely(in_interrupt())) {
1284- printk("Scheduling in interrupt\n");
1285- BUG();
1286- }
1287+ imbalance = (max_load - nr_running) / 2;
1288
1289- release_kernel_lock(prev, this_cpu);
1290+ /* It needs an at least ~25% imbalance to trigger balancing. */
1291+ if (!idle && (imbalance < (max_load + 3)/4))
1292+ return;
1293
1294+ nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
1295 /*
1296- * 'sched_data' is protected by the fact that we can run
1297- * only one process per CPU.
1298+ * Make sure nothing changed since we checked the
1299+ * runqueue length.
1300 */
1301- sched_data = & aligned_data[this_cpu].schedule_data;
1302+ if (busiest->nr_running <= this_rq->nr_running + 1)
1303+ goto out_unlock;
1304
1305- spin_lock_irq(&runqueue_lock);
1306+ /*
1307+ * We first consider expired tasks. Those will likely not be
1308+ * executed in the near future, and they are most likely to
1309+ * be cache-cold, thus switching CPUs has the least effect
1310+ * on them.
1311+ */
1312+ if (busiest->expired->nr_active)
1313+ array = busiest->expired;
1314+ else
1315+ array = busiest->active;
1316
1317- /* move an exhausted RR process to be last.. */
1318- if (unlikely(prev->policy == SCHED_RR))
1319- if (!prev->counter) {
1320- prev->counter = NICE_TO_TICKS(prev->nice);
1321- move_last_runqueue(prev);
1322+new_array:
1323+ /* Start searching at priority 0: */
1324+ idx = 0;
1325+skip_bitmap:
1326+ if (!idx)
1327+ idx = sched_find_first_bit(array->bitmap);
1328+ else
1329+ idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1330+ if (idx == MAX_PRIO) {
1331+ if (array == busiest->expired) {
1332+ array = busiest->active;
1333+ goto new_array;
1334 }
1335-
1336- switch (prev->state) {
1337- case TASK_INTERRUPTIBLE:
1338- if (signal_pending(prev)) {
1339- prev->state = TASK_RUNNING;
1340- break;
1341- }
1342- default:
1343- del_from_runqueue(prev);
1344- case TASK_RUNNING:;
1345+ goto out_unlock;
1346 }
1347- prev->need_resched = 0;
1348-
1349- /*
1350- * this is the scheduler proper:
1351- */
1352
1353-repeat_schedule:
1354- /*
1355- * Default process to select..
1356- */
1357- next = idle_task(this_cpu);
1358- c = -1000;
1359- list_for_each(tmp, &runqueue_head) {
1360- p = list_entry(tmp, struct task_struct, run_list);
1361- if (can_schedule(p, this_cpu)) {
1362- int weight = goodness(p, this_cpu, prev->active_mm);
1363- if (weight > c)
1364- c = weight, next = p;
1365+ head = array->queue + idx;
1366+ curr = head->prev;
1367+skip_queue:
1368+ tmp = list_entry(curr, task_t, run_list);
1369+
1370+ /*
1371+ * We do not migrate tasks that are:
1372+ * 1) running (obviously), or
1373+ * 2) cannot be migrated to this CPU due to cpus_allowed, or
1374+ * 3) are cache-hot on their current CPU.
1375+ */
1376+
1377+#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
1378+ ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
1379+ ((p) != (rq)->curr) && \
1380+ (tmp->cpus_allowed & (1 << (this_cpu))))
1381+
1382+ if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
1383+ curr = curr->next;
1384+ if (curr != head)
1385+ goto skip_queue;
1386+ idx++;
1387+ goto skip_bitmap;
1388+ }
1389+ next = tmp;
1390+ /*
1391+ * take the task out of the other runqueue and
1392+ * put it into this one:
1393+ */
1394+ dequeue_task(next, array);
1395+ busiest->nr_running--;
1396+ next->cpu = this_cpu;
1397+ this_rq->nr_running++;
1398+ enqueue_task(next, this_rq->active);
1399+ if (next->prio < current->prio)
1400+ current->need_resched = 1;
1401+ if (!idle && --imbalance) {
1402+ if (array == busiest->expired) {
1403+ array = busiest->active;
1404+ goto new_array;
1405 }
1406 }
1407+out_unlock:
1408+ spin_unlock(&busiest->lock);
1409+}
1410+
1411+/*
1412+ * One of the idle_cpu_tick() or the busy_cpu_tick() function will
1413+ * get called every timer tick, on every CPU. Our balancing action
1414+ * frequency and balancing aggressiveness depend on whether the CPU is
1415+ * idle or not.
1416+ *
1417+ * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
1418+ * systems with HZ=100, every 10 msecs.)
1419+ */
1420+#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
1421+#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1422+
1423+static inline void idle_tick(void)
1424+{
1425+ if (jiffies % IDLE_REBALANCE_TICK)
1426+ return;
1427+ spin_lock(&this_rq()->lock);
1428+ load_balance(this_rq(), 1);
1429+ spin_unlock(&this_rq()->lock);
1430+}
1431+
1432+#endif
1433
1434- /* Do we need to re-calculate counters? */
1435- if (unlikely(!c)) {
1436- struct task_struct *p;
1437-
1438- spin_unlock_irq(&runqueue_lock);
1439- read_lock(&tasklist_lock);
1440- for_each_task(p)
1441- p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
1442- read_unlock(&tasklist_lock);
1443- spin_lock_irq(&runqueue_lock);
1444- goto repeat_schedule;
1445+/*
1446+ * We place interactive tasks back into the active array, if possible.
1447+ *
1448+ * To guarantee that this does not starve expired tasks we ignore the
1449+ * interactivity of a task if the first expired task had to wait more
1450+ * than a 'reasonable' amount of time. This deadline timeout is
1451+ * load-dependent, as the frequency of array switches decreases with
1452+ * increasing number of running tasks:
1453+ */
1454+#define EXPIRED_STARVING(rq) \
1455+ ((rq)->expired_timestamp && \
1456+ (jiffies - (rq)->expired_timestamp >= \
1457+ STARVATION_LIMIT * ((rq)->nr_running) + 1))
1458+
1459+/*
1460+ * This function gets called by the timer code, with HZ frequency.
1461+ * We call it with interrupts disabled.
1462+ */
1463+void scheduler_tick(int user_tick, int system)
1464+{
1465+ int cpu = smp_processor_id();
1466+ runqueue_t *rq = this_rq();
1467+ task_t *p = current;
1468+
1469+ if (p == rq->idle) {
1470+ if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
1471+ kstat.per_cpu_system[cpu] += system;
1472+#if CONFIG_SMP
1473+ idle_tick();
1474+#endif
1475+ return;
1476 }
1477+ if (TASK_NICE(p) > 0)
1478+ kstat.per_cpu_nice[cpu] += user_tick;
1479+ else
1480+ kstat.per_cpu_user[cpu] += user_tick;
1481+ kstat.per_cpu_system[cpu] += system;
1482
1483+ /* Task might have expired already, but not scheduled off yet */
1484+ if (p->array != rq->active) {
1485+ p->need_resched = 1;
1486+ return;
1487+ }
1488+ spin_lock(&rq->lock);
1489+ if (unlikely(rt_task(p))) {
1490+ /*
1491+ * RR tasks need a special form of timeslice management.
1492+ * FIFO tasks have no timeslices.
1493+ */
1494+ if ((p->policy == SCHED_RR) && !--p->time_slice) {
1495+ p->time_slice = TASK_TIMESLICE(p);
1496+ p->need_resched = 1;
1497+
1498+ /* put it at the end of the queue: */
1499+ dequeue_task(p, rq->active);
1500+ enqueue_task(p, rq->active);
1501+ }
1502+ goto out;
1503+ }
1504 /*
1505- * from this point on nothing can prevent us from
1506- * switching to the next task, save this fact in
1507- * sched_data.
1508- */
1509- sched_data->curr = next;
1510- task_set_cpu(next, this_cpu);
1511- spin_unlock_irq(&runqueue_lock);
1512-
1513- if (unlikely(prev == next)) {
1514- /* We won't go through the normal tail, so do this by hand */
1515- prev->policy &= ~SCHED_YIELD;
1516- goto same_process;
1517+ * The task was running during this tick - update the
1518+ * time slice counter and the sleep average. Note: we
1519+ * do not update a process's priority until it either
1520+ * goes to sleep or uses up its timeslice. This makes
1521+ * it possible for interactive tasks to use up their
1522+ * timeslices at their highest priority levels.
1523+ */
1524+ if (p->sleep_avg)
1525+ p->sleep_avg--;
1526+ if (!--p->time_slice) {
1527+ dequeue_task(p, rq->active);
1528+ p->need_resched = 1;
1529+ p->prio = effective_prio(p);
1530+ p->time_slice = TASK_TIMESLICE(p);
1531+
1532+ if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1533+ if (!rq->expired_timestamp)
1534+ rq->expired_timestamp = jiffies;
1535+ enqueue_task(p, rq->expired);
1536+ } else
1537+ enqueue_task(p, rq->active);
1538 }
1539+out:
1540+#if CONFIG_SMP
1541+ if (!(jiffies % BUSY_REBALANCE_TICK))
1542+ load_balance(rq, 0);
1543+#endif
1544+ spin_unlock(&rq->lock);
1545+}
1546
1547-#ifdef CONFIG_SMP
1548- /*
1549- * maintain the per-process 'last schedule' value.
1550- * (this has to be recalculated even if we reschedule to
1551- * the same process) Currently this is only used on SMP,
1552- * and it's approximate, so we do not have to maintain
1553- * it while holding the runqueue spinlock.
1554- */
1555- sched_data->last_schedule = get_cycles();
1556+void scheduling_functions_start_here(void) { }
1557
1558- /*
1559- * We drop the scheduler lock early (it's a global spinlock),
1560- * thus we have to lock the previous process from getting
1561- * rescheduled during switch_to().
1562- */
1563+/*
1564+ * 'schedule()' is the main scheduler function.
1565+ */
1566+asmlinkage void schedule(void)
1567+{
1568+ task_t *prev = current, *next;
1569+ runqueue_t *rq = this_rq();
1570+ prio_array_t *array;
1571+ list_t *queue;
1572+ int idx;
1573
1574-#endif /* CONFIG_SMP */
1575+ if (unlikely(in_interrupt()))
1576+ BUG();
1577+ release_kernel_lock(prev, smp_processor_id());
1578+ prev->sleep_timestamp = jiffies;
1579+ spin_lock_irq(&rq->lock);
1580
1581- kstat.context_swtch++;
1582- /*
1583- * there are 3 processes which are affected by a context switch:
1584- *
1585- * prev == .... ==> (last => next)
1586- *
1587- * It's the 'much more previous' 'prev' that is on next's stack,
1588- * but prev is set to (the just run) 'last' process by switch_to().
1589- * This might sound slightly confusing but makes tons of sense.
1590- */
1591- prepare_to_switch();
1592- {
1593- struct mm_struct *mm = next->mm;
1594- struct mm_struct *oldmm = prev->active_mm;
1595- if (!mm) {
1596- if (next->active_mm) BUG();
1597- next->active_mm = oldmm;
1598- atomic_inc(&oldmm->mm_count);
1599- enter_lazy_tlb(oldmm, next, this_cpu);
1600- } else {
1601- if (next->active_mm != mm) BUG();
1602- switch_mm(oldmm, mm, next, this_cpu);
1603+ switch (prev->state) {
1604+ case TASK_INTERRUPTIBLE:
1605+ if (unlikely(signal_pending(prev))) {
1606+ prev->state = TASK_RUNNING;
1607+ break;
1608 }
1609+ default:
1610+ deactivate_task(prev, rq);
1611+ case TASK_RUNNING:
1612+ ;
1613+ }
1614+#if CONFIG_SMP
1615+pick_next_task:
1616+#endif
1617+ if (unlikely(!rq->nr_running)) {
1618+#if CONFIG_SMP
1619+ load_balance(rq, 1);
1620+ if (rq->nr_running)
1621+ goto pick_next_task;
1622+#endif
1623+ next = rq->idle;
1624+ rq->expired_timestamp = 0;
1625+ goto switch_tasks;
1626+ }
1627
1628- if (!prev->mm) {
1629- prev->active_mm = NULL;
1630- mmdrop(oldmm);
1631- }
1632+ array = rq->active;
1633+ if (unlikely(!array->nr_active)) {
1634+ /*
1635+ * Switch the active and expired arrays.
1636+ */
1637+ rq->active = rq->expired;
1638+ rq->expired = array;
1639+ array = rq->active;
1640+ rq->expired_timestamp = 0;
1641 }
1642
1643- /*
1644- * This just switches the register state and the
1645- * stack.
1646- */
1647- switch_to(prev, next, prev);
1648- __schedule_tail(prev);
1649+ idx = sched_find_first_bit(array->bitmap);
1650+ queue = array->queue + idx;
1651+ next = list_entry(queue->next, task_t, run_list);
1652+
1653+switch_tasks:
1654+ prefetch(next);
1655+ prev->need_resched = 0;
1656+
1657+ if (likely(prev != next)) {
1658+ rq->nr_switches++;
1659+ rq->curr = next;
1660+ context_switch(prev, next);
1661+ /*
1662+ * The runqueue pointer might be from another CPU
1663+ * if the new task was last running on a different
1664+ * CPU - thus re-load it.
1665+ */
1666+ barrier();
1667+ rq = this_rq();
1668+ }
1669+ spin_unlock_irq(&rq->lock);
1670
1671-same_process:
1672 reacquire_kernel_lock(current);
1673- if (current->need_resched)
1674- goto need_resched_back;
1675 return;
1676 }
1677
1678 /*
1679- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything
1680- * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
1681- * non-exclusive tasks and one exclusive task.
1682+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
1683+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
1684+ * number) then we wake all the non-exclusive tasks and one exclusive task.
1685 *
1686 * There are circumstances in which we can try to wake a task which has already
1687- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
1688- * in this (rare) case, and we handle it by contonuing to scan the queue.
1689+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
1690+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
1691 */
1692 static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
1693 int nr_exclusive, const int sync)
1694 {
1695 struct list_head *tmp;
1696- struct task_struct *p;
1697+ task_t *p;
1698
1699- CHECK_MAGIC_WQHEAD(q);
1700- WQ_CHECK_LIST_HEAD(&q->task_list);
1701-
1702 list_for_each(tmp,&q->task_list) {
1703 unsigned int state;
1704- wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1705+ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
1706
1707- CHECK_MAGIC(curr->__magic);
1708 p = curr->task;
1709 state = p->state;
1710- if (state & mode) {
1711- WQ_NOTE_WAKER(curr);
1712- if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
1713- break;
1714- }
1715+ if ((state & mode) &&
1716+ try_to_wake_up(p, sync) &&
1717+ ((curr->flags & WQ_FLAG_EXCLUSIVE) &&
1718+ !--nr_exclusive))
1719+ break;
1720 }
1721 }
1722
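
The heart of the rewritten schedule() above is the two-step pick: sched_find_first_bit() on the active array's bitmap yields the lowest-numbered (highest-priority) non-empty queue, and the next task is simply the head of that list, so selection cost does not grow with the number of runnable tasks. sched_find_first_bit() itself comes from per-arch bitops changes elsewhere in the patch; a portable stand-in for the idea (illustrative only):

#include <stdio.h>
#include <string.h>

#define MAX_PRIO        140
#define BITS_PER_LONG   (8 * sizeof(unsigned long))
#define BITMAP_WORDS    ((MAX_PRIO + BITS_PER_LONG - 1) / BITS_PER_LONG)

static void set_prio_bit(int nr, unsigned long *map)
{
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* Portable stand-in for sched_find_first_bit(); the real helper is an
 * arch-optimized bitops routine added by other parts of this patch. */
static int find_first_set(const unsigned long *map, int bits)
{
        int i;

        for (i = 0; i < bits; i++)
                if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
                        return i;
        return bits;
}

int main(void)
{
        unsigned long bitmap[BITMAP_WORDS];

        memset(bitmap, 0, sizeof(bitmap));

        /* what enqueue_task() does via __set_bit(p->prio, array->bitmap):
         * pretend runnable tasks are queued at priorities 134 and 121 */
        set_prio_bit(134, bitmap);
        set_prio_bit(121, bitmap);

        /* schedule() takes the head of the lowest-numbered non-empty queue -
         * here priority 121 - without scanning the tasks themselves */
        printf("next runnable priority: %d\n", find_first_set(bitmap, MAX_PRIO));
        return 0;
}
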
1723@@ -850,8 +959,71 @@
1724 return timeout;
1725 }
1726
1727+/*
1728+ * Change the current task's CPU affinity. Migrate the process to a
1729+ * proper CPU and schedule away if the current CPU is removed from
1730+ * the allowed bitmask.
1731+ */
1732+void set_cpus_allowed(task_t *p, unsigned long new_mask)
1733+{
1734+ new_mask &= cpu_online_map;
1735+ if (!new_mask)
1736+ BUG();
1737+ if (p != current)
1738+ BUG();
1739+
1740+ p->cpus_allowed = new_mask;
1741+ /*
1742+ * Can the task run on the current CPU? If not then
1743+ * migrate the process off to a proper CPU.
1744+ */
1745+ if (new_mask & (1UL << smp_processor_id()))
1746+ return;
1747+#if CONFIG_SMP
1748+ current->state = TASK_UNINTERRUPTIBLE;
1749+ smp_migrate_task(__ffs(new_mask), current);
1750+
1751+ schedule();
1752+#endif
1753+}
1754+
1755 void scheduling_functions_end_here(void) { }
1756
1757+void set_user_nice(task_t *p, long nice)
1758+{
1759+ unsigned long flags;
1760+ prio_array_t *array;
1761+ runqueue_t *rq;
1762+
1763+ if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1764+ return;
1765+ /*
1766+ * We have to be careful, if called from sys_setpriority(),
1767+ * the task might be in the middle of scheduling on another CPU.
1768+ */
1769+ rq = lock_task_rq(p, &flags);
1770+ if (rt_task(p)) {
1771+ p->static_prio = NICE_TO_PRIO(nice);
1772+ goto out_unlock;
1773+ }
1774+ array = p->array;
1775+ if (array)
1776+ dequeue_task(p, array);
1777+ p->static_prio = NICE_TO_PRIO(nice);
1778+ p->prio = NICE_TO_PRIO(nice);
1779+ if (array) {
1780+ enqueue_task(p, array);
1781+ /*
1782+ * If the task is running and lowered its priority,
1783+ * or increased its priority then reschedule its CPU:
1784+ */
1785+ if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr))
1786+ resched_task(rq->curr);
1787+ }
1788+out_unlock:
1789+ unlock_task_rq(rq, &flags);
1790+}
1791+
1792 #ifndef __alpha__
1793
1794 /*
1795@@ -862,7 +1034,7 @@
1796
1797 asmlinkage long sys_nice(int increment)
1798 {
1799- long newprio;
1800+ long nice;
1801
1802 /*
1803 * Setpriority might change our priority at the same moment.
1804@@ -878,32 +1050,46 @@
1805 if (increment > 40)
1806 increment = 40;
1807
1808- newprio = current->nice + increment;
1809- if (newprio < -20)
1810- newprio = -20;
1811- if (newprio > 19)
1812- newprio = 19;
1813- current->nice = newprio;
1814+ nice = PRIO_TO_NICE(current->static_prio) + increment;
1815+ if (nice < -20)
1816+ nice = -20;
1817+ if (nice > 19)
1818+ nice = 19;
1819+ set_user_nice(current, nice);
1820 return 0;
1821 }
1822
1823 #endif
1824
1825-static inline struct task_struct *find_process_by_pid(pid_t pid)
1826+/*
1827+ * This is the priority value as seen by users in /proc
1828+ *
1829+ * RT tasks are offset by -200. Normal tasks are centered
1830+ * around 0, value goes from -16 to +15.
1831+ */
1832+int task_prio(task_t *p)
1833 {
1834- struct task_struct *tsk = current;
1835+ return p->prio - 100;
1836+}
1837
1838- if (pid)
1839- tsk = find_task_by_pid(pid);
1840- return tsk;
1841+int task_nice(task_t *p)
1842+{
1843+ return TASK_NICE(p);
1844+}
1845+
1846+static inline task_t *find_process_by_pid(pid_t pid)
1847+{
1848+ return pid ? find_task_by_pid(pid) : current;
1849 }
1850
1851-static int setscheduler(pid_t pid, int policy,
1852- struct sched_param *param)
1853+static int setscheduler(pid_t pid, int policy, struct sched_param *param)
1854 {
1855 struct sched_param lp;
1856- struct task_struct *p;
1857+ prio_array_t *array;
1858+ unsigned long flags;
1859+ runqueue_t *rq;
1860 int retval;
1861+ task_t *p;
1862
1863 retval = -EINVAL;
1864 if (!param || pid < 0)
1865@@ -917,14 +1103,19 @@
1866 * We play safe to avoid deadlocks.
1867 */
1868 read_lock_irq(&tasklist_lock);
1869- spin_lock(&runqueue_lock);
1870
1871 p = find_process_by_pid(pid);
1872
1873 retval = -ESRCH;
1874 if (!p)
1875- goto out_unlock;
1876-
1877+ goto out_unlock_tasklist;
1878+
1879+ /*
1880+ * To be able to change p->policy safely, the appropriate
1881+ * runqueue lock must be held.
1882+ */
1883+ rq = lock_task_rq(p, &flags);
1884+
1885 if (policy < 0)
1886 policy = p->policy;
1887 else {
1888@@ -945,30 +1136,36 @@
1889 goto out_unlock;
1890
1891 retval = -EPERM;
1892- if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1893+ if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1894 !capable(CAP_SYS_NICE))
1895 goto out_unlock;
1896 if ((current->euid != p->euid) && (current->euid != p->uid) &&
1897 !capable(CAP_SYS_NICE))
1898 goto out_unlock;
1899
1900+ array = p->array;
1901+ if (array)
1902+ deactivate_task(p, task_rq(p));
1903 retval = 0;
1904 p->policy = policy;
1905 p->rt_priority = lp.sched_priority;
1906- if (task_on_runqueue(p))
1907- move_first_runqueue(p);
1908-
1909- current->need_resched = 1;
1910+ if (rt_task(p))
1911+ p->prio = 99 - p->rt_priority;
1912+ else
1913+ p->prio = p->static_prio;
1914+ if (array)
1915+ activate_task(p, task_rq(p));
1916
1917 out_unlock:
1918- spin_unlock(&runqueue_lock);
1919+ unlock_task_rq(rq, &flags);
1920+out_unlock_tasklist:
1921 read_unlock_irq(&tasklist_lock);
1922
1923 out_nounlock:
1924 return retval;
1925 }
1926
1927-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1928+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1929 struct sched_param *param)
1930 {
1931 return setscheduler(pid, policy, param);
1932@@ -981,7 +1178,7 @@
1933
1934 asmlinkage long sys_sched_getscheduler(pid_t pid)
1935 {
1936- struct task_struct *p;
1937+ task_t *p;
1938 int retval;
1939
1940 retval = -EINVAL;
1941@@ -992,7 +1189,7 @@
1942 read_lock(&tasklist_lock);
1943 p = find_process_by_pid(pid);
1944 if (p)
1945- retval = p->policy & ~SCHED_YIELD;
1946+ retval = p->policy;
1947 read_unlock(&tasklist_lock);
1948
1949 out_nounlock:
1950@@ -1001,7 +1198,7 @@
1951
1952 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
1953 {
1954- struct task_struct *p;
1955+ task_t *p;
1956 struct sched_param lp;
1957 int retval;
1958
1959@@ -1032,42 +1229,64 @@
1960
1961 asmlinkage long sys_sched_yield(void)
1962 {
1963+ task_t *prev = current, *next;
1964+ runqueue_t *rq = this_rq();
1965+ prio_array_t *array;
1966+ list_t *queue;
1967+
1968+ if (unlikely(prev->state != TASK_RUNNING)) {
1969+ schedule();
1970+ return 0;
1971+ }
1972+ release_kernel_lock(prev, smp_processor_id());
1973+ prev->sleep_timestamp = jiffies;
1974 /*
1975- * Trick. sched_yield() first counts the number of truly
1976- * 'pending' runnable processes, then returns if it's
1977- * only the current processes. (This test does not have
1978- * to be atomic.) In threaded applications this optimization
1979- * gets triggered quite often.
1980+ * Decrease the yielding task's priority by one, to avoid
1981+ * livelocks. This priority loss is temporary; it's recovered
1982+ * once the current timeslice expires.
1983+ *
1984+ * If the priority is already MAX_PRIO-1 then we still
1985+ * round-robin the task within the runlist.
1986 */
1987+ spin_lock_irq(&rq->lock);
1988+ array = current->array;
1989+ /*
1990+ * If the task has reached maximum priority (or is a RT task)
1991+ * then just requeue the task to the end of the runqueue:
1992+ */
1993+ if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) {
1994+ list_del(&current->run_list);
1995+ list_add_tail(&current->run_list, array->queue + current->prio);
1996+ } else {
1997+ list_del(&current->run_list);
1998+ if (list_empty(array->queue + current->prio))
1999+ __clear_bit(current->prio, array->bitmap);
2000+ current->prio++;
2001+ list_add_tail(&current->run_list, array->queue + current->prio);
2002+ __set_bit(current->prio, array->bitmap);
2003+ }
2004+ /*
2005+ * Context-switch manually. This is equivalent to
2006+ * calling schedule(), but faster, because yield()
2007+ * knows lots of things that can be optimized away
2008+ * from the generic scheduler path:
2009+ */
2010+ queue = array->queue + sched_find_first_bit(array->bitmap);
2011+ next = list_entry(queue->next, task_t, run_list);
2012+ prefetch(next);
2013
2014- int nr_pending = nr_running;
2015-
2016-#if CONFIG_SMP
2017- int i;
2018-
2019- // Subtract non-idle processes running on other CPUs.
2020- for (i = 0; i < smp_num_cpus; i++) {
2021- int cpu = cpu_logical_map(i);
2022- if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
2023- nr_pending--;
2024+ prev->need_resched = 0;
2025+ if (likely(prev != next)) {
2026+ rq->nr_switches++;
2027+ rq->curr = next;
2028+ context_switch(prev, next);
2029+ barrier();
2030+ rq = this_rq();
2031 }
2032-#else
2033- // on UP this process is on the runqueue as well
2034- nr_pending--;
2035-#endif
2036- if (nr_pending) {
2037- /*
2038- * This process can only be rescheduled by us,
2039- * so this is safe without any locking.
2040- */
2041- if (current->policy == SCHED_OTHER)
2042- current->policy |= SCHED_YIELD;
2043- current->need_resched = 1;
2044+ spin_unlock_irq(&rq->lock);
2045+
2046+ reacquire_kernel_lock(current);
2047
2048- spin_lock_irq(&runqueue_lock);
2049- move_last_runqueue(current);
2050- spin_unlock_irq(&runqueue_lock);
2051- }
2052 return 0;
2053 }
2054
2055@@ -1105,7 +1324,7 @@
2056 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
2057 {
2058 struct timespec t;
2059- struct task_struct *p;
2060+ task_t *p;
2061 int retval = -EINVAL;
2062
2063 if (pid < 0)
2064@@ -1115,8 +1334,8 @@
2065 read_lock(&tasklist_lock);
2066 p = find_process_by_pid(pid);
2067 if (p)
2068- jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
2069- &t);
2070+ jiffies_to_timespec(p->policy & SCHED_FIFO ?
2071+ 0 : TASK_TIMESLICE(p), &t);
2072 read_unlock(&tasklist_lock);
2073 if (p)
2074 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2075@@ -1124,14 +1343,14 @@
2076 return retval;
2077 }
2078
2079-static void show_task(struct task_struct * p)
2080+static void show_task(task_t * p)
2081 {
2082 unsigned long free = 0;
2083 int state;
2084 static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
2085
2086 printk("%-13.13s ", p->comm);
2087- state = p->state ? ffz(~p->state) + 1 : 0;
2088+ state = p->state ? __ffs(p->state) + 1 : 0;
2089 if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
2090 printk(stat_nam[state]);
2091 else
2092@@ -1172,7 +1391,7 @@
2093 printk(" (NOTLB)\n");
2094
2095 {
2096- extern void show_trace_task(struct task_struct *tsk);
2097+ extern void show_trace_task(task_t *tsk);
2098 show_trace_task(p);
2099 }
2100 }
2101@@ -1194,7 +1413,7 @@
2102
2103 void show_state(void)
2104 {
2105- struct task_struct *p;
2106+ task_t *p;
2107
2108 #if (BITS_PER_LONG == 32)
2109 printk("\n"
2110@@ -1217,121 +1436,88 @@
2111 read_unlock(&tasklist_lock);
2112 }
2113
2114-/**
2115- * reparent_to_init() - Reparent the calling kernel thread to the init task.
2116- *
2117- * If a kernel thread is launched as a result of a system call, or if
2118- * it ever exits, it should generally reparent itself to init so that
2119- * it is correctly cleaned up on exit.
2120- *
2121- * The various task state such as scheduling policy and priority may have
2122- * been inherited fro a user process, so we reset them to sane values here.
2123- *
2124- * NOTE that reparent_to_init() gives the caller full capabilities.
2125- */
2126-void reparent_to_init(void)
2127+static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
2128 {
2129- struct task_struct *this_task = current;
2130-
2131- write_lock_irq(&tasklist_lock);
2132-
2133- /* Reparent to init */
2134- REMOVE_LINKS(this_task);
2135- this_task->p_pptr = child_reaper;
2136- this_task->p_opptr = child_reaper;
2137- SET_LINKS(this_task);
2138-
2139- /* Set the exit signal to SIGCHLD so we signal init on exit */
2140- this_task->exit_signal = SIGCHLD;
2141-
2142- /* We also take the runqueue_lock while altering task fields
2143- * which affect scheduling decisions */
2144- spin_lock(&runqueue_lock);
2145-
2146- this_task->ptrace = 0;
2147- this_task->nice = DEF_NICE;
2148- this_task->policy = SCHED_OTHER;
2149- /* cpus_allowed? */
2150- /* rt_priority? */
2151- /* signals? */
2152- this_task->cap_effective = CAP_INIT_EFF_SET;
2153- this_task->cap_inheritable = CAP_INIT_INH_SET;
2154- this_task->cap_permitted = CAP_FULL_SET;
2155- this_task->keep_capabilities = 0;
2156- memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
2157- this_task->user = INIT_USER;
2158-
2159- spin_unlock(&runqueue_lock);
2160- write_unlock_irq(&tasklist_lock);
2161+ if (rq1 == rq2)
2162+ spin_lock(&rq1->lock);
2163+ else {
2164+ if (rq1 < rq2) {
2165+ spin_lock(&rq1->lock);
2166+ spin_lock(&rq2->lock);
2167+ } else {
2168+ spin_lock(&rq2->lock);
2169+ spin_lock(&rq1->lock);
2170+ }
2171+ }
2172 }
2173
2174-/*
2175- * Put all the gunge required to become a kernel thread without
2176- * attached user resources in one place where it belongs.
2177- */
2178-
2179-void daemonize(void)
2180+static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
2181 {
2182- struct fs_struct *fs;
2183-
2184-
2185- /*
2186- * If we were started as result of loading a module, close all of the
2187- * user space pages. We don't need them, and if we didn't close them
2188- * they would be locked into memory.
2189- */
2190- exit_mm(current);
2191-
2192- current->session = 1;
2193- current->pgrp = 1;
2194- current->tty = NULL;
2195-
2196- /* Become as one with the init task */
2197-
2198- exit_fs(current); /* current->fs->count--; */
2199- fs = init_task.fs;
2200- current->fs = fs;
2201- atomic_inc(&fs->count);
2202- exit_files(current);
2203- current->files = init_task.files;
2204- atomic_inc(&current->files->count);
2205+ spin_unlock(&rq1->lock);
2206+ if (rq1 != rq2)
2207+ spin_unlock(&rq2->lock);
2208 }
2209
2210-extern unsigned long wait_init_idle;
2211-
2212-void __init init_idle(void)
2213+void __init init_idle(task_t *idle, int cpu)
2214 {
2215- struct schedule_data * sched_data;
2216- sched_data = &aligned_data[smp_processor_id()].schedule_data;
2217+ runqueue_t *idle_rq = cpu_rq(cpu), *rq = idle->array->rq;
2218+ unsigned long flags;
2219
2220- if (current != &init_task && task_on_runqueue(current)) {
2221- printk("UGH! (%d:%d) was on the runqueue, removing.\n",
2222- smp_processor_id(), current->pid);
2223- del_from_runqueue(current);
2224- }
2225- sched_data->curr = current;
2226- sched_data->last_schedule = get_cycles();
2227- clear_bit(current->processor, &wait_init_idle);
2228+ __save_flags(flags);
2229+ __cli();
2230+ double_rq_lock(idle_rq, rq);
2231+
2232+ idle_rq->curr = idle_rq->idle = idle;
2233+ deactivate_task(idle, rq);
2234+ idle->array = NULL;
2235+ idle->prio = MAX_PRIO;
2236+ idle->state = TASK_RUNNING;
2237+ idle->cpu = cpu;
2238+ double_rq_unlock(idle_rq, rq);
2239+ idle->need_resched = 1;
2240+ __restore_flags(flags);
2241 }
2242
2243-extern void init_timervecs (void);
2244+extern void init_timervecs(void);
2245+extern void timer_bh(void);
2246+extern void tqueue_bh(void);
2247+extern void immediate_bh(void);
2248
2249 void __init sched_init(void)
2250 {
2251+ runqueue_t *rq;
2252+ int i, j, k;
2253+
2254+ for (i = 0; i < NR_CPUS; i++) {
2255+ runqueue_t *rq = cpu_rq(i);
2256+ prio_array_t *array;
2257+
2258+ rq->active = rq->arrays + 0;
2259+ rq->expired = rq->arrays + 1;
2260+ spin_lock_init(&rq->lock);
2261+
2262+ for (j = 0; j < 2; j++) {
2263+ array = rq->arrays + j;
2264+ array->rq = rq;
2265+ array->lock = &rq->lock;
2266+ for (k = 0; k < MAX_PRIO; k++) {
2267+ INIT_LIST_HEAD(array->queue + k);
2268+ __clear_bit(k, array->bitmap);
2269+ }
2270+ // delimiter for bitsearch
2271+ __set_bit(MAX_PRIO, array->bitmap);
2272+ }
2273+ }
2274 /*
2275 * We have to do a little magic to get the first
2276 * process right in SMP mode.
2277 */
2278- int cpu = smp_processor_id();
2279- int nr;
2280-
2281- init_task.processor = cpu;
2282-
2283- for(nr = 0; nr < PIDHASH_SZ; nr++)
2284- pidhash[nr] = NULL;
2285+ rq = this_rq();
2286+ rq->curr = current;
2287+ rq->idle = current;
2288+ wake_up_process(current);
2289
2290 init_timervecs();
2291-
2292 init_bh(TIMER_BH, timer_bh);
2293 init_bh(TQUEUE_BH, tqueue_bh);
2294 init_bh(IMMEDIATE_BH, immediate_bh);
2295@@ -1340,5 +1526,5 @@
2296 * The boot idle thread does lazy MMU switching as well:
2297 */
2298 atomic_inc(&init_mm.mm_count);
2299- enter_lazy_tlb(&init_mm, current, cpu);
2300+ enter_lazy_tlb(&init_mm, current, smp_processor_id());
2301 }
2302--- linux/kernel/exit.c.orig Tue Feb 5 13:51:53 2002
2303+++ linux/kernel/exit.c Tue Feb 5 13:52:12 2002
2304@@ -27,49 +27,22 @@
2305
2306 static void release_task(struct task_struct * p)
2307 {
2308- if (p != current) {
2309+ if (p == current)
2310+ BUG();
2311 #ifdef CONFIG_SMP
2312- /*
2313- * Wait to make sure the process isn't on the
2314- * runqueue (active on some other CPU still)
2315- */
2316- for (;;) {
2317- task_lock(p);
2318- if (!task_has_cpu(p))
2319- break;
2320- task_unlock(p);
2321- do {
2322- cpu_relax();
2323- barrier();
2324- } while (task_has_cpu(p));
2325- }
2326- task_unlock(p);
2327+ wait_task_inactive(p);
2328 #endif
2329- atomic_dec(&p->user->processes);
2330- free_uid(p->user);
2331- unhash_process(p);
2332-
2333- release_thread(p);
2334- current->cmin_flt += p->min_flt + p->cmin_flt;
2335- current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2336- current->cnswap += p->nswap + p->cnswap;
2337- /*
2338- * Potentially available timeslices are retrieved
2339- * here - this way the parent does not get penalized
2340- * for creating too many processes.
2341- *
2342- * (this cannot be used to artificially 'generate'
2343- * timeslices, because any timeslice recovered here
2344- * was given away by the parent in the first place.)
2345- */
2346- current->counter += p->counter;
2347- if (current->counter >= MAX_COUNTER)
2348- current->counter = MAX_COUNTER;
2349- p->pid = 0;
2350- free_task_struct(p);
2351- } else {
2352- printk("task releasing itself\n");
2353- }
2354+ atomic_dec(&p->user->processes);
2355+ free_uid(p->user);
2356+ unhash_process(p);
2357+
2358+ release_thread(p);
2359+ current->cmin_flt += p->min_flt + p->cmin_flt;
2360+ current->cmaj_flt += p->maj_flt + p->cmaj_flt;
2361+ current->cnswap += p->nswap + p->cnswap;
2362+ sched_exit(p);
2363+ p->pid = 0;
2364+ free_task_struct(p);
2365 }
2366
2367 /*
2368@@ -147,6 +120,79 @@
2369 }
2370 read_unlock(&tasklist_lock);
2371 return retval;
2372+}
2373+
2374+/**
2375+ * reparent_to_init() - Reparent the calling kernel thread to the init task.
2376+ *
2377+ * If a kernel thread is launched as a result of a system call, or if
2378+ * it ever exits, it should generally reparent itself to init so that
2379+ * it is correctly cleaned up on exit.
2380+ *
2381+ * The various task settings such as scheduling policy and priority may have
2382+ * been inherited from a user process, so we reset them to sane values here.
2383+ *
2384+ * NOTE that reparent_to_init() gives the caller full capabilities.
2385+ */
2386+void reparent_to_init(void)
2387+{
2388+ write_lock_irq(&tasklist_lock);
2389+
2390+ /* Reparent to init */
2391+ REMOVE_LINKS(current);
2392+ current->p_pptr = child_reaper;
2393+ current->p_opptr = child_reaper;
2394+ SET_LINKS(current);
2395+
2396+ /* Set the exit signal to SIGCHLD so we signal init on exit */
2397+ current->exit_signal = SIGCHLD;
2398+
2399+ current->ptrace = 0;
2400+ if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
2401+ set_user_nice(current, 0);
2402+ /* cpus_allowed? */
2403+ /* rt_priority? */
2404+ /* signals? */
2405+ current->cap_effective = CAP_INIT_EFF_SET;
2406+ current->cap_inheritable = CAP_INIT_INH_SET;
2407+ current->cap_permitted = CAP_FULL_SET;
2408+ current->keep_capabilities = 0;
2409+ memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
2410+ current->user = INIT_USER;
2411+
2412+ write_unlock_irq(&tasklist_lock);
2413+}
2414+
2415+/*
2416+ * Put all the gunge required to become a kernel thread without
2417+ * attached user resources in one place where it belongs.
2418+ */
2419+
2420+void daemonize(void)
2421+{
2422+ struct fs_struct *fs;
2423+
2424+
2425+ /*
2426+ * If we were started as a result of loading a module, close all of the
2427+ * user space pages. We don't need them, and if we didn't close them
2428+ * they would be locked into memory.
2429+ */
2430+ exit_mm(current);
2431+
2432+ current->session = 1;
2433+ current->pgrp = 1;
2434+ current->tty = NULL;
2435+
2436+ /* Become as one with the init task */
2437+
2438+ exit_fs(current); /* current->fs->count--; */
2439+ fs = init_task.fs;
2440+ current->fs = fs;
2441+ atomic_inc(&fs->count);
2442+ exit_files(current);
2443+ current->files = init_task.files;
2444+ atomic_inc(&current->files->count);
2445 }
2446
2447 /*
2448--- linux/kernel/capability.c.orig Sat Jun 24 06:06:37 2000
2449+++ linux/kernel/capability.c Tue Feb 5 13:52:12 2002
2450@@ -8,6 +8,8 @@
2451 #include <linux/mm.h>
2452 #include <asm/uaccess.h>
2453
2454+unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
2455+
2456 kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
2457
2458 /* Note: never hold tasklist_lock while spinning for this one */
2459--- linux/kernel/timer.c.orig Tue Feb 5 13:51:43 2002
2460+++ linux/kernel/timer.c Tue Feb 5 13:52:12 2002
2461@@ -25,6 +25,8 @@
2462
2463 #include <asm/uaccess.h>
2464
2465+struct kernel_stat kstat;
2466+
2467 /*
2468 * Timekeeping variables
2469 */
2470@@ -582,18 +584,7 @@
2471 int cpu = smp_processor_id(), system = user_tick ^ 1;
2472
2473 update_one_process(p, user_tick, system, cpu);
2474- if (p->pid) {
2475- if (--p->counter <= 0) {
2476- p->counter = 0;
2477- p->need_resched = 1;
2478- }
2479- if (p->nice > 0)
2480- kstat.per_cpu_nice[cpu] += user_tick;
2481- else
2482- kstat.per_cpu_user[cpu] += user_tick;
2483- kstat.per_cpu_system[cpu] += system;
2484- } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
2485- kstat.per_cpu_system[cpu] += system;
2486+ scheduler_tick(user_tick, system);
2487 }
2488
2489 /*
2490@@ -794,6 +785,89 @@
2491
2492 #endif
2493
2494+static void process_timeout(unsigned long __data)
2495+{
2496+ wake_up_process((task_t *)__data);
2497+}
2498+
2499+/**
2500+ * schedule_timeout - sleep until timeout
2501+ * @timeout: timeout value in jiffies
2502+ *
2503+ * Make the current task sleep until @timeout jiffies have
2504+ * elapsed. The routine will return immediately unless
2505+ * the current task state has been set (see set_current_state()).
2506+ *
2507+ * You can set the task state as follows -
2508+ *
2509+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
2510+ * pass before the routine returns. The routine will return 0
2511+ *
2512+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2513+ * delivered to the current task. In this case the remaining time
2514+ * in jiffies will be returned, or 0 if the timer expired in time
2515+ *
2516+ * The current task state is guaranteed to be TASK_RUNNING when this
2517+ * routine returns.
2518+ *
2519+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
2520+ * the CPU away without a bound on the timeout. In this case the return
2521+ * value will be %MAX_SCHEDULE_TIMEOUT.
2522+ *
2523+ * In all cases the return value is guaranteed to be non-negative.
2524+ */
2525+signed long schedule_timeout(signed long timeout)
2526+{
2527+ struct timer_list timer;
2528+ unsigned long expire;
2529+
2530+ switch (timeout)
2531+ {
2532+ case MAX_SCHEDULE_TIMEOUT:
2533+ /*
2534+ * These two special cases are useful for the caller's
2535+ * convenience, nothing more. We could take
2536+ * MAX_SCHEDULE_TIMEOUT from one of the negative values,
2537+ * but I'd like to return a valid offset (>=0) to allow
2538+ * the caller to do everything it wants with the retval.
2539+ */
2540+ schedule();
2541+ goto out;
2542+ default:
2543+ /*
2544+ * Another bit of PARANOIA. Note that the retval will be
2545+ * 0, since no piece of the kernel is supposed to check
2546+ * for a negative retval of schedule_timeout() (since it
2547+ * should never happen anyway). You just have the printk()
2548+ * that will tell you if something has gone wrong and where.
2549+ */
2550+ if (timeout < 0)
2551+ {
2552+ printk(KERN_ERR "schedule_timeout: wrong timeout "
2553+ "value %lx from %p\n", timeout,
2554+ __builtin_return_address(0));
2555+ current->state = TASK_RUNNING;
2556+ goto out;
2557+ }
2558+ }
2559+
2560+ expire = timeout + jiffies;
2561+
2562+ init_timer(&timer);
2563+ timer.expires = expire;
2564+ timer.data = (unsigned long) current;
2565+ timer.function = process_timeout;
2566+
2567+ add_timer(&timer);
2568+ schedule();
2569+ del_timer_sync(&timer);
2570+
2571+ timeout = expire - jiffies;
2572+
2573+ out:
2574+ return timeout < 0 ? 0 : timeout;
2575+}
2576+
2577 /* Thread ID - the internal kernel "pid" */
2578 asmlinkage long sys_gettid(void)
2579 {
2580@@ -840,4 +914,3 @@
2581 }
2582 return 0;
2583 }
2584-
2585--- linux/kernel/fork.c.orig Tue Feb 5 13:51:53 2002
2586+++ linux/kernel/fork.c Tue Feb 5 13:52:12 2002
2587@@ -28,7 +28,6 @@
2588
2589 /* The idle threads do not count.. */
2590 int nr_threads;
2591-int nr_running;
2592
2593 int max_threads;
2594 unsigned long total_forks; /* Handle normal Linux uptimes. */
2595@@ -36,6 +35,8 @@
2596
2597 struct task_struct *pidhash[PIDHASH_SZ];
2598
2599+rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
2600+
2601 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
2602 {
2603 unsigned long flags;
2604@@ -564,6 +565,7 @@
2605 struct pt_regs *regs, unsigned long stack_size)
2606 {
2607 int retval;
2608+ unsigned long flags;
2609 struct task_struct *p;
2610 struct completion vfork;
2611
2612@@ -619,8 +621,7 @@
2613 copy_flags(clone_flags, p);
2614 p->pid = get_pid(clone_flags);
2615
2616- p->run_list.next = NULL;
2617- p->run_list.prev = NULL;
2618+ INIT_LIST_HEAD(&p->run_list);
2619
2620 p->p_cptr = NULL;
2621 init_waitqueue_head(&p->wait_chldexit);
2622@@ -646,14 +647,15 @@
2623 #ifdef CONFIG_SMP
2624 {
2625 int i;
2626- p->cpus_runnable = ~0UL;
2627- p->processor = current->processor;
2628+
2629 /* ?? should we just memset this ?? */
2630 for(i = 0; i < smp_num_cpus; i++)
2631- p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
2632+ p->per_cpu_utime[cpu_logical_map(i)] =
2633+ p->per_cpu_stime[cpu_logical_map(i)] = 0;
2634 spin_lock_init(&p->sigmask_lock);
2635 }
2636 #endif
2637+ p->array = NULL;
2638 p->lock_depth = -1; /* -1 = no lock */
2639 p->start_time = jiffies;
2640
2641@@ -685,15 +687,27 @@
2642 p->pdeath_signal = 0;
2643
2644 /*
2645- * "share" dynamic priority between parent and child, thus the
2646- * total amount of dynamic priorities in the system doesnt change,
2647- * more scheduling fairness. This is only important in the first
2648- * timeslice, on the long run the scheduling behaviour is unchanged.
2649- */
2650- p->counter = (current->counter + 1) >> 1;
2651- current->counter >>= 1;
2652- if (!current->counter)
2653- current->need_resched = 1;
2654+ * Share the timeslice between parent and child, thus the
2655+ * total amount of pending timeslices in the system doesn't change,
2656+ * resulting in more scheduling fairness.
2657+ */
2658+ __save_flags(flags);
2659+ __cli();
2660+ if (!current->time_slice)
2661+ BUG();
2662+ p->time_slice = (current->time_slice + 1) >> 1;
2663+ current->time_slice >>= 1;
2664+ if (!current->time_slice) {
2665+ /*
2666+ * This case is rare: it happens when the parent has only
2667+ * a single jiffy left from its timeslice. Taking the
2668+ * runqueue lock is not a problem.
2669+ */
2670+ current->time_slice = 1;
2671+ scheduler_tick(0,0);
2672+ }
2673+ p->sleep_timestamp = jiffies;
2674+ __restore_flags(flags);
2675
2676 /*
2677 * Ok, add it to the run-queues and make it
2678@@ -730,10 +744,23 @@
2679 if (p->ptrace & PT_PTRACED)
2680 send_sig(SIGSTOP, p, 1);
2681
2682+#define RUN_CHILD_FIRST 1
2683+#if RUN_CHILD_FIRST
2684+ wake_up_forked_process(p); /* do this last */
2685+#else
2686 wake_up_process(p); /* do this last */
2687+#endif
2688 ++total_forks;
2689 if (clone_flags & CLONE_VFORK)
2690 wait_for_completion(&vfork);
2691+#if RUN_CHILD_FIRST
2692+ else
2693+ /*
2694+ * Let the child process run first, to avoid most of the
2695+ * COW overhead when the child exec()s afterwards.
2696+ */
2697+ current->need_resched = 1;
2698+#endif
2699
2700 fork_out:
2701 return retval;
2702--- linux/kernel/softirq.c.orig Tue Feb 5 13:51:47 2002
2703+++ linux/kernel/softirq.c Tue Feb 5 13:52:12 2002
2704@@ -259,10 +259,9 @@
2705
2706 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
2707 current->state = TASK_RUNNING;
2708- do {
2709- current->policy |= SCHED_YIELD;
2710- schedule();
2711- } while (test_bit(TASKLET_STATE_SCHED, &t->state));
2712+ do
2713+ sys_sched_yield();
2714+ while (test_bit(TASKLET_STATE_SCHED, &t->state));
2715 }
2716 tasklet_unlock_wait(t);
2717 clear_bit(TASKLET_STATE_SCHED, &t->state);
2718@@ -365,13 +364,13 @@
2719 int cpu = cpu_logical_map(bind_cpu);
2720
2721 daemonize();
2722- current->nice = 19;
2723+ set_user_nice(current, 19);
2724 sigfillset(&current->blocked);
2725
2726 /* Migrate to the right CPU */
2727- current->cpus_allowed = 1UL << cpu;
2728- while (smp_processor_id() != cpu)
2729- schedule();
2730+ set_cpus_allowed(current, 1UL << cpu);
2731+ if (cpu() != cpu)
2732+ BUG();
2733
2734 sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
2735
2736@@ -396,7 +395,7 @@
2737 }
2738 }
2739
2740-static __init int spawn_ksoftirqd(void)
2741+__init int spawn_ksoftirqd(void)
2742 {
2743 int cpu;
2744
2745@@ -405,14 +404,12 @@
2746 CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
2747 printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
2748 else {
2749- while (!ksoftirqd_task(cpu_logical_map(cpu))) {
2750- current->policy |= SCHED_YIELD;
2751- schedule();
2752- }
2753+ while (!ksoftirqd_task(cpu_logical_map(cpu)))
2754+ sys_sched_yield();
2755 }
2756 }
2757
2758 return 0;
2759 }
2760
2761-__initcall(spawn_ksoftirqd);
2762+__initcall(spawn_ksoftirqd);
2763--- linux/kernel/ptrace.c.orig Tue Feb 5 13:51:53 2002
2764+++ linux/kernel/ptrace.c Tue Feb 5 13:52:12 2002
2765@@ -31,20 +31,7 @@
2766 if (child->state != TASK_STOPPED)
2767 return -ESRCH;
2768 #ifdef CONFIG_SMP
2769- /* Make sure the child gets off its CPU.. */
2770- for (;;) {
2771- task_lock(child);
2772- if (!task_has_cpu(child))
2773- break;
2774- task_unlock(child);
2775- do {
2776- if (child->state != TASK_STOPPED)
2777- return -ESRCH;
2778- barrier();
2779- cpu_relax();
2780- } while (task_has_cpu(child));
2781- }
2782- task_unlock(child);
2783+ wait_task_inactive(child);
2784 #endif
2785 }
2786
2787--- linux/kernel/sys.c.orig Tue Feb 5 13:51:53 2002
2788+++ linux/kernel/sys.c Tue Feb 5 13:52:12 2002
2789@@ -220,10 +220,10 @@
2790 }
2791 if (error == -ESRCH)
2792 error = 0;
2793- if (niceval < p->nice && !capable(CAP_SYS_NICE))
2794+ if (niceval < task_nice(p) && !capable(CAP_SYS_NICE))
2795 error = -EACCES;
2796 else
2797- p->nice = niceval;
2798+ set_user_nice(p, niceval);
2799 }
2800 read_unlock(&tasklist_lock);
2801
2802@@ -249,7 +249,7 @@
2803 long niceval;
2804 if (!proc_sel(p, which, who))
2805 continue;
2806- niceval = 20 - p->nice;
2807+ niceval = 20 - task_nice(p);
2808 if (niceval > retval)
2809 retval = niceval;
2810 }
2811--- linux/kernel/signal.c.orig Tue Feb 5 13:51:49 2002
2812+++ linux/kernel/signal.c Tue Feb 5 13:52:12 2002
2813@@ -478,12 +478,9 @@
2814 * process of changing - but no harm is done by that
2815 * other than doing an extra (lightweight) IPI interrupt.
2816 */
2817- spin_lock(&runqueue_lock);
2818- if (task_has_cpu(t) && t->processor != smp_processor_id())
2819- smp_send_reschedule(t->processor);
2820- spin_unlock(&runqueue_lock);
2821-#endif /* CONFIG_SMP */
2822-
2823+ if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
2824+ kick_if_running(t);
2825+#endif
2826 if (t->state & TASK_INTERRUPTIBLE) {
2827 wake_up_process(t);
2828 return;
2829--- linux/kernel/printk.c.orig Tue Feb 5 13:51:53 2002
2830+++ linux/kernel/printk.c Tue Feb 5 13:52:12 2002
2831@@ -26,6 +26,7 @@
2832 #include <linux/module.h>
2833 #include <linux/interrupt.h> /* For in_interrupt() */
2834 #include <linux/config.h>
2835+#include <linux/delay.h>
2836
2837 #include <asm/uaccess.h>
2838
2839--- linux/kernel/ksyms.c.orig Tue Feb 5 13:51:53 2002
2840+++ linux/kernel/ksyms.c Tue Feb 5 13:52:12 2002
2841@@ -437,6 +437,9 @@
2842 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2843 EXPORT_SYMBOL(schedule);
2844 EXPORT_SYMBOL(schedule_timeout);
2845+EXPORT_SYMBOL(sys_sched_yield);
2846+EXPORT_SYMBOL(set_user_nice);
2847+EXPORT_SYMBOL(set_cpus_allowed);
2848 EXPORT_SYMBOL(jiffies);
2849 EXPORT_SYMBOL(xtime);
2850 EXPORT_SYMBOL(do_gettimeofday);
2851@@ -448,6 +451,7 @@
2852
2853 EXPORT_SYMBOL(kstat);
2854 EXPORT_SYMBOL(nr_running);
2855+EXPORT_SYMBOL(nr_context_switches);
2856
2857 /* misc */
2858 EXPORT_SYMBOL(panic);
2859--- linux/mm/oom_kill.c.orig Tue Feb 5 13:51:47 2002
2860+++ linux/mm/oom_kill.c Tue Feb 5 13:52:12 2002
2861@@ -82,7 +82,7 @@
2862 * Niced processes are most likely less important, so double
2863 * their badness points.
2864 */
2865- if (p->nice > 0)
2866+ if (task_nice(p) > 0)
2867 points *= 2;
2868
2869 /*
2870@@ -149,7 +149,7 @@
2871 * all the memory it needs. That way it should be able to
2872 * exit() and clear out its resources quickly...
2873 */
2874- p->counter = 5 * HZ;
2875+ p->time_slice = HZ;
2876 p->flags |= PF_MEMALLOC | PF_MEMDIE;
2877
2878 /* This process has hardware access, be more careful. */
2879@@ -188,8 +188,7 @@
2880 * killing itself before someone else gets the chance to ask
2881 * for more memory.
2882 */
2883- current->policy |= SCHED_YIELD;
2884- schedule();
2885+ yield();
2886 return;
2887 }
2888
2889--- linux/mm/page_alloc.c.orig Tue Feb 5 13:51:53 2002
2890+++ linux/mm/page_alloc.c Tue Feb 5 13:52:12 2002
2891@@ -400,9 +400,8 @@
2892 return NULL;
2893
2894 /* Yield for kswapd, and try again */
2895- current->policy |= SCHED_YIELD;
2896 __set_current_state(TASK_RUNNING);
2897- schedule();
2898+ yield();
2899 goto rebalance;
2900 }
2901
2902--- linux/mm/highmem.c.orig Tue Feb 5 13:51:51 2002
2903+++ linux/mm/highmem.c Tue Feb 5 13:52:12 2002
2904@@ -354,9 +354,8 @@
2905 /* we need to wait I/O completion */
2906 run_task_queue(&tq_disk);
2907
2908- current->policy |= SCHED_YIELD;
2909 __set_current_state(TASK_RUNNING);
2910- schedule();
2911+ yield();
2912 goto repeat_alloc;
2913 }
2914
2915@@ -392,9 +391,8 @@
2916 /* we need to wait I/O completion */
2917 run_task_queue(&tq_disk);
2918
2919- current->policy |= SCHED_YIELD;
2920 __set_current_state(TASK_RUNNING);
2921- schedule();
2922+ yield();
2923 goto repeat_alloc;
2924 }
2925
2926--- linux/include/linux/sched.h.orig Tue Feb 5 13:51:51 2002
2927+++ linux/include/linux/sched.h Tue Feb 5 13:52:12 2002
2928@@ -6,6 +6,7 @@
2929 extern unsigned long event;
2930
2931 #include <linux/config.h>
2932+#include <linux/compiler.h>
2933 #include <linux/binfmts.h>
2934 #include <linux/threads.h>
2935 #include <linux/kernel.h>
2936@@ -42,6 +43,7 @@
2937 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
2938 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
2939 #define CLONE_THREAD 0x00010000 /* Same thread group? */
2940+#define CLONE_NEWNS 0x00020000 /* New namespace group? */
2941
2942 #define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD)
2943
2944@@ -72,8 +74,9 @@
2945 #define CT_TO_SECS(x) ((x) / HZ)
2946 #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
2947
2948-extern int nr_running, nr_threads;
2949+extern int nr_threads;
2950 extern int last_pid;
2951+extern unsigned long nr_running(void);
2952
2953 #include <linux/fs.h>
2954 #include <linux/time.h>
2955@@ -116,12 +119,6 @@
2956 #define SCHED_FIFO 1
2957 #define SCHED_RR 2
2958
2959-/*
2960- * This is an additional bit set when we want to
2961- * yield the CPU for one re-schedule..
2962- */
2963-#define SCHED_YIELD 0x10
2964-
2965 struct sched_param {
2966 int sched_priority;
2967 };
2968@@ -139,17 +136,22 @@
2969 * a separate lock).
2970 */
2971 extern rwlock_t tasklist_lock;
2972-extern spinlock_t runqueue_lock;
2973 extern spinlock_t mmlist_lock;
2974
2975+typedef struct task_struct task_t;
2976+
2977 extern void sched_init(void);
2978-extern void init_idle(void);
2979+extern void init_idle(task_t *idle, int cpu);
2980 extern void show_state(void);
2981 extern void cpu_init (void);
2982 extern void trap_init(void);
2983 extern void update_process_times(int user);
2984-extern void update_one_process(struct task_struct *p, unsigned long user,
2985+extern void update_one_process(task_t *p, unsigned long user,
2986 unsigned long system, int cpu);
2987+extern void scheduler_tick(int user_tick, int system);
2988+extern void sched_task_migrated(task_t *p);
2989+extern void smp_migrate_task(int cpu, task_t *task);
2990+extern unsigned long cache_decay_ticks;
2991
2992 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
2993 extern signed long FASTCALL(schedule_timeout(signed long timeout));
2994@@ -166,6 +168,7 @@
2995 */
2996 #define NR_OPEN_DEFAULT BITS_PER_LONG
2997
2998+struct namespace;
2999 /*
3000 * Open file table structure
3001 */
3002@@ -278,6 +281,8 @@
3003 extern struct user_struct root_user;
3004 #define INIT_USER (&root_user)
3005
3006+typedef struct prio_array prio_array_t;
3007+
3008 struct task_struct {
3009 /*
3010 * offsets of these are hardcoded elsewhere - touch with care
3011@@ -295,35 +300,26 @@
3012
3013 int lock_depth; /* Lock depth */
3014
3015-/*
3016- * offset 32 begins here on 32-bit platforms. We keep
3017- * all fields in a single cacheline that are needed for
3018- * the goodness() loop in schedule().
3019- */
3020- long counter;
3021- long nice;
3022- unsigned long policy;
3023- struct mm_struct *mm;
3024- int processor;
3025 /*
3026- * cpus_runnable is ~0 if the process is not running on any
3027- * CPU. It's (1 << cpu) if it's running on a CPU. This mask
3028- * is updated under the runqueue lock.
3029- *
3030- * To determine whether a process might run on a CPU, this
3031- * mask is AND-ed with cpus_allowed.
3032+ * offset 32 begins here on 32-bit platforms.
3033 */
3034- unsigned long cpus_runnable, cpus_allowed;
3035- /*
3036- * (only the 'next' pointer fits into the cacheline, but
3037- * that's just fine.)
3038- */
3039- struct list_head run_list;
3040- unsigned long sleep_time;
3041+ unsigned int cpu;
3042+ int prio, static_prio;
3043+ list_t run_list;
3044+ prio_array_t *array;
3045+
3046+ unsigned long sleep_avg;
3047+ unsigned long sleep_timestamp;
3048+
3049+ unsigned long policy;
3050+ unsigned long cpus_allowed;
3051+ unsigned int time_slice;
3052+
3053+ task_t *next_task, *prev_task;
3054
3055- struct task_struct *next_task, *prev_task;
3056- struct mm_struct *active_mm;
3057+ struct mm_struct *mm, *active_mm;
3058 struct list_head local_pages;
3059+
3060 unsigned int allocation_order, nr_local_pages;
3061
3062 /* task state */
3063@@ -345,12 +341,12 @@
3064 * older sibling, respectively. (p->father can be replaced with
3065 * p->p_pptr->pid)
3066 */
3067- struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
3068+ task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
3069 struct list_head thread_group;
3070
3071 /* PID hash table linkage. */
3072- struct task_struct *pidhash_next;
3073- struct task_struct **pidhash_pprev;
3074+ task_t *pidhash_next;
3075+ task_t **pidhash_pprev;
3076
3077 wait_queue_head_t wait_chldexit; /* for wait4() */
3078 struct completion *vfork_done; /* for vfork() */
3079@@ -389,6 +385,8 @@
3080 struct fs_struct *fs;
3081 /* open file information */
3082 struct files_struct *files;
3083+/* namespace */
3084+ struct namespace *namespace;
3085 /* signal handlers */
3086 spinlock_t sigmask_lock; /* Protects signal and blocked */
3087 struct signal_struct *sig;
3088@@ -446,10 +444,13 @@
3089 */
3090 #define _STK_LIM (8*1024*1024)
3091
3092-#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */
3093-#define MAX_COUNTER (20*HZ/100)
3094-#define DEF_NICE (0)
3095+extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
3096+extern void set_user_nice(task_t *p, long nice);
3097+extern int task_prio(task_t *p);
3098+extern int task_nice(task_t *p);
3099
3100+asmlinkage long sys_sched_yield(void);
3101+#define yield() sys_sched_yield()
3102
3103 /*
3104 * The default (Linux) execution domain.
3105@@ -468,14 +469,14 @@
3106 addr_limit: KERNEL_DS, \
3107 exec_domain: &default_exec_domain, \
3108 lock_depth: -1, \
3109- counter: DEF_COUNTER, \
3110- nice: DEF_NICE, \
3111+ prio: 120, \
3112+ static_prio: 120, \
3113 policy: SCHED_OTHER, \
3114+ cpus_allowed: -1, \
3115 mm: NULL, \
3116 active_mm: &init_mm, \
3117- cpus_runnable: -1, \
3118- cpus_allowed: -1, \
3119 run_list: LIST_HEAD_INIT(tsk.run_list), \
3120+ time_slice: HZ, \
3121 next_task: &tsk, \
3122 prev_task: &tsk, \
3123 p_opptr: &tsk, \
3124@@ -509,24 +510,24 @@
3125 #endif
3126
3127 union task_union {
3128- struct task_struct task;
3129+ task_t task;
3130 unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
3131 };
3132
3133 extern union task_union init_task_union;
3134
3135 extern struct mm_struct init_mm;
3136-extern struct task_struct *init_tasks[NR_CPUS];
3137+extern task_t *init_tasks[NR_CPUS];
3138
3139 /* PID hashing. (shouldnt this be dynamic?) */
3140 #define PIDHASH_SZ (4096 >> 2)
3141-extern struct task_struct *pidhash[PIDHASH_SZ];
3142+extern task_t *pidhash[PIDHASH_SZ];
3143
3144 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
3145
3146-static inline void hash_pid(struct task_struct *p)
3147+static inline void hash_pid(task_t *p)
3148 {
3149- struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
3150+ task_t **htable = &pidhash[pid_hashfn(p->pid)];
3151
3152 if((p->pidhash_next = *htable) != NULL)
3153 (*htable)->pidhash_pprev = &p->pidhash_next;
3154@@ -534,16 +535,16 @@
3155 p->pidhash_pprev = htable;
3156 }
3157
3158-static inline void unhash_pid(struct task_struct *p)
3159+static inline void unhash_pid(task_t *p)
3160 {
3161 if(p->pidhash_next)
3162 p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
3163 *p->pidhash_pprev = p->pidhash_next;
3164 }
3165
3166-static inline struct task_struct *find_task_by_pid(int pid)
3167+static inline task_t *find_task_by_pid(int pid)
3168 {
3169- struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
3170+ task_t *p, **htable = &pidhash[pid_hashfn(pid)];
3171
3172 for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
3173 ;
3174@@ -551,19 +552,6 @@
3175 return p;
3176 }
3177
3178-#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
3179-
3180-static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
3181-{
3182- tsk->processor = cpu;
3183- tsk->cpus_runnable = 1UL << cpu;
3184-}
3185-
3186-static inline void task_release_cpu(struct task_struct *tsk)
3187-{
3188- tsk->cpus_runnable = ~0UL;
3189-}
3190-
3191 /* per-UID process charging. */
3192 extern struct user_struct * alloc_uid(uid_t);
3193 extern void free_uid(struct user_struct *);
3194@@ -590,7 +578,9 @@
3195 extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
3196 extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
3197 signed long timeout));
3198-extern int FASTCALL(wake_up_process(struct task_struct * tsk));
3199+extern int FASTCALL(wake_up_process(task_t * tsk));
3200+extern void FASTCALL(wake_up_forked_process(task_t * tsk));
3201+extern void FASTCALL(sched_exit(task_t * p));
3202
3203 #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
3204 #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
3205@@ -608,28 +598,28 @@
3206 extern int in_egroup_p(gid_t);
3207
3208 extern void proc_caches_init(void);
3209-extern void flush_signals(struct task_struct *);
3210-extern void flush_signal_handlers(struct task_struct *);
3211+extern void flush_signals(task_t *);
3212+extern void flush_signal_handlers(task_t *);
3213 extern int dequeue_signal(sigset_t *, siginfo_t *);
3214 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
3215 sigset_t *mask);
3216 extern void unblock_all_signals(void);
3217-extern int send_sig_info(int, struct siginfo *, struct task_struct *);
3218-extern int force_sig_info(int, struct siginfo *, struct task_struct *);
3219+extern int send_sig_info(int, struct siginfo *, task_t *);
3220+extern int force_sig_info(int, struct siginfo *, task_t *);
3221 extern int kill_pg_info(int, struct siginfo *, pid_t);
3222 extern int kill_sl_info(int, struct siginfo *, pid_t);
3223 extern int kill_proc_info(int, struct siginfo *, pid_t);
3224-extern void notify_parent(struct task_struct *, int);
3225-extern void do_notify_parent(struct task_struct *, int);
3226-extern void force_sig(int, struct task_struct *);
3227-extern int send_sig(int, struct task_struct *, int);
3228+extern void notify_parent(task_t *, int);
3229+extern void do_notify_parent(task_t *, int);
3230+extern void force_sig(int, task_t *);
3231+extern int send_sig(int, task_t *, int);
3232 extern int kill_pg(pid_t, int, int);
3233 extern int kill_sl(pid_t, int, int);
3234 extern int kill_proc(pid_t, int, int);
3235 extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
3236 extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
3237
3238-static inline int signal_pending(struct task_struct *p)
3239+static inline int signal_pending(task_t *p)
3240 {
3241 return (p->sigpending != 0);
3242 }
3243@@ -668,7 +658,7 @@
3244 This is required every time the blocked sigset_t changes.
3245 All callers should have t->sigmask_lock. */
3246
3247-static inline void recalc_sigpending(struct task_struct *t)
3248+static inline void recalc_sigpending(task_t *t)
3249 {
3250 t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
3251 }
3252@@ -775,16 +765,17 @@
3253 extern int expand_fdset(struct files_struct *, int nr);
3254 extern void free_fdset(fd_set *, int);
3255
3256-extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
3257+extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *);
3258 extern void flush_thread(void);
3259 extern void exit_thread(void);
3260
3261-extern void exit_mm(struct task_struct *);
3262-extern void exit_files(struct task_struct *);
3263-extern void exit_sighand(struct task_struct *);
3264+extern void exit_mm(task_t *);
3265+extern void exit_files(task_t *);
3266+extern void exit_sighand(task_t *);
3267
3268 extern void reparent_to_init(void);
3269 extern void daemonize(void);
3270+extern task_t *child_reaper;
3271
3272 extern int do_execve(char *, char **, char **, struct pt_regs *);
3273 extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
3274@@ -793,6 +784,9 @@
3275 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
3276 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
3277
3278+extern void wait_task_inactive(task_t * p);
3279+extern void kick_if_running(task_t * p);
3280+
3281 #define __wait_event(wq, condition) \
3282 do { \
3283 wait_queue_t __wait; \
3284@@ -871,24 +865,10 @@
3285 for (p = &init_task ; (p = p->next_task) != &init_task ; )
3286
3287 #define next_thread(p) \
3288- list_entry((p)->thread_group.next, struct task_struct, thread_group)
3289-
3290-static inline void del_from_runqueue(struct task_struct * p)
3291-{
3292- nr_running--;
3293- p->sleep_time = jiffies;
3294- list_del(&p->run_list);
3295- p->run_list.next = NULL;
3296-}
3297-
3298-static inline int task_on_runqueue(struct task_struct *p)
3299-{
3300- return (p->run_list.next != NULL);
3301-}
3302+ list_entry((p)->thread_group.next, task_t, thread_group)
3303
3304-static inline void unhash_process(struct task_struct *p)
3305+static inline void unhash_process(task_t *p)
3306 {
3307- if (task_on_runqueue(p)) BUG();
3308 write_lock_irq(&tasklist_lock);
3309 nr_threads--;
3310 unhash_pid(p);
3311@@ -898,12 +878,12 @@
3312 }
3313
3314 /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
3315-static inline void task_lock(struct task_struct *p)
3316+static inline void task_lock(task_t *p)
3317 {
3318 spin_lock(&p->alloc_lock);
3319 }
3320
3321-static inline void task_unlock(struct task_struct *p)
3322+static inline void task_unlock(task_t *p)
3323 {
3324 spin_unlock(&p->alloc_lock);
3325 }
3326--- linux/include/linux/list.h.orig Tue Feb 5 13:51:51 2002
3327+++ linux/include/linux/list.h Tue Feb 5 13:52:12 2002
3328@@ -19,6 +19,8 @@
3329 struct list_head *next, *prev;
3330 };
3331
3332+typedef struct list_head list_t;
3333+
3334 #define LIST_HEAD_INIT(name) { &(name), &(name) }
3335
3336 #define LIST_HEAD(name) \
3337--- linux/include/linux/kernel_stat.h.orig Tue Aug 21 14:26:23 2001
3338+++ linux/include/linux/kernel_stat.h Tue Feb 5 13:52:12 2002
3339@@ -32,10 +32,11 @@
3340 unsigned int ipackets, opackets;
3341 unsigned int ierrors, oerrors;
3342 unsigned int collisions;
3343- unsigned int context_swtch;
3344 };
3345
3346 extern struct kernel_stat kstat;
3347+
3348+extern unsigned long nr_context_switches(void);
3349
3350 #if !defined(CONFIG_ARCH_S390)
3351 /*
3352--- linux/include/linux/smp.h.orig Sun Dec 31 20:10:17 2000
3353+++ linux/include/linux/smp.h Tue Feb 5 13:52:12 2002
3354@@ -86,6 +86,14 @@
3355 #define cpu_number_map(cpu) 0
3356 #define smp_call_function(func,info,retry,wait) ({ 0; })
3357 #define cpu_online_map 1
3358+static inline void smp_send_reschedule(int cpu) { }
3359+static inline void smp_send_reschedule_all(void) { }
3360
3361 #endif
3362+
3363+/*
3364+ * Common definitions:
3365+ */
3366+#define cpu() smp_processor_id()
3367+
3368 #endif
3369--- linux/include/asm-i386/smp.h.orig Tue Feb 5 13:51:51 2002
3370+++ linux/include/asm-i386/smp.h Tue Feb 5 13:52:12 2002
3371@@ -63,6 +63,7 @@
3372 extern void smp_flush_tlb(void);
3373 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
3374 extern void smp_send_reschedule(int cpu);
3375+extern void smp_send_reschedule_all(void);
3376 extern void smp_invalidate_rcv(void); /* Process an NMI */
3377 extern void (*mtrr_hook) (void);
3378 extern void zap_low_mappings (void);
3379@@ -104,7 +105,7 @@
3380 * so this is correct in the x86 case.
3381 */
3382
3383-#define smp_processor_id() (current->processor)
3384+#define smp_processor_id() (current->cpu)
3385
3386 static __inline int hard_smp_processor_id(void)
3387 {
3388@@ -121,18 +122,6 @@
3389 #endif /* !__ASSEMBLY__ */
3390
3391 #define NO_PROC_ID 0xFF /* No processor magic marker */
3392-
3393-/*
3394- * This magic constant controls our willingness to transfer
3395- * a process across CPUs. Such a transfer incurs misses on the L1
3396- * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
3397- * gut feeling is this will vary by board in value. For a board
3398- * with separate L2 cache it probably depends also on the RSS, and
3399- * for a board with shared L2 cache it ought to decay fast as other
3400- * processes are run.
3401- */
3402-
3403-#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
3404
3405 #endif
3406 #endif
3407--- linux/include/asm-i386/bitops.h.orig Tue Aug 21 14:26:16 2001
3408+++ linux/include/asm-i386/bitops.h Tue Feb 5 13:52:12 2002
3409@@ -75,6 +75,14 @@
3410 :"=m" (ADDR)
3411 :"Ir" (nr));
3412 }
3413+
3414+static __inline__ void __clear_bit(int nr, volatile void * addr)
3415+{
3416+ __asm__ __volatile__(
3417+ "btrl %1,%0"
3418+ :"=m" (ADDR)
3419+ :"Ir" (nr));
3420+}
3421 #define smp_mb__before_clear_bit() barrier()
3422 #define smp_mb__after_clear_bit() barrier()
3423
3424@@ -284,6 +292,34 @@
3425 }
3426
3427 /**
3428+ * find_first_bit - find the first set bit in a memory region
3429+ * @addr: The address to start the search at
3430+ * @size: The maximum size to search
3431+ *
3432+ * Returns the bit-number of the first set bit, not the number of the byte
3433+ * containing a bit.
3434+ */
3435+static __inline__ int find_first_bit(void * addr, unsigned size)
3436+{
3437+ int d0, d1;
3438+ int res;
3439+
3440+ /* This looks at memory. Mark it volatile to tell gcc not to move it around */
3441+ __asm__ __volatile__(
3442+ "xorl %%eax,%%eax\n\t"
3443+ "repe; scasl\n\t"
3444+ "jz 1f\n\t"
3445+ "leal -4(%%edi),%%edi\n\t"
3446+ "bsfl (%%edi),%%eax\n"
3447+ "1:\tsubl %%ebx,%%edi\n\t"
3448+ "shll $3,%%edi\n\t"
3449+ "addl %%edi,%%eax"
3450+ :"=a" (res), "=&c" (d0), "=&D" (d1)
3451+ :"1" ((size + 31) >> 5), "2" (addr), "b" (addr));
3452+ return res;
3453+}
3454+
3455+/**
3456 * find_next_zero_bit - find the first zero bit in a memory region
3457 * @addr: The address to base the search on
3458 * @offset: The bitnumber to start searching at
3459@@ -296,7 +332,7 @@
3460
3461 if (bit) {
3462 /*
3463- * Look for zero in first byte
3464+ * Look for zero in the first 32 bits.
3465 */
3466 __asm__("bsfl %1,%0\n\t"
3467 "jne 1f\n\t"
3468@@ -317,6 +353,39 @@
3469 }
3470
3471 /**
3472+ * find_next_bit - find the first set bit in a memory region
3473+ * @addr: The address to base the search on
3474+ * @offset: The bitnumber to start searching at
3475+ * @size: The maximum size to search
3476+ */
3477+static __inline__ int find_next_bit (void * addr, int size, int offset)
3478+{
3479+ unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
3480+ int set = 0, bit = offset & 31, res;
3481+
3482+ if (bit) {
3483+ /*
3484+ * Look for nonzero in the first 32 bits:
3485+ */
3486+ __asm__("bsfl %1,%0\n\t"
3487+ "jne 1f\n\t"
3488+ "movl $32, %0\n"
3489+ "1:"
3490+ : "=r" (set)
3491+ : "r" (*p >> bit));
3492+ if (set < (32 - bit))
3493+ return set + offset;
3494+ set = 32 - bit;
3495+ p++;
3496+ }
3497+ /*
3498+ * No set bit yet, search remaining full words for a bit
3499+ */
3500+ res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
3501+ return (offset + set + res);
3502+}
3503+
3504+/**
3505 * ffz - find first zero in word.
3506 * @word: The word to search
3507 *
3508@@ -327,6 +396,20 @@
3509 __asm__("bsfl %1,%0"
3510 :"=r" (word)
3511 :"r" (~word));
3512+ return word;
3513+}
3514+
3515+/**
3516+ * __ffs - find first bit in word.
3517+ * @word: The word to search
3518+ *
3519+ * Undefined if no bit exists, so code should check against 0 first.
3520+ */
3521+static __inline__ unsigned long __ffs(unsigned long word)
3522+{
3523+ __asm__("bsfl %1,%0"
3524+ :"=r" (word)
3525+ :"rm" (word));
3526 return word;
3527 }
3528
3529--- linux/include/asm-i386/pgalloc.h.orig Tue Feb 5 13:51:51 2002
3530+++ linux/include/asm-i386/pgalloc.h Tue Feb 5 13:52:12 2002
3531@@ -224,6 +224,7 @@
3532 {
3533 struct mm_struct *active_mm;
3534 int state;
3535+ char __cacheline_padding[24];
3536 };
3537 extern struct tlb_state cpu_tlbstate[NR_CPUS];
3538
3539--- linux/include/asm-i386/mmu_context.h.orig Tue Aug 21 14:26:23 2001
3540+++ linux/include/asm-i386/mmu_context.h Tue Feb 5 13:52:12 2002
3541@@ -7,6 +7,25 @@
3542 #include <asm/pgalloc.h>
3543
3544 /*
3545+ * Every architecture must define this function. It's the fastest
3546+ * way of searching a 140-bit bitmap where the first 100 bits are
3547+ * unlikely to be set. It's guaranteed that at least one of the 140
3548+ * bits is cleared.
3549+ */
3550+static inline int sched_find_first_bit(unsigned long *b)
3551+{
3552+ if (unlikely(b[0]))
3553+ return __ffs(b[0]);
3554+ if (unlikely(b[1]))
3555+ return __ffs(b[1]) + 32;
3556+ if (unlikely(b[2]))
3557+ return __ffs(b[2]) + 64;
3558+ if (b[3])
3559+ return __ffs(b[3]) + 96;
3560+ return __ffs(b[4]) + 128;
3561+}
3562+
3563+/*
3564 * possibly do the LDT unload here?
3565 */
3566 #define destroy_context(mm) do { } while(0)
3567@@ -27,13 +46,13 @@
3568
3569 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
3570 {
3571- if (prev != next) {
3572+ if (likely(prev != next)) {
3573 /* stop flush ipis for the previous mm */
3574 clear_bit(cpu, &prev->cpu_vm_mask);
3575 /*
3576 * Re-load LDT if necessary
3577 */
3578- if (prev->context.segments != next->context.segments)
3579+ if (unlikely(prev->context.segments != next->context.segments))
3580 load_LDT(next);
3581 #ifdef CONFIG_SMP
3582 cpu_tlbstate[cpu].state = TLBSTATE_OK;
3583--- linux/include/asm-i386/hw_irq.h.orig Tue Feb 5 13:51:40 2002
3584+++ linux/include/asm-i386/hw_irq.h Tue Feb 5 13:52:12 2002
3585@@ -41,7 +41,8 @@
3586 #define ERROR_APIC_VECTOR 0xfe
3587 #define INVALIDATE_TLB_VECTOR 0xfd
3588 #define RESCHEDULE_VECTOR 0xfc
3589-#define CALL_FUNCTION_VECTOR 0xfb
3590+#define TASK_MIGRATION_VECTOR 0xfb
3591+#define CALL_FUNCTION_VECTOR 0xfa
3592
3593 /*
3594 * Local APIC timer IRQ vector is on a different priority level,
3595--- linux/include/asm-i386/apic.h.orig Tue Feb 5 13:51:43 2002
3596+++ linux/include/asm-i386/apic.h Tue Feb 5 13:52:12 2002
3597@@ -79,6 +79,8 @@
3598 extern void setup_apic_nmi_watchdog (void);
3599 extern inline void nmi_watchdog_tick (struct pt_regs * regs);
3600 extern int APIC_init_uniprocessor (void);
3601+extern void disable_APIC_timer(void);
3602+extern void enable_APIC_timer(void);
3603
3604 extern struct pm_dev *apic_pm_register(pm_dev_t, unsigned long, pm_callback);
3605 extern void apic_pm_unregister(struct pm_dev*);
3606--- linux/net/unix/af_unix.c.orig Tue Feb 5 13:51:53 2002
3607+++ linux/net/unix/af_unix.c Tue Feb 5 13:52:12 2002
3608@@ -565,10 +565,8 @@
3609 addr->hash)) {
3610 write_unlock(&unix_table_lock);
3611 /* Sanity yield. It is unusual case, but yet... */
3612- if (!(ordernum&0xFF)) {
3613- current->policy |= SCHED_YIELD;
3614- schedule();
3615- }
3616+ if (!(ordernum&0xFF))
3617+ yield();
3618 goto retry;
3619 }
3620 addr->hash ^= sk->type;
3621--- linux/net/ipv4/tcp_output.c.orig Tue Feb 5 13:51:51 2002
3622+++ linux/net/ipv4/tcp_output.c Tue Feb 5 13:52:12 2002
3623@@ -1009,8 +1009,7 @@
3624 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
3625 if (skb)
3626 break;
3627- current->policy |= SCHED_YIELD;
3628- schedule();
3629+ yield();
3630 }
3631
3632 /* Reserve space for headers and prepare control bits. */
3633--- linux/net/sunrpc/sched.c.orig Tue Feb 5 13:51:53 2002
3634+++ linux/net/sunrpc/sched.c Tue Feb 5 13:52:12 2002
3635@@ -773,8 +773,7 @@
3636 }
3637 if (flags & RPC_TASK_ASYNC)
3638 return NULL;
3639- current->policy |= SCHED_YIELD;
3640- schedule();
3641+ yield();
3642 } while (!signalled());
3643
3644 return NULL;
3645@@ -1115,8 +1114,7 @@
3646 __rpc_schedule();
3647 if (all_tasks) {
3648 dprintk("rpciod_killall: waiting for tasks to exit\n");
3649- current->policy |= SCHED_YIELD;
3650- schedule();
3651+ yield();
3652 }
3653 }
3654
3655@@ -1186,8 +1184,7 @@
3656 * wait briefly before checking the process id.
3657 */
3658 current->sigpending = 0;
3659- current->policy |= SCHED_YIELD;
3660- schedule();
3661+ yield();
3662 /*
3663 * Display a message if we're going to wait longer.
3664 */
3665--- linux/net/sched/sch_generic.c.orig Fri Aug 18 19:26:25 2000
3666+++ linux/net/sched/sch_generic.c Tue Feb 5 13:52:12 2002
3667@@ -475,10 +475,8 @@
3668
3669 dev_watchdog_down(dev);
3670
3671- while (test_bit(__LINK_STATE_SCHED, &dev->state)) {
3672- current->policy |= SCHED_YIELD;
3673- schedule();
3674- }
3675+ while (test_bit(__LINK_STATE_SCHED, &dev->state))
3676+ yield();
3677
3678 spin_unlock_wait(&dev->xmit_lock);
3679 }
3680--- linux/net/socket.c.orig Tue Feb 5 13:51:51 2002
3681+++ linux/net/socket.c Tue Feb 5 13:52:12 2002
3682@@ -148,8 +148,7 @@
3683 while (atomic_read(&net_family_lockct) != 0) {
3684 spin_unlock(&net_family_lock);
3685
3686- current->policy |= SCHED_YIELD;
3687- schedule();
3688+ yield();
3689
3690 spin_lock(&net_family_lock);
3691 }
3692--- linux/drivers/net/slip.c.orig Tue Feb 5 13:51:52 2002
3693+++ linux/drivers/net/slip.c Tue Feb 5 13:52:12 2002
3694@@ -1393,10 +1393,8 @@
3695 /* First of all: check for active disciplines and hangup them.
3696 */
3697 do {
3698- if (busy) {
3699- current->counter = 0;
3700- schedule();
3701- }
3702+ if (busy)
3703+ sys_sched_yield();
3704
3705 busy = 0;
3706 local_bh_disable();
3707--- linux/drivers/block/loop.c.orig Tue Feb 5 13:51:50 2002
3708+++ linux/drivers/block/loop.c Tue Feb 5 13:52:12 2002
3709@@ -570,9 +570,6 @@
3710 flush_signals(current);
3711 spin_unlock_irq(&current->sigmask_lock);
3712
3713- current->policy = SCHED_OTHER;
3714- current->nice = -20;
3715-
3716 spin_lock_irq(&lo->lo_lock);
3717 lo->lo_state = Lo_bound;
3718 atomic_inc(&lo->lo_pending);
3719--- linux/drivers/char/mwave/mwavedd.c.orig Tue Feb 5 13:51:44 2002
3720+++ linux/drivers/char/mwave/mwavedd.c Tue Feb 5 13:52:12 2002
3721@@ -279,7 +279,6 @@
3722 pDrvData->IPCs[ipcnum].bIsHere = FALSE;
3723 pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
3724 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
3725- current->nice = -20; /* boost to provide priority timing */
3726 #else
3727 current->priority = 0x28; /* boost to provide priority timing */
3728 #endif
3729--- linux/drivers/char/drm-4.0/ffb_drv.c.orig Tue Feb 5 13:51:51 2002
3730+++ linux/drivers/char/drm-4.0/ffb_drv.c Tue Feb 5 13:52:12 2002
3731@@ -710,8 +710,7 @@
3732 /* Contention */
3733 atomic_inc(&dev->total_sleeps);
3734 current->state = TASK_INTERRUPTIBLE;
3735- current->policy |= SCHED_YIELD;
3736- schedule();
3737+ yield();
3738 if (signal_pending(current)) {
3739 ret = -ERESTARTSYS;
3740 break;
3741--- linux/drivers/char/drm-4.0/tdfx_drv.c.orig Tue Feb 5 13:51:52 2002
3742+++ linux/drivers/char/drm-4.0/tdfx_drv.c Tue Feb 5 13:52:12 2002
3743@@ -554,7 +554,6 @@
3744 lock.context, current->pid, j,
3745 dev->lock.lock_time, jiffies);
3746 current->state = TASK_INTERRUPTIBLE;
3747- current->policy |= SCHED_YIELD;
3748 schedule_timeout(DRM_LOCK_SLICE-j);
3749 DRM_DEBUG("jiffies=%d\n", jiffies);
3750 }
3751@@ -578,10 +577,7 @@
3752
3753 /* Contention */
3754 atomic_inc(&dev->total_sleeps);
3755-#if 1
3756- current->policy |= SCHED_YIELD;
3757-#endif
3758- schedule();
3759+ yield();
3760 if (signal_pending(current)) {
3761 ret = -ERESTARTSYS;
3762 break;
3763@@ -604,8 +600,7 @@
3764 when dev->last_context == lock.context
3765 NOTE WE HOLD THE LOCK THROUGHOUT THIS
3766 TIME! */
3767- current->policy |= SCHED_YIELD;
3768- schedule();
3769+ yield();
3770 current->state = TASK_RUNNING;
3771 remove_wait_queue(&dev->context_wait, &entry);
3772 if (signal_pending(current)) {
3773--- linux/drivers/ide/ataraid.c.orig Tue Feb 5 13:51:46 2002
3774+++ linux/drivers/ide/ataraid.c Tue Feb 5 13:52:12 2002
3775@@ -123,8 +123,7 @@
3776 ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO);
3777 if (!ptr) {
3778 __set_current_state(TASK_RUNNING);
3779- current->policy |= SCHED_YIELD;
3780- schedule();
3781+ yield();
3782 }
3783 }
3784 return ptr;
3785@@ -139,8 +138,7 @@
3786 ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO);
3787 if (!ptr) {
3788 __set_current_state(TASK_RUNNING);
3789- current->policy |= SCHED_YIELD;
3790- schedule();
3791+ yield();
3792 }
3793 }
3794 return ptr;
3795--- linux/drivers/md/md.c.orig Tue Feb 5 13:51:52 2002
3796+++ linux/drivers/md/md.c Tue Feb 5 13:52:12 2002
3797@@ -2936,8 +2936,6 @@
3798 * bdflush, otherwise bdflush will deadlock if there are too
3799 * many dirty RAID5 blocks.
3800 */
3801- current->policy = SCHED_OTHER;
3802- current->nice = -20;
3803 md_unlock_kernel();
3804
3805 complete(thread->event);
3806@@ -3387,11 +3385,6 @@
3807 "(but not more than %d KB/sec) for reconstruction.\n",
3808 sysctl_speed_limit_max);
3809
3810- /*
3811- * Resync has low priority.
3812- */
3813- current->nice = 19;
3814-
3815 is_mddev_idle(mddev); /* this also initializes IO event counters */
3816 for (m = 0; m < SYNC_MARKS; m++) {
3817 mark[m] = jiffies;
3818@@ -3469,16 +3462,13 @@
3819 currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
3820
3821 if (currspeed > sysctl_speed_limit_min) {
3822- current->nice = 19;
3823-
3824 if ((currspeed > sysctl_speed_limit_max) ||
3825 !is_mddev_idle(mddev)) {
3826 current->state = TASK_INTERRUPTIBLE;
3827 md_schedule_timeout(HZ/4);
3828 goto repeat;
3829 }
3830- } else
3831- current->nice = -20;
3832+ }
3833 }
3834 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3835 err = 0;
3836--- linux/arch/i386/mm/fault.c.orig Tue Feb 5 13:51:51 2002
3837+++ linux/arch/i386/mm/fault.c Tue Feb 5 13:52:12 2002
3838@@ -86,8 +86,7 @@
3839
3840 out_of_memory:
3841 if (current->pid == 1) {
3842- current->policy |= SCHED_YIELD;
3843- schedule();
3844+ yield();
3845 goto survive;
3846 }
3847 goto bad_area;
3848@@ -342,8 +341,7 @@
3849 out_of_memory:
3850 up_read(&mm->mmap_sem);
3851 if (tsk->pid == 1) {
3852- tsk->policy |= SCHED_YIELD;
3853- schedule();
3854+ yield();
3855 down_read(&mm->mmap_sem);
3856 goto survive;
3857 }
3858--- linux/arch/i386/kernel/smpboot.c.orig Tue Feb 5 13:51:49 2002
3859+++ linux/arch/i386/kernel/smpboot.c Tue Feb 5 13:52:12 2002
3860@@ -308,14 +308,14 @@
3861 if (tsc_values[i] < avg)
3862 realdelta = -realdelta;
3863
3864- printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
3865- i, realdelta);
3866+ printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
3867 }
3868
3869 sum += delta;
3870 }
3871 if (!buggy)
3872 printk("passed.\n");
3873+ ;
3874 }
3875
3876 static void __init synchronize_tsc_ap (void)
3877@@ -365,7 +365,7 @@
3878 * (This works even if the APIC is not enabled.)
3879 */
3880 phys_id = GET_APIC_ID(apic_read(APIC_ID));
3881- cpuid = current->processor;
3882+ cpuid = cpu();
3883 if (test_and_set_bit(cpuid, &cpu_online_map)) {
3884 printk("huh, phys CPU#%d, CPU#%d already present??\n",
3885 phys_id, cpuid);
3886@@ -435,6 +435,7 @@
3887 */
3888 smp_store_cpu_info(cpuid);
3889
3890+ disable_APIC_timer();
3891 /*
3892 * Allow the master to continue.
3893 */
3894@@ -465,6 +466,7 @@
3895 smp_callin();
3896 while (!atomic_read(&smp_commenced))
3897 rep_nop();
3898+ enable_APIC_timer();
3899 /*
3900 * low-memory mappings have been cleared, flush them from
3901 * the local TLBs too.
3902@@ -803,16 +805,13 @@
3903 if (!idle)
3904 panic("No idle process for CPU %d", cpu);
3905
3906- idle->processor = cpu;
3907- idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
3908+ init_idle(idle, cpu);
3909
3910 map_cpu_to_boot_apicid(cpu, apicid);
3911
3912 idle->thread.eip = (unsigned long) start_secondary;
3913
3914- del_from_runqueue(idle);
3915 unhash_process(idle);
3916- init_tasks[cpu] = idle;
3917
3918 /* start_eip had better be page-aligned! */
3919 start_eip = setup_trampoline();
3920@@ -925,6 +924,7 @@
3921 }
3922
3923 cycles_t cacheflush_time;
3924+unsigned long cache_decay_ticks;
3925
3926 static void smp_tune_scheduling (void)
3927 {
3928@@ -958,9 +958,13 @@
3929 cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
3930 }
3931
3932+ cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
3933+
3934 printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
3935 (long)cacheflush_time/(cpu_khz/1000),
3936 ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
3937+ printk("task migration cache decay timeout: %ld msecs.\n",
3938+ (cache_decay_ticks + 1) * 1000 / HZ);
3939 }
3940
3941 /*
3942@@ -1020,8 +1024,7 @@
3943 map_cpu_to_boot_apicid(0, boot_cpu_apicid);
3944
3945 global_irq_holder = 0;
3946- current->processor = 0;
3947- init_idle();
3948+ current->cpu = 0;
3949 smp_tune_scheduling();
3950
3951 /*
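[Annotation, not part of the patch] Unit check for the cache_decay_ticks computation in the smp_tune_scheduling() hunk above: cacheflush_time is measured in CPU cycles and cpu_khz is cycles per millisecond, so cacheflush_time / cpu_khz yields milliseconds, and the trailing * HZ / 1000 converts that into scheduler ticks. A small stand-alone sketch with assumed example values:

/* Illustrative sketch only; the numbers are made up for the example. */
#include <stdio.h>

#define HZ 100			/* assumed i386 tick rate of the 2.4 era */

int main(void)
{
	long cacheflush_time = 2500000;	/* ~2.5M cycles to refill the cache */
	long cpu_khz = 500000;		/* 500 MHz => 500000 cycles per ms */

	long cache_decay_ticks = cacheflush_time / cpu_khz * HZ / 1000;

	printf("cache decay: %ld ms => %ld ticks, reported as %ld msecs\n",
	       cacheflush_time / cpu_khz, cache_decay_ticks,
	       (cache_decay_ticks + 1) * 1000 / HZ);
	return 0;
}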
3952--- linux/arch/i386/kernel/process.c.orig Tue Feb 5 13:51:51 2002
3953+++ linux/arch/i386/kernel/process.c Tue Feb 5 13:52:12 2002
3954@@ -123,15 +123,12 @@
3955 void cpu_idle (void)
3956 {
3957 /* endless idle loop with no priority at all */
3958- init_idle();
3959- current->nice = 20;
3960- current->counter = -100;
3961
3962 while (1) {
3963 void (*idle)(void) = pm_idle;
3964 if (!idle)
3965 idle = default_idle;
3966- while (!current->need_resched)
3967+ if (!current->need_resched)
3968 idle();
3969 schedule();
3970 check_pgt_cache();
3971@@ -694,15 +691,17 @@
3972 asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
3973
3974 /*
3975- * Restore %fs and %gs.
3976+ * Restore %fs and %gs if needed.
3977 */
3978- loadsegment(fs, next->fs);
3979- loadsegment(gs, next->gs);
3980+ if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
3981+ loadsegment(fs, next->fs);
3982+ loadsegment(gs, next->gs);
3983+ }
3984
3985 /*
3986 * Now maybe reload the debug registers
3987 */
3988- if (next->debugreg[7]){
3989+ if (unlikely(next->debugreg[7])) {
3990 loaddebug(next, 0);
3991 loaddebug(next, 1);
3992 loaddebug(next, 2);
3993@@ -712,7 +711,7 @@
3994 loaddebug(next, 7);
3995 }
3996
3997- if (prev->ioperm || next->ioperm) {
3998+ if (unlikely(prev->ioperm || next->ioperm)) {
3999 if (next->ioperm) {
4000 /*
4001 * 4 cachelines copy ... not good, but not that
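[Annotation, not part of the patch] The process.c hunks above wrap rarely-true conditions (debug registers in use, I/O permission bitmaps, segment reloads) in unlikely(), and the common mm-switch path in likely(), so GCC lays out the fast path as the fall-through branch. In 2.4-era kernels these annotations are thin wrappers around __builtin_expect(); a minimal sketch of the idea, with macro bodies approximated rather than copied from this patch:

/* Illustrative sketch only. */
#include <stdio.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

int main(void)
{
	int debugreg7 = 0;	/* almost always zero: nobody is being debugged */

	if (unlikely(debugreg7))
		puts("slow path: reload debug registers");
	else
		puts("fast path");
	return 0;
}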
4002--- linux/arch/i386/kernel/apic.c.orig Tue Feb 5 13:51:51 2002
4003+++ linux/arch/i386/kernel/apic.c Tue Feb 5 13:52:12 2002
4004@@ -796,8 +796,7 @@
4005 */
4006
4007 slice = clocks / (smp_num_cpus+1);
4008- printk("cpu: %d, clocks: %d, slice: %d\n",
4009- smp_processor_id(), clocks, slice);
4010+ printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice);
4011
4012 /*
4013 * Wait for IRQ0's slice:
4014@@ -820,8 +819,7 @@
4015
4016 __setup_APIC_LVTT(clocks);
4017
4018- printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n",
4019- smp_processor_id(), t0, t1, delta, slice, clocks);
4020+ printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", smp_processor_id(), t0, t1, delta, slice, clocks);
4021
4022 __restore_flags(flags);
4023 }
4024@@ -922,6 +920,26 @@
4025
4026 /* and update all other cpus */
4027 smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1);
4028+}
4029+
4030+void __init disable_APIC_timer(void)
4031+{
4032+ if (using_apic_timer) {
4033+ unsigned long v;
4034+
4035+ v = apic_read(APIC_LVTT);
4036+ apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
4037+ }
4038+}
4039+
4040+void enable_APIC_timer(void)
4041+{
4042+ if (using_apic_timer) {
4043+ unsigned long v;
4044+
4045+ v = apic_read(APIC_LVTT);
4046+ apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
4047+ }
4048 }
4049
4050 /*
4051--- linux/arch/i386/kernel/nmi.c.orig Tue Feb 5 13:51:36 2002
4052+++ linux/arch/i386/kernel/nmi.c Tue Feb 5 13:52:12 2002
4053@@ -283,7 +283,7 @@
4054 * to get a message out.
4055 */
4056 bust_spinlocks(1);
4057- printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
4058+ printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
4059 show_registers(regs);
4060 printk("console shuts up ...\n");
4061 console_silent();
4062--- linux/arch/i386/kernel/smp.c.orig Tue Feb 5 13:51:49 2002
4063+++ linux/arch/i386/kernel/smp.c Tue Feb 5 13:52:12 2002
4064@@ -105,7 +105,7 @@
4065 /* The 'big kernel lock' */
4066 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
4067
4068-struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }};
4069+struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }};
4070
4071 /*
4072 * the following functions deal with sending IPIs between CPUs.
4073@@ -485,15 +485,54 @@
4074 do_flush_tlb_all_local();
4075 }
4076
4077+static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED;
4078+static task_t *new_task;
4079+
4080+/*
4081+ * This function sends a 'task migration' IPI to another CPU.
4082+ * Must be called from syscall contexts, with interrupts *enabled*.
4083+ */
4084+void smp_migrate_task(int cpu, task_t *p)
4085+{
4086+ /*
4087+ * The target CPU will unlock the migration spinlock:
4088+ */
4089+ spin_lock(&migration_lock);
4090+ new_task = p;
4091+ send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR);
4092+}
4093+
4094+/*
4095+ * Task migration callback.
4096+ */
4097+asmlinkage void smp_task_migration_interrupt(void)
4098+{
4099+ task_t *p;
4100+
4101+ ack_APIC_irq();
4102+ p = new_task;
4103+ spin_unlock(&migration_lock);
4104+ sched_task_migrated(p);
4105+}
4106 /*
4107 * this function sends a 'reschedule' IPI to another CPU.
4108 * it goes straight through and wastes no time serializing
4109 * anything. Worst case is that we lose a reschedule ...
4110 */
4111-
4112 void smp_send_reschedule(int cpu)
4113 {
4114 send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
4115+}
4116+
4117+/*
4118+ * this function sends a reschedule IPI to all (other) CPUs.
4119+ * This should only be used if some 'global' task became runnable,
4120+ * such as a RT task, that must be handled now. The first CPU
4121+ * that manages to grab the task will run it.
4122+ */
4123+void smp_send_reschedule_all(void)
4124+{
4125+ send_IPI_allbutself(RESCHEDULE_VECTOR);
4126 }
4127
4128 /*
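[Annotation, not part of the patch] The smp_migrate_task() / smp_task_migration_interrupt() pair above implements a one-slot handoff: the sending CPU takes migration_lock, publishes the task pointer, and fires the TASK_MIGRATION_VECTOR IPI; the target CPU consumes the pointer and releases the lock, so only one migration can be in flight at a time. A loose user-space analogue of that handshake, where semaphores stand in for the spinlock and the IPI (an illustration of the pattern, not kernel code):

/* Illustrative sketch only -- build with: cc -pthread handoff.c */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

static sem_t migration_lock;	/* "spinlock": released by the receiver */
static sem_t ipi;		/* stands in for TASK_MIGRATION_VECTOR */
static const char *new_task;	/* the single shared mailbox slot */

static void smp_migrate_task(const char *p)	/* sender, "syscall context" */
{
	sem_wait(&migration_lock);	/* the target CPU will post it back */
	new_task = p;
	sem_post(&ipi);			/* "send_IPI_mask(1 << cpu, ...)" */
}

static void *target_cpu(void *unused)		/* receiver, "IRQ handler" */
{
	const char *p;

	(void)unused;
	sem_wait(&ipi);			/* the "IPI" arrives */
	p = new_task;
	sem_post(&migration_lock);	/* let the next migration proceed */
	printf("sched_task_migrated(%s)\n", p);
	return NULL;
}

int main(void)
{
	pthread_t t;

	sem_init(&migration_lock, 0, 1);
	sem_init(&ipi, 0, 0);
	pthread_create(&t, NULL, target_cpu, NULL);
	smp_migrate_task("task 1234");
	pthread_join(&t, NULL);
	return 0;
}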
4129--- linux/arch/i386/kernel/i8259.c.orig Tue Feb 5 13:51:36 2002
4130+++ linux/arch/i386/kernel/i8259.c Tue Feb 5 13:52:12 2002
4131@@ -79,6 +79,7 @@
4132 * through the ICC by us (IPIs)
4133 */
4134 #ifdef CONFIG_SMP
4135+BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR)
4136 BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
4137 BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
4138 BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
4139@@ -472,6 +473,9 @@
4140 * IPI, driven by wakeup.
4141 */
4142 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
4143+
4144+ /* IPI for task migration */
4145+ set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt);
4146
4147 /* IPI for invalidation */
4148 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
4149--- linux/arch/i386/kernel/entry.S.orig Tue Feb 5 13:51:51 2002
4150+++ linux/arch/i386/kernel/entry.S Tue Feb 5 13:52:12 2002
4151@@ -77,7 +77,7 @@
4152 exec_domain = 16
4153 need_resched = 20
4154 tsk_ptrace = 24
4155-processor = 52
4156+cpu = 32
4157
4158 ENOSYS = 38
4159
4160@@ -176,9 +176,11 @@
4161
4162
4163 ENTRY(ret_from_fork)
4164+#if CONFIG_SMP
4165 pushl %ebx
4166 call SYMBOL_NAME(schedule_tail)
4167 addl $4, %esp
4168+#endif
4169 GET_CURRENT(%ebx)
4170 testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
4171 jne tracesys_exit
4172--- linux/arch/i386/kernel/setup.c.orig Tue Feb 5 13:51:51 2002
4173+++ linux/arch/i386/kernel/setup.c Tue Feb 5 13:52:12 2002
4174@@ -2924,9 +2924,10 @@
4175 load_TR(nr);
4176 load_LDT(&init_mm);
4177
4178- /*
4179- * Clear all 6 debug registers:
4180- */
4181+ /* Clear %fs and %gs. */
4182+ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
4183+
4184+ /* Clear all 6 debug registers: */
4185
4186 #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
4187