]>
Commit | Line | Data |
---|---|---|
d48ecf2a JR |
1 | --- linux/fs/proc/proc_misc.c.orig Sun Jan 6 13:55:55 2002 |
2 | +++ linux/fs/proc/proc_misc.c Sun Jan 6 13:56:25 2002 | |
3 | @@ -85,11 +85,11 @@ | |
4 | a = avenrun[0] + (FIXED_1/200); | |
5 | b = avenrun[1] + (FIXED_1/200); | |
6 | c = avenrun[2] + (FIXED_1/200); | |
7 | - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n", | |
8 | + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", | |
9 | LOAD_INT(a), LOAD_FRAC(a), | |
10 | LOAD_INT(b), LOAD_FRAC(b), | |
11 | LOAD_INT(c), LOAD_FRAC(c), | |
12 | - nr_running, nr_threads, last_pid); | |
13 | + nr_running(), nr_threads, last_pid); | |
14 | return proc_calc_metrics(page, start, off, count, eof, len); | |
15 | } | |
16 | ||
17 | @@ -101,7 +101,7 @@ | |
18 | int len; | |
19 | ||
20 | uptime = jiffies; | |
21 | - idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime; | |
22 | + idle = init_task.times.tms_utime + init_task.times.tms_stime; | |
23 | ||
24 | /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but | |
25 | that would overflow about every five days at HZ == 100. | |
26 | @@ -303,10 +303,10 @@ | |
27 | } | |
28 | ||
29 | len += sprintf(page + len, | |
30 | - "\nctxt %u\n" | |
31 | + "\nctxt %lu\n" | |
32 | "btime %lu\n" | |
33 | "processes %lu\n", | |
34 | - kstat.context_swtch, | |
35 | + nr_context_switches(), | |
36 | xtime.tv_sec - jif / HZ, | |
37 | total_forks); | |
38 | ||
39 | --- linux/fs/proc/array.c.orig Sun Jan 6 13:55:51 2002 | |
40 | +++ linux/fs/proc/array.c Mon Jan 7 20:01:05 2002 | |
41 | @@ -335,9 +335,12 @@ | |
42 | ||
43 | /* scale priority and nice values from timeslices to -20..20 */ | |
44 | /* to make it look like a "normal" Unix priority/nice value */ | |
45 | - priority = task->counter; | |
46 | - priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER; | |
47 | - nice = task->nice; | |
48 | + priority = task->prio; | |
49 | + if (priority >= MAX_RT_PRIO) | |
50 | + priority -= MAX_RT_PRIO; | |
51 | + else | |
52 | + priority = priority-100; | |
53 | + nice = task->__nice; | |
54 | ||
55 | read_lock(&tasklist_lock); | |
56 | ppid = task->pid ? task->p_opptr->pid : 0; | |
57 | @@ -387,7 +390,7 @@ | |
58 | task->nswap, | |
59 | task->cnswap, | |
60 | task->exit_signal, | |
61 | - task->processor); | |
62 | + task->cpu); | |
63 | if(mm) | |
64 | mmput(mm); | |
65 | return res; | |
66 | --- linux/fs/nfs/pagelist.c.orig Sun Jan 6 13:55:57 2002 | |
67 | +++ linux/fs/nfs/pagelist.c Sun Jan 6 13:56:25 2002 | |
68 | @@ -96,8 +96,7 @@ | |
69 | continue; | |
70 | if (signalled() && (server->flags & NFS_MOUNT_INTR)) | |
71 | return ERR_PTR(-ERESTARTSYS); | |
72 | - current->policy = SCHED_YIELD; | |
73 | - schedule(); | |
74 | + yield(); | |
75 | } | |
76 | ||
77 | /* Initialize the request struct. Initially, we assume a | |
78 | --- linux/fs/ufs/truncate.c.orig Sun Jan 6 13:55:55 2002 | |
79 | +++ linux/fs/ufs/truncate.c Sun Jan 6 13:56:25 2002 | |
80 | @@ -448,10 +448,7 @@ | |
81 | if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) | |
82 | ufs_sync_inode (inode); | |
83 | run_task_queue(&tq_disk); | |
84 | - current->policy |= SCHED_YIELD; | |
85 | - schedule (); | |
86 | - | |
87 | - | |
88 | + yield(); | |
89 | } | |
90 | offset = inode->i_size & uspi->s_fshift; | |
91 | if (offset) { | |
92 | --- linux/fs/reiserfs/buffer2.c.orig Sun Jan 6 13:55:57 2002 | |
93 | +++ linux/fs/reiserfs/buffer2.c Sun Jan 6 13:56:25 2002 | |
94 | @@ -33,8 +33,7 @@ | |
95 | buffer_journal_dirty(bh) ? ' ' : '!'); | |
96 | } | |
97 | run_task_queue(&tq_disk); | |
98 | - current->policy |= SCHED_YIELD; | |
99 | - schedule(); | |
100 | + yield(); | |
101 | } | |
102 | if (repeat_counter > 30000000) { | |
103 | reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ; | |
104 | @@ -52,11 +51,11 @@ | |
105 | struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size) | |
106 | { | |
107 | struct buffer_head *result; | |
108 | - PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); | |
109 | + PROC_EXP( unsigned int ctx_switches = nr_context_switches(); ); | |
110 | ||
111 | result = bread (super -> s_dev, n_block, n_size); | |
112 | PROC_INFO_INC( super, breads ); | |
113 | - PROC_EXP( if( kstat.context_swtch != ctx_switches ) | |
114 | + PROC_EXP( if( nr_context_switches() != ctx_switches ) | |
115 | PROC_INFO_INC( super, bread_miss ) ); | |
116 | return result; | |
117 | } | |
118 | --- linux/fs/reiserfs/journal.c.orig Sun Jan 6 13:55:57 2002 | |
119 | +++ linux/fs/reiserfs/journal.c Sun Jan 6 13:56:25 2002 | |
120 | @@ -149,8 +149,7 @@ | |
121 | } | |
122 | bn = allocate_bitmap_node(p_s_sb) ; | |
123 | if (!bn) { | |
124 | - current->policy |= SCHED_YIELD ; | |
125 | - schedule() ; | |
126 | + yield(); | |
127 | goto repeat ; | |
128 | } | |
129 | return bn ; | |
130 | --- linux/fs/jffs2/background.c.orig Sun Jan 6 13:55:53 2002 | |
131 | +++ linux/fs/jffs2/background.c Sun Jan 6 13:56:25 2002 | |
132 | @@ -106,9 +106,6 @@ | |
133 | ||
134 | sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index); | |
135 | ||
136 | - /* FIXME in the 2.2 backport */ | |
137 | - current->nice = 10; | |
138 | - | |
139 | for (;;) { | |
140 | spin_lock_irq(¤t->sigmask_lock); | |
141 | siginitsetinv (¤t->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT)); | |
142 | --- linux/fs/jbd/journal.c.orig Sun Jan 6 13:55:57 2002 | |
143 | +++ linux/fs/jbd/journal.c Sun Jan 6 13:56:25 2002 | |
144 | @@ -460,8 +460,7 @@ | |
145 | printk (KERN_NOTICE __FUNCTION__ | |
146 | ": ENOMEM at get_unused_buffer_head, " | |
147 | "trying again.\n"); | |
148 | - current->policy |= SCHED_YIELD; | |
149 | - schedule(); | |
150 | + yield(); | |
151 | } | |
152 | } while (!new_bh); | |
153 | /* keep subsequent assertions sane */ | |
154 | @@ -1539,8 +1538,7 @@ | |
155 | last_warning = jiffies; | |
156 | } | |
157 | ||
158 | - current->policy |= SCHED_YIELD; | |
159 | - schedule(); | |
160 | + yield(); | |
161 | } | |
162 | } | |
163 | ||
164 | @@ -1598,8 +1596,7 @@ | |
165 | last_warning = jiffies; | |
166 | } | |
167 | while (ret == 0) { | |
168 | - current->policy |= SCHED_YIELD; | |
169 | - schedule(); | |
170 | + yield(); | |
171 | ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); | |
172 | } | |
173 | } | |
174 | --- linux/fs/jbd/revoke.c.orig Sun Jan 6 13:55:57 2002 | |
175 | +++ linux/fs/jbd/revoke.c Sun Jan 6 13:56:25 2002 | |
176 | @@ -137,8 +137,7 @@ | |
177 | if (!journal_oom_retry) | |
178 | return -ENOMEM; | |
179 | jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n"); | |
180 | - current->policy |= SCHED_YIELD; | |
181 | - schedule(); | |
182 | + yield(); | |
183 | goto repeat; | |
184 | } | |
185 | ||
186 | --- linux/fs/jbd/transaction.c.orig Sun Jan 6 13:55:57 2002 | |
187 | +++ linux/fs/jbd/transaction.c Sun Jan 6 13:56:25 2002 | |
188 | @@ -1377,8 +1377,7 @@ | |
189 | do { | |
190 | old_handle_count = transaction->t_handle_count; | |
191 | set_current_state(TASK_RUNNING); | |
192 | - current->policy |= SCHED_YIELD; | |
193 | - schedule(); | |
194 | + yield(); | |
195 | } while (old_handle_count != transaction->t_handle_count); | |
196 | } | |
197 | ||
198 | --- linux/fs/binfmt_elf.c.orig Sun Jan 6 13:55:57 2002 | |
199 | +++ linux/fs/binfmt_elf.c Sun Jan 6 13:56:25 2002 | |
200 | @@ -1143,7 +1143,7 @@ | |
201 | psinfo.pr_state = i; | |
202 | psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i]; | |
203 | psinfo.pr_zomb = psinfo.pr_sname == 'Z'; | |
204 | - psinfo.pr_nice = current->nice; | |
205 | + psinfo.pr_nice = current->__nice; | |
206 | psinfo.pr_flag = current->flags; | |
207 | psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); | |
208 | psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); | |
209 | --- linux/fs/buffer.c.orig Sun Jan 6 13:55:57 2002 | |
210 | +++ linux/fs/buffer.c Fri Jan 25 14:25:56 2002 | |
211 | @@ -725,9 +725,8 @@ | |
212 | wakeup_bdflush(); | |
213 | try_to_free_pages(zone, GFP_NOFS, 0); | |
214 | run_task_queue(&tq_disk); | |
215 | - current->policy |= SCHED_YIELD; | |
216 | __set_current_state(TASK_RUNNING); | |
217 | - schedule(); | |
218 | + sys_sched_yield(); | |
219 | } | |
220 | ||
221 | void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) | |
222 | --- linux/fs/locks.c.orig Sun Jan 6 13:55:51 2002 | |
223 | +++ linux/fs/locks.c Sun Jan 6 13:56:25 2002 | |
224 | @@ -445,8 +445,7 @@ | |
225 | /* Let the blocked process remove waiter from the | |
226 | * block list when it gets scheduled. | |
227 | */ | |
228 | - current->policy |= SCHED_YIELD; | |
229 | - schedule(); | |
230 | + yield(); | |
231 | } else { | |
232 | /* Remove waiter from the block list, because by the | |
233 | * time it wakes up blocker won't exist any more. | |
234 | --- linux/init/main.c.orig Sun Jan 6 13:55:57 2002 | |
235 | +++ linux/init/main.c Mon Jan 28 18:12:51 2002 | |
236 | @@ -482,8 +482,6 @@ | |
237 | extern void setup_arch(char **); | |
238 | extern void cpu_idle(void); | |
239 | ||
240 | -unsigned long wait_init_idle; | |
241 | - | |
242 | #ifndef CONFIG_SMP | |
243 | ||
244 | #ifdef CONFIG_X86_LOCAL_APIC | |
245 | @@ -492,34 +490,24 @@ | |
246 | APIC_init_uniprocessor(); | |
247 | } | |
248 | #else | |
249 | -#define smp_init() do { } while (0) | |
250 | +#define smp_init() do { } while (0) | |
251 | #endif | |
252 | ||
253 | #else | |
254 | ||
255 | - | |
256 | /* Called by boot processor to activate the rest. */ | |
257 | static void __init smp_init(void) | |
258 | { | |
259 | /* Get other processors into their bootup holding patterns. */ | |
260 | smp_boot_cpus(); | |
261 | - wait_init_idle = cpu_online_map; | |
262 | - clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */ | |
263 | ||
264 | smp_threads_ready=1; | |
265 | smp_commence(); | |
266 | - | |
267 | - /* Wait for the other cpus to set up their idle processes */ | |
268 | - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle); | |
269 | - while (wait_init_idle) { | |
270 | - cpu_relax(); | |
271 | - barrier(); | |
272 | - } | |
273 | - printk("All processors have done init_idle\n"); | |
274 | } | |
275 | ||
276 | #endif | |
277 | ||
278 | + | |
279 | /* | |
280 | * We need to finalize in a non-__init function or else race conditions | |
281 | * between the root thread and the init thread may cause start_kernel to | |
282 | @@ -531,9 +519,8 @@ | |
283 | { | |
284 | kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); | |
285 | unlock_kernel(); | |
286 | - current->need_resched = 1; | |
287 | - cpu_idle(); | |
288 | -} | |
289 | + cpu_idle(); | |
290 | +} | |
291 | ||
292 | /* | |
293 | * Activate the first processor. | |
294 | @@ -611,14 +598,18 @@ | |
295 | ipc_init(); | |
296 | #endif | |
297 | check_bugs(); | |
298 | + | |
299 | printk("POSIX conformance testing by UNIFIX\n"); | |
300 | ||
301 | - /* | |
302 | - * We count on the initial thread going ok | |
303 | - * Like idlers init is an unlocked kernel thread, which will | |
304 | - * make syscalls (and thus be locked). | |
305 | + init_idle(current, smp_processor_id()); | |
306 | + /* | |
307 | + * We count on the initial thread going ok | |
308 | + * Like idlers init is an unlocked kernel thread, which will | |
309 | + * make syscalls (and thus be locked). | |
310 | */ | |
311 | smp_init(); | |
312 | + | |
313 | + /* Do the rest non-__init'ed, we're now alive */ | |
314 | rest_init(); | |
315 | } | |
316 | ||
317 | @@ -779,12 +770,9 @@ | |
318 | int i, pid; | |
319 | ||
320 | pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD); | |
321 | - if (pid > 0) { | |
322 | - while (pid != wait(&i)) { | |
323 | - current->policy |= SCHED_YIELD; | |
324 | - schedule(); | |
325 | - } | |
326 | - } | |
327 | + if (pid > 0) | |
328 | + while (pid != wait(&i)) | |
329 | + yield(); | |
330 | if (MAJOR(real_root_dev) != RAMDISK_MAJOR | |
331 | || MINOR(real_root_dev) != 0) { | |
332 | error = change_root(real_root_dev,"/initrd"); | |
333 | --- linux/kernel/sched.c.orig Sun Jan 6 13:55:57 2002 | |
334 | +++ linux/kernel/sched.c Mon Jan 28 18:41:54 2002 | |
335 | @@ -12,333 +12,249 @@ | |
336 | * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar | |
337 | */ | |
338 | ||
339 | -/* | |
340 | - * 'sched.c' is the main kernel file. It contains scheduling primitives | |
341 | - * (sleep_on, wakeup, schedule etc) as well as a number of simple system | |
342 | - * call functions (type getpid()), which just extract a field from | |
343 | - * current-task | |
344 | - */ | |
345 | - | |
346 | -#include <linux/config.h> | |
347 | #include <linux/mm.h> | |
348 | +#include <linux/nmi.h> | |
349 | #include <linux/init.h> | |
350 | +#include <asm/uaccess.h> | |
351 | #include <linux/smp_lock.h> | |
352 | -#include <linux/nmi.h> | |
353 | #include <linux/interrupt.h> | |
354 | -#include <linux/kernel_stat.h> | |
355 | -#include <linux/completion.h> | |
356 | -#include <linux/prefetch.h> | |
357 | -#include <linux/compiler.h> | |
358 | - | |
359 | -#include <asm/uaccess.h> | |
360 | #include <asm/mmu_context.h> | |
361 | ||
362 | -extern void timer_bh(void); | |
363 | -extern void tqueue_bh(void); | |
364 | -extern void immediate_bh(void); | |
365 | - | |
366 | -/* | |
367 | - * scheduler variables | |
368 | - */ | |
369 | +#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long)) | |
370 | ||
371 | -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ | |
372 | +typedef struct runqueue runqueue_t; | |
373 | ||
374 | -extern void mem_use(void); | |
375 | +struct prio_array { | |
376 | + int nr_active; | |
377 | + spinlock_t *lock; | |
378 | + runqueue_t *rq; | |
379 | + unsigned long bitmap[BITMAP_SIZE]; | |
380 | + list_t queue[MAX_PRIO]; | |
381 | +}; | |
382 | ||
383 | /* | |
384 | - * Scheduling quanta. | |
385 | + * This is the main, per-CPU runqueue data structure. | |
386 | * | |
387 | - * NOTE! The unix "nice" value influences how long a process | |
388 | - * gets. The nice value ranges from -20 to +19, where a -20 | |
389 | - * is a "high-priority" task, and a "+10" is a low-priority | |
390 | - * task. | |
391 | - * | |
392 | - * We want the time-slice to be around 50ms or so, so this | |
393 | - * calculation depends on the value of HZ. | |
394 | + * Locking rule: in those places that want to lock multiple runqueues | |
395 | + * (such as the load balancing or the process migration code), lock | |
396 | + * acquire operations must be ordered by ascending &runqueue. | |
397 | */ | |
398 | -#if HZ < 200 | |
399 | -#define TICK_SCALE(x) ((x) >> 2) | |
400 | -#elif HZ < 400 | |
401 | -#define TICK_SCALE(x) ((x) >> 1) | |
402 | -#elif HZ < 800 | |
403 | -#define TICK_SCALE(x) (x) | |
404 | -#elif HZ < 1600 | |
405 | -#define TICK_SCALE(x) ((x) << 1) | |
406 | -#else | |
407 | -#define TICK_SCALE(x) ((x) << 2) | |
408 | -#endif | |
409 | +struct runqueue { | |
410 | + spinlock_t lock; | |
411 | + unsigned long nr_running, nr_switches, expired_timestamp; | |
412 | + task_t *curr, *idle; | |
413 | + prio_array_t *active, *expired, arrays[2]; | |
414 | + int prev_nr_running[NR_CPUS]; | |
415 | +} ____cacheline_aligned; | |
416 | ||
417 | -#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1) | |
418 | +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; | |
419 | ||
420 | +#define cpu_rq(cpu) (runqueues + (cpu)) | |
421 | +#define this_rq() cpu_rq(smp_processor_id()) | |
422 | +#define task_rq(p) cpu_rq((p)->cpu) | |
423 | +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) | |
424 | +#define rt_task(p) ((p)->policy != SCHED_OTHER) | |
425 | + | |
426 | + | |
427 | +static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags) | |
428 | +{ | |
429 | + struct runqueue *__rq; | |
430 | + | |
431 | +repeat_lock_task: | |
432 | + __rq = task_rq(p); | |
433 | + spin_lock_irqsave(&__rq->lock, *flags); | |
434 | + if (unlikely(__rq != task_rq(p))) { | |
435 | + spin_unlock_irqrestore(&__rq->lock, *flags); | |
436 | + goto repeat_lock_task; | |
437 | + } | |
438 | + return __rq; | |
439 | +} | |
440 | ||
441 | +static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags) | |
442 | +{ | |
443 | + spin_unlock_irqrestore(&rq->lock, *flags); | |
444 | +} | |
445 | /* | |
446 | - * Init task must be ok at boot for the ix86 as we will check its signals | |
447 | - * via the SMP irq return path. | |
448 | + * Adding/removing a task to/from a priority array: | |
449 | */ | |
450 | - | |
451 | -struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; | |
452 | +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) | |
453 | +{ | |
454 | + array->nr_active--; | |
455 | + list_del_init(&p->run_list); | |
456 | + if (list_empty(array->queue + p->prio)) | |
457 | + __clear_bit(p->prio, array->bitmap); | |
458 | +} | |
459 | + | |
460 | +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) | |
461 | +{ | |
462 | + list_add_tail(&p->run_list, array->queue + p->prio); | |
463 | + __set_bit(p->prio, array->bitmap); | |
464 | + array->nr_active++; | |
465 | + p->array = array; | |
466 | +} | |
467 | ||
468 | /* | |
469 | - * The tasklist_lock protects the linked list of processes. | |
470 | + * A task is 'heavily interactive' if it either has reached the | |
471 | + * bottom 25% of the SCHED_OTHER priority range, or if it is below | |
472 | + * its default priority by at least 3 priority levels. In this | |
473 | + * case we favor it by reinserting it on the active array, | |
474 | + * even after it expired its current timeslice. | |
475 | * | |
476 | - * The runqueue_lock locks the parts that actually access | |
477 | - * and change the run-queues, and have to be interrupt-safe. | |
478 | + * A task is a 'CPU hog' if it's either in the upper 25% of the | |
479 | + * SCHED_OTHER priority range, or if it's not an interactive task. | |
480 | * | |
481 | - * If both locks are to be concurrently held, the runqueue_lock | |
482 | - * nests inside the tasklist_lock. | |
483 | + * A task can get a priority bonus by being 'somewhat | |
484 | + * interactive' - and it will get a priority penalty for | |
485 | + * being a CPU hog. | |
486 | * | |
487 | - * task->alloc_lock nests inside tasklist_lock. | |
488 | - */ | |
489 | -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ | |
490 | -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ | |
491 | - | |
492 | -static LIST_HEAD(runqueue_head); | |
493 | - | |
494 | -/* | |
495 | - * We align per-CPU scheduling data on cacheline boundaries, | |
496 | - * to prevent cacheline ping-pong. | |
497 | */ | |
498 | -static union { | |
499 | - struct schedule_data { | |
500 | - struct task_struct * curr; | |
501 | - cycles_t last_schedule; | |
502 | - } schedule_data; | |
503 | - char __pad [SMP_CACHE_BYTES]; | |
504 | -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; | |
505 | - | |
506 | -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr | |
507 | -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule | |
508 | - | |
509 | -struct kernel_stat kstat; | |
510 | -extern struct task_struct *child_reaper; | |
511 | - | |
512 | -#ifdef CONFIG_SMP | |
513 | - | |
514 | -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) | |
515 | -#define can_schedule(p,cpu) \ | |
516 | - ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) | |
517 | - | |
518 | -#else | |
519 | - | |
520 | -#define idle_task(cpu) (&init_task) | |
521 | -#define can_schedule(p,cpu) (1) | |
522 | - | |
523 | -#endif | |
524 | ||
525 | -void scheduling_functions_start_here(void) { } | |
526 | +#define PRIO_INTERACTIVE \ | |
527 | + (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_INTERACTIVE_RATIO/100) | |
528 | +#define PRIO_CPU_HOG \ | |
529 | + (MAX_RT_PRIO + MAX_USER_PRIO*PRIO_CPU_HOG_RATIO/100) | |
530 | + | |
531 | +#define TASK_INTERACTIVE(p) \ | |
532 | + (((p)->prio <= PRIO_INTERACTIVE) || \ | |
533 | + (((p)->prio < PRIO_CPU_HOG) && \ | |
534 | + ((p)->prio <= NICE_TO_PRIO((p)->__nice) - INTERACTIVE_DELTA))) | |
535 | ||
536 | /* | |
537 | - * This is the function that decides how desirable a process is.. | |
538 | - * You can weigh different processes against each other depending | |
539 | - * on what CPU they've run on lately etc to try to handle cache | |
540 | - * and TLB miss penalties. | |
541 | + * We place interactive tasks back into the active array, if possible. | |
542 | * | |
543 | - * Return values: | |
544 | - * -1000: never select this | |
545 | - * 0: out of time, recalculate counters (but it might still be | |
546 | - * selected) | |
547 | - * +ve: "goodness" value (the larger, the better) | |
548 | - * +1000: realtime process, select this. | |
549 | + * To guarantee that this does not starve expired tasks we ignore the | |
550 | + * interactivity of a task if the first expired task had to wait more | |
551 | + * than a 'reasonable' amount of time. This deadline timeout is | |
552 | + * load-dependent, as the frequency of array switches decreases with | |
553 | + * increasing number of running tasks: | |
554 | */ | |
555 | +#define EXPIRED_STARVING(rq) \ | |
556 | + ((rq)->expired_timestamp && \ | |
557 | + (jiffies - (rq)->expired_timestamp >= \ | |
558 | + STARVATION_LIMIT * ((rq)->nr_running) + 1)) | |
559 | ||
560 | -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) | |
561 | +static inline int effective_prio(task_t *p) | |
562 | { | |
563 | - int weight; | |
564 | + int bonus, prio; | |
565 | ||
566 | /* | |
567 | - * select the current process after every other | |
568 | - * runnable process, but before the idle thread. | |
569 | - * Also, dont trigger a counter recalculation. | |
570 | + * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG] | |
571 | + * into the -14 ... +14 bonus/penalty range. | |
572 | + * | |
573 | + * We use 70% of the full 0...39 priority range so that: | |
574 | + * | |
575 | + * 1) nice +19 CPU hogs do not preempt nice 0 CPU hogs. | |
576 | + * 2) nice -20 interactive tasks do not get preempted by | |
577 | + * nice 0 interactive tasks. | |
578 | + * | |
579 | + * Both properties are important to certain workloads. | |
580 | */ | |
581 | - weight = -1; | |
582 | - if (p->policy & SCHED_YIELD) | |
583 | - goto out; | |
584 | + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - | |
585 | + MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; | |
586 | ||
587 | - /* | |
588 | - * Non-RT process - normal case first. | |
589 | - */ | |
590 | - if (p->policy == SCHED_OTHER) { | |
591 | + prio = NICE_TO_PRIO(p->__nice) - bonus; | |
592 | + if (prio < MAX_RT_PRIO) | |
593 | + prio = MAX_RT_PRIO; | |
594 | + if (prio > MAX_PRIO-1) | |
595 | + prio = MAX_PRIO-1; | |
596 | + return prio; | |
597 | +} | |
598 | + | |
599 | +static inline void activate_task(task_t *p, runqueue_t *rq) | |
600 | +{ | |
601 | + unsigned long sleep_time = jiffies - p->sleep_timestamp; | |
602 | + prio_array_t *array = rq->active; | |
603 | + | |
604 | + if (!rt_task(p) && sleep_time) { | |
605 | /* | |
606 | - * Give the process a first-approximation goodness value | |
607 | - * according to the number of clock-ticks it has left. | |
608 | - * | |
609 | - * Don't do any other calculations if the time slice is | |
610 | - * over.. | |
611 | + * This code gives a bonus to interactive tasks. We update | |
612 | + * an 'average sleep time' value here, based on | |
613 | + * sleep_timestamp. The more time a task spends sleeping, | |
614 | + * the higher the average gets - and the higher the priority | |
615 | + * boost gets as well. | |
616 | */ | |
617 | - weight = p->counter; | |
618 | - if (!weight) | |
619 | - goto out; | |
620 | - | |
621 | -#ifdef CONFIG_SMP | |
622 | - /* Give a largish advantage to the same processor... */ | |
623 | - /* (this is equivalent to penalizing other processors) */ | |
624 | - if (p->processor == this_cpu) | |
625 | - weight += PROC_CHANGE_PENALTY; | |
626 | -#endif | |
627 | - | |
628 | - /* .. and a slight advantage to the current MM */ | |
629 | - if (p->mm == this_mm || !p->mm) | |
630 | - weight += 1; | |
631 | - weight += 20 - p->nice; | |
632 | - goto out; | |
633 | + p->sleep_avg += sleep_time; | |
634 | + if (p->sleep_avg > MAX_SLEEP_AVG) | |
635 | + p->sleep_avg = MAX_SLEEP_AVG; | |
636 | + p->prio = effective_prio(p); | |
637 | } | |
638 | + enqueue_task(p, array); | |
639 | + rq->nr_running++; | |
640 | +} | |
641 | ||
642 | - /* | |
643 | - * Realtime process, select the first one on the | |
644 | - * runqueue (taking priorities within processes | |
645 | - * into account). | |
646 | - */ | |
647 | - weight = 1000 + p->rt_priority; | |
648 | -out: | |
649 | - return weight; | |
650 | +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) | |
651 | +{ | |
652 | + rq->nr_running--; | |
653 | + dequeue_task(p, p->array); | |
654 | + p->array = NULL; | |
655 | + p->sleep_timestamp = jiffies; | |
656 | } | |
657 | ||
658 | -/* | |
659 | - * the 'goodness value' of replacing a process on a given CPU. | |
660 | - * positive value means 'replace', zero or negative means 'dont'. | |
661 | - */ | |
662 | -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) | |
663 | +static inline void resched_task(task_t *p) | |
664 | { | |
665 | - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); | |
666 | + int need_resched; | |
667 | + | |
668 | + need_resched = p->need_resched; | |
669 | + wmb(); | |
670 | + p->need_resched = 1; | |
671 | + if (!need_resched && (p->cpu != smp_processor_id())) | |
672 | + smp_send_reschedule(p->cpu); | |
673 | } | |
674 | ||
675 | +#ifdef CONFIG_SMP | |
676 | + | |
677 | /* | |
678 | - * This is ugly, but reschedule_idle() is very timing-critical. | |
679 | - * We are called with the runqueue spinlock held and we must | |
680 | - * not claim the tasklist_lock. | |
681 | + * Wait for a process to unschedule. This is used by the exit() and | |
682 | + * ptrace() code. | |
683 | */ | |
684 | -static FASTCALL(void reschedule_idle(struct task_struct * p)); | |
685 | - | |
686 | -static void reschedule_idle(struct task_struct * p) | |
687 | +void wait_task_inactive(task_t * p) | |
688 | { | |
689 | -#ifdef CONFIG_SMP | |
690 | - int this_cpu = smp_processor_id(); | |
691 | - struct task_struct *tsk, *target_tsk; | |
692 | - int cpu, best_cpu, i, max_prio; | |
693 | - cycles_t oldest_idle; | |
694 | + unsigned long flags; | |
695 | + runqueue_t *rq; | |
696 | ||
697 | - /* | |
698 | - * shortcut if the woken up task's last CPU is | |
699 | - * idle now. | |
700 | - */ | |
701 | - best_cpu = p->processor; | |
702 | - if (can_schedule(p, best_cpu)) { | |
703 | - tsk = idle_task(best_cpu); | |
704 | - if (cpu_curr(best_cpu) == tsk) { | |
705 | - int need_resched; | |
706 | -send_now_idle: | |
707 | - /* | |
708 | - * If need_resched == -1 then we can skip sending | |
709 | - * the IPI altogether, tsk->need_resched is | |
710 | - * actively watched by the idle thread. | |
711 | - */ | |
712 | - need_resched = tsk->need_resched; | |
713 | - tsk->need_resched = 1; | |
714 | - if ((best_cpu != this_cpu) && !need_resched) | |
715 | - smp_send_reschedule(best_cpu); | |
716 | - return; | |
717 | - } | |
718 | +repeat: | |
719 | + rq = task_rq(p); | |
720 | + while (unlikely(rq->curr == p)) { | |
721 | + cpu_relax(); | |
722 | + barrier(); | |
723 | + } | |
724 | + rq = lock_task_rq(p, &flags); | |
725 | + if (unlikely(rq->curr == p)) { | |
726 | + unlock_task_rq(rq, &flags); | |
727 | + goto repeat; | |
728 | } | |
729 | - | |
730 | - /* | |
731 | - * We know that the preferred CPU has a cache-affine current | |
732 | - * process, lets try to find a new idle CPU for the woken-up | |
733 | - * process. Select the least recently active idle CPU. (that | |
734 | - * one will have the least active cache context.) Also find | |
735 | - * the executing process which has the least priority. | |
736 | - */ | |
737 | - oldest_idle = (cycles_t) -1; | |
738 | - target_tsk = NULL; | |
739 | - max_prio = 0; | |
740 | - | |
741 | - for (i = 0; i < smp_num_cpus; i++) { | |
742 | - cpu = cpu_logical_map(i); | |
743 | - if (!can_schedule(p, cpu)) | |
744 | - continue; | |
745 | - tsk = cpu_curr(cpu); | |
746 | - /* | |
747 | - * We use the first available idle CPU. This creates | |
748 | - * a priority list between idle CPUs, but this is not | |
749 | - * a problem. | |
750 | - */ | |
751 | - if (tsk == idle_task(cpu)) { | |
752 | -#if defined(__i386__) && defined(CONFIG_SMP) | |
753 | - /* | |
754 | - * Check if two siblings are idle in the same | |
755 | - * physical package. Use them if found. | |
756 | - */ | |
757 | - if (smp_num_siblings == 2) { | |
758 | - if (cpu_curr(cpu_sibling_map[cpu]) == | |
759 | - idle_task(cpu_sibling_map[cpu])) { | |
760 | - oldest_idle = last_schedule(cpu); | |
761 | - target_tsk = tsk; | |
762 | - break; | |
763 | - } | |
764 | - | |
765 | - } | |
766 | -#endif | |
767 | - if (last_schedule(cpu) < oldest_idle) { | |
768 | - oldest_idle = last_schedule(cpu); | |
769 | - target_tsk = tsk; | |
770 | - } | |
771 | - } else { | |
772 | - if (oldest_idle == -1ULL) { | |
773 | - int prio = preemption_goodness(tsk, p, cpu); | |
774 | - | |
775 | - if (prio > max_prio) { | |
776 | - max_prio = prio; | |
777 | - target_tsk = tsk; | |
778 | - } | |
779 | - } | |
780 | - } | |
781 | - } | |
782 | - tsk = target_tsk; | |
783 | - if (tsk) { | |
784 | - if (oldest_idle != -1ULL) { | |
785 | - best_cpu = tsk->processor; | |
786 | - goto send_now_idle; | |
787 | - } | |
788 | - tsk->need_resched = 1; | |
789 | - if (tsk->processor != this_cpu) | |
790 | - smp_send_reschedule(tsk->processor); | |
791 | - } | |
792 | - return; | |
793 | - | |
794 | - | |
795 | -#else /* UP */ | |
796 | - int this_cpu = smp_processor_id(); | |
797 | - struct task_struct *tsk; | |
798 | - | |
799 | - tsk = cpu_curr(this_cpu); | |
800 | - if (preemption_goodness(tsk, p, this_cpu) > 0) | |
801 | - tsk->need_resched = 1; | |
802 | -#endif | |
803 | + unlock_task_rq(rq, &flags); | |
804 | } | |
805 | ||
806 | /* | |
807 | - * Careful! | |
808 | + * The SMP message passing code calls this function whenever | |
809 | + * the new task has arrived at the target CPU. We move the | |
810 | + * new task into the local runqueue. | |
811 | * | |
812 | - * This has to add the process to the _beginning_ of the | |
813 | - * run-queue, not the end. See the comment about "This is | |
814 | - * subtle" in the scheduler proper.. | |
815 | + * This function must be called with interrupts disabled. | |
816 | */ | |
817 | -static inline void add_to_runqueue(struct task_struct * p) | |
818 | +void sched_task_migrated(task_t *new_task) | |
819 | { | |
820 | - list_add(&p->run_list, &runqueue_head); | |
821 | - nr_running++; | |
822 | + wait_task_inactive(new_task); | |
823 | + new_task->cpu = smp_processor_id(); | |
824 | + wake_up_process(new_task); | |
825 | } | |
826 | ||
827 | -static inline void move_last_runqueue(struct task_struct * p) | |
828 | -{ | |
829 | - list_del(&p->run_list); | |
830 | - list_add_tail(&p->run_list, &runqueue_head); | |
831 | -} | |
832 | - | |
833 | -static inline void move_first_runqueue(struct task_struct * p) | |
834 | +/* | |
835 | + * Kick the remote CPU if the task is running currently, | |
836 | + * this code is used by the signal code to signal tasks | |
837 | + * which are in user-mode as quickly as possible. | |
838 | + * | |
839 | + * (Note that we do this lockless - if the task does anything | |
840 | + * while the message is in flight then it will notice the | |
841 | + * sigpending condition anyway.) | |
842 | + */ | |
843 | +void kick_if_running(task_t * p) | |
844 | { | |
845 | - list_del(&p->run_list); | |
846 | - list_add(&p->run_list, &runqueue_head); | |
847 | + if (p == task_rq(p)->curr) | |
848 | + resched_task(p); | |
849 | } | |
850 | +#endif | |
851 | ||
852 | /* | |
853 | * Wake up a process. Put it on the run-queue if it's not | |
854 | @@ -348,392 +264,472 @@ | |
855 | * "current->state = TASK_RUNNING" to mark yourself runnable | |
856 | * without the overhead of this. | |
857 | */ | |
858 | -static inline int try_to_wake_up(struct task_struct * p, int synchronous) | |
859 | +static int try_to_wake_up(task_t * p, int synchronous) | |
860 | { | |
861 | unsigned long flags; | |
862 | int success = 0; | |
863 | + runqueue_t *rq; | |
864 | ||
865 | - /* | |
866 | - * We want the common case fall through straight, thus the goto. | |
867 | - */ | |
868 | - spin_lock_irqsave(&runqueue_lock, flags); | |
869 | + rq = lock_task_rq(p, &flags); | |
870 | p->state = TASK_RUNNING; | |
871 | - if (task_on_runqueue(p)) | |
872 | - goto out; | |
873 | - add_to_runqueue(p); | |
874 | - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) | |
875 | - reschedule_idle(p); | |
876 | - success = 1; | |
877 | -out: | |
878 | - spin_unlock_irqrestore(&runqueue_lock, flags); | |
879 | + if (!p->array) { | |
880 | + activate_task(p, rq); | |
881 | + if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio)) | |
882 | + resched_task(rq->curr); | |
883 | + success = 1; | |
884 | + } | |
885 | + unlock_task_rq(rq, &flags); | |
886 | return success; | |
887 | } | |
888 | ||
889 | -inline int wake_up_process(struct task_struct * p) | |
890 | +int wake_up_process(task_t * p) | |
891 | { | |
892 | return try_to_wake_up(p, 0); | |
893 | } | |
894 | ||
895 | -static void process_timeout(unsigned long __data) | |
896 | +void wake_up_forked_process(task_t * p) | |
897 | { | |
898 | - struct task_struct * p = (struct task_struct *) __data; | |
899 | + runqueue_t *rq = this_rq(); | |
900 | ||
901 | - wake_up_process(p); | |
902 | + p->state = TASK_RUNNING; | |
903 | + if (!rt_task(p)) { | |
904 | + current->sleep_avg = current->sleep_avg * PARENT_FORK_PENALTY / 100; | |
905 | + p->sleep_avg = p->sleep_avg * CHILD_FORK_PENALTY / 100; | |
906 | + p->prio = effective_prio(p); | |
907 | + } | |
908 | + spin_lock_irq(&rq->lock); | |
909 | + p->cpu = smp_processor_id(); | |
910 | + activate_task(p, rq); | |
911 | + spin_unlock_irq(&rq->lock); | |
912 | } | |
913 | ||
914 | -/** | |
915 | - * schedule_timeout - sleep until timeout | |
916 | - * @timeout: timeout value in jiffies | |
917 | - * | |
918 | - * Make the current task sleep until @timeout jiffies have | |
919 | - * elapsed. The routine will return immediately unless | |
920 | - * the current task state has been set (see set_current_state()). | |
921 | - * | |
922 | - * You can set the task state as follows - | |
923 | - * | |
924 | - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to | |
925 | - * pass before the routine returns. The routine will return 0 | |
926 | - * | |
927 | - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | |
928 | - * delivered to the current task. In this case the remaining time | |
929 | - * in jiffies will be returned, or 0 if the timer expired in time | |
930 | - * | |
931 | - * The current task state is guaranteed to be TASK_RUNNING when this | |
932 | - * routine returns. | |
933 | - * | |
934 | - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule | |
935 | - * the CPU away without a bound on the timeout. In this case the return | |
936 | - * value will be %MAX_SCHEDULE_TIMEOUT. | |
937 | - * | |
938 | - * In all cases the return value is guaranteed to be non-negative. | |
939 | - */ | |
940 | -signed long schedule_timeout(signed long timeout) | |
941 | +asmlinkage void schedule_tail(task_t *prev) | |
942 | { | |
943 | - struct timer_list timer; | |
944 | - unsigned long expire; | |
945 | - | |
946 | - switch (timeout) | |
947 | - { | |
948 | - case MAX_SCHEDULE_TIMEOUT: | |
949 | - /* | |
950 | - * These two special cases are useful to be comfortable | |
951 | - * in the caller. Nothing more. We could take | |
952 | - * MAX_SCHEDULE_TIMEOUT from one of the negative value | |
953 | - * but I' d like to return a valid offset (>=0) to allow | |
954 | - * the caller to do everything it want with the retval. | |
955 | - */ | |
956 | - schedule(); | |
957 | - goto out; | |
958 | - default: | |
959 | - /* | |
960 | - * Another bit of PARANOID. Note that the retval will be | |
961 | - * 0 since no piece of kernel is supposed to do a check | |
962 | - * for a negative retval of schedule_timeout() (since it | |
963 | - * should never happens anyway). You just have the printk() | |
964 | - * that will tell you if something is gone wrong and where. | |
965 | - */ | |
966 | - if (timeout < 0) | |
967 | - { | |
968 | - printk(KERN_ERR "schedule_timeout: wrong timeout " | |
969 | - "value %lx from %p\n", timeout, | |
970 | - __builtin_return_address(0)); | |
971 | - current->state = TASK_RUNNING; | |
972 | - goto out; | |
973 | - } | |
974 | - } | |
975 | + spin_unlock_irq(&this_rq()->lock); | |
976 | +} | |
977 | ||
978 | - expire = timeout + jiffies; | |
979 | +static inline void context_switch(task_t *prev, task_t *next) | |
980 | +{ | |
981 | + struct mm_struct *mm = next->mm; | |
982 | + struct mm_struct *oldmm = prev->active_mm; | |
983 | ||
984 | - init_timer(&timer); | |
985 | - timer.expires = expire; | |
986 | - timer.data = (unsigned long) current; | |
987 | - timer.function = process_timeout; | |
988 | + prepare_to_switch(); | |
989 | ||
990 | - add_timer(&timer); | |
991 | - schedule(); | |
992 | - del_timer_sync(&timer); | |
993 | + if (unlikely(!mm)) { | |
994 | + next->active_mm = oldmm; | |
995 | + atomic_inc(&oldmm->mm_count); | |
996 | + enter_lazy_tlb(oldmm, next, smp_processor_id()); | |
997 | + } else | |
998 | + switch_mm(oldmm, mm, next, smp_processor_id()); | |
999 | ||
1000 | - timeout = expire - jiffies; | |
1001 | + if (unlikely(!prev->mm)) { | |
1002 | + prev->active_mm = NULL; | |
1003 | + mmdrop(oldmm); | |
1004 | + } | |
1005 | ||
1006 | - out: | |
1007 | - return timeout < 0 ? 0 : timeout; | |
1008 | + /* | |
1009 | + * Here we just switch the register state and the stack. There are | |
1010 | + * 3 processes affected by a context switch: | |
1011 | + * | |
1012 | + * prev ==> .... ==> (last => next) | |
1013 | + * | |
1014 | + * It's the 'much more previous' 'prev' that is on next's stack, | |
1015 | + * but prev is set to (the just run) 'last' process by switch_to(). | |
1016 | + * This might sound slightly confusing but makes tons of sense. | |
1017 | + */ | |
1018 | + switch_to(prev, next, prev); | |
1019 | } | |
1020 | ||
1021 | -/* | |
1022 | - * schedule_tail() is getting called from the fork return path. This | |
1023 | - * cleans up all remaining scheduler things, without impacting the | |
1024 | - * common case. | |
1025 | - */ | |
1026 | -static inline void __schedule_tail(struct task_struct *prev) | |
1027 | +unsigned long nr_running(void) | |
1028 | { | |
1029 | -#ifdef CONFIG_SMP | |
1030 | - int policy; | |
1031 | - | |
1032 | - /* | |
1033 | - * prev->policy can be written from here only before `prev' | |
1034 | - * can be scheduled (before setting prev->cpus_runnable to ~0UL). | |
1035 | - * Of course it must also be read before allowing prev | |
1036 | - * to be rescheduled, but since the write depends on the read | |
1037 | - * to complete, wmb() is enough. (the spin_lock() acquired | |
1038 | - * before setting cpus_runnable is not enough because the spin_lock() | |
1039 | - * common code semantics allows code outside the critical section | |
1040 | - * to enter inside the critical section) | |
1041 | - */ | |
1042 | - policy = prev->policy; | |
1043 | - prev->policy = policy & ~SCHED_YIELD; | |
1044 | - wmb(); | |
1045 | + unsigned long i, sum = 0; | |
1046 | ||
1047 | - /* | |
1048 | - * fast path falls through. We have to clear cpus_runnable before | |
1049 | - * checking prev->state to avoid a wakeup race. Protect against | |
1050 | - * the task exiting early. | |
1051 | - */ | |
1052 | - task_lock(prev); | |
1053 | - task_release_cpu(prev); | |
1054 | - mb(); | |
1055 | - if (prev->state == TASK_RUNNING) | |
1056 | - goto needs_resched; | |
1057 | + for (i = 0; i < smp_num_cpus; i++) | |
1058 | + sum += cpu_rq(cpu_logical_map(i))->nr_running; | |
1059 | ||
1060 | -out_unlock: | |
1061 | - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ | |
1062 | - return; | |
1063 | + return sum; | |
1064 | +} | |
1065 | ||
1066 | - /* | |
1067 | - * Slow path - we 'push' the previous process and | |
1068 | - * reschedule_idle() will attempt to find a new | |
1069 | - * processor for it. (but it might preempt the | |
1070 | - * current process as well.) We must take the runqueue | |
1071 | - * lock and re-check prev->state to be correct. It might | |
1072 | - * still happen that this process has a preemption | |
1073 | - * 'in progress' already - but this is not a problem and | |
1074 | - * might happen in other circumstances as well. | |
1075 | - */ | |
1076 | -needs_resched: | |
1077 | - { | |
1078 | - unsigned long flags; | |
1079 | +unsigned long nr_context_switches(void) | |
1080 | +{ | |
1081 | + unsigned long i, sum = 0; | |
1082 | ||
1083 | - /* | |
1084 | - * Avoid taking the runqueue lock in cases where | |
1085 | - * no preemption-check is necessery: | |
1086 | - */ | |
1087 | - if ((prev == idle_task(smp_processor_id())) || | |
1088 | - (policy & SCHED_YIELD)) | |
1089 | - goto out_unlock; | |
1090 | + for (i = 0; i < smp_num_cpus; i++) | |
1091 | + sum += cpu_rq(cpu_logical_map(i))->nr_switches; | |
1092 | ||
1093 | - spin_lock_irqsave(&runqueue_lock, flags); | |
1094 | - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) | |
1095 | - reschedule_idle(prev); | |
1096 | - spin_unlock_irqrestore(&runqueue_lock, flags); | |
1097 | - goto out_unlock; | |
1098 | - } | |
1099 | -#else | |
1100 | - prev->policy &= ~SCHED_YIELD; | |
1101 | -#endif /* CONFIG_SMP */ | |
1102 | + return sum; | |
1103 | } | |
1104 | ||
1105 | -asmlinkage void schedule_tail(struct task_struct *prev) | |
1106 | +#if CONFIG_SMP | |
1107 | +/* | |
1108 | + * Lock the busiest runqueue as well, this_rq is locked already. | |
1109 | + * Recalculate nr_running if we have to drop the runqueue lock. | |
1110 | + */ | |
1111 | +static inline unsigned int double_lock_balance(runqueue_t *this_rq, | |
1112 | + runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running) | |
1113 | { | |
1114 | - __schedule_tail(prev); | |
1115 | + if (unlikely(!spin_trylock(&busiest->lock))) { | |
1116 | + if (busiest < this_rq) { | |
1117 | + spin_unlock(&this_rq->lock); | |
1118 | + spin_lock(&busiest->lock); | |
1119 | + spin_lock(&this_rq->lock); | |
1120 | + /* Need to recalculate nr_running */ | |
1121 | + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) | |
1122 | + nr_running = this_rq->nr_running; | |
1123 | + else | |
1124 | + nr_running = this_rq->prev_nr_running[this_cpu]; | |
1125 | + } else | |
1126 | + spin_lock(&busiest->lock); | |
1127 | + } | |
1128 | + return nr_running; | |
1129 | } | |
1130 | ||
1131 | /* | |
1132 | - * 'schedule()' is the scheduler function. It's a very simple and nice | |
1133 | - * scheduler: it's not perfect, but certainly works for most things. | |
1134 | - * | |
1135 | - * The goto is "interesting". | |
1136 | + * Current runqueue is empty, or rebalance tick: if there is an | |
1137 | + * imbalance (current runqueue is too short) then pull from | |
1138 | + * busiest runqueue(s). | |
1139 | * | |
1140 | - * NOTE!! Task 0 is the 'idle' task, which gets called when no other | |
1141 | - * tasks can run. It can not be killed, and it cannot sleep. The 'state' | |
1142 | - * information in task[0] is never used. | |
1143 | + * We call this with the current runqueue locked, | |
1144 | + * irqs disabled. | |
1145 | */ | |
1146 | -asmlinkage void schedule(void) | |
1147 | +static void load_balance(runqueue_t *this_rq, int idle) | |
1148 | { | |
1149 | - struct schedule_data * sched_data; | |
1150 | - struct task_struct *prev, *next, *p; | |
1151 | - struct list_head *tmp; | |
1152 | - int this_cpu, c; | |
1153 | + int imbalance, nr_running, load, max_load, | |
1154 | + idx, i, this_cpu = smp_processor_id(); | |
1155 | + task_t *next = this_rq->idle, *tmp; | |
1156 | + runqueue_t *busiest, *rq_src; | |
1157 | + prio_array_t *array; | |
1158 | + list_t *head, *curr; | |
1159 | ||
1160 | + /* | |
1161 | + * We search all runqueues to find the most busy one. | |
1162 | + * We do this lockless to reduce cache-bouncing overhead, | |
1163 | + * we re-check the 'best' source CPU later on again, with | |
1164 | + * the lock held. | |
1165 | + * | |
1166 | + * We fend off statistical fluctuations in runqueue lengths by | |
1167 | + * saving the runqueue length during the previous load-balancing | |
1168 | + * operation and using the smaller one of the current and saved lengths. | |
1169 | + * If a runqueue is long enough for a longer amount of time then | |
1170 | + * we recognize it and pull tasks from it. | |
1171 | + * | |
1172 | + * The 'current runqueue length' is a statistical maximum variable, | |
1173 | + * for that one we take the longer one - to avoid fluctuations in | |
1174 | + * the other direction. So for a load-balance to happen it needs | |
1175 | + * stable long runqueue on the target CPU and stable short runqueue | |
1176 | + * on the local runqueue. | |
1177 | + * | |
1178 | + * We make an exception if this CPU is about to become idle - in | |
1179 | + * that case we are less picky about moving a task across CPUs and | |
1180 | + * take what can be taken. | |
1181 | + */ | |
1182 | + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) | |
1183 | + nr_running = this_rq->nr_running; | |
1184 | + else | |
1185 | + nr_running = this_rq->prev_nr_running[this_cpu]; | |
1186 | ||
1187 | - spin_lock_prefetch(&runqueue_lock); | |
1188 | + busiest = NULL; | |
1189 | + max_load = 1; | |
1190 | + for (i = 0; i < smp_num_cpus; i++) { | |
1191 | + rq_src = cpu_rq(cpu_logical_map(i)); | |
1192 | + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) | |
1193 | + load = rq_src->nr_running; | |
1194 | + else | |
1195 | + load = this_rq->prev_nr_running[i]; | |
1196 | + this_rq->prev_nr_running[i] = rq_src->nr_running; | |
1197 | + | |
1198 | + if ((load > max_load) && (rq_src != this_rq)) { | |
1199 | + busiest = rq_src; | |
1200 | + max_load = load; | |
1201 | + } | |
1202 | + } | |
1203 | ||
1204 | - if (!current->active_mm) BUG(); | |
1205 | -need_resched_back: | |
1206 | - prev = current; | |
1207 | - this_cpu = prev->processor; | |
1208 | + if (likely(!busiest)) | |
1209 | + return; | |
1210 | ||
1211 | - if (unlikely(in_interrupt())) { | |
1212 | - printk("Scheduling in interrupt\n"); | |
1213 | - BUG(); | |
1214 | - } | |
1215 | + imbalance = (max_load - nr_running) / 2; | |
1216 | ||
1217 | - release_kernel_lock(prev, this_cpu); | |
1218 | + /* It needs an at least ~25% imbalance to trigger balancing. */ | |
1219 | + if (!idle && (imbalance < (max_load + 3)/4)) | |
1220 | + return; | |
1221 | ||
1222 | + nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); | |
1223 | /* | |
1224 | - * 'sched_data' is protected by the fact that we can run | |
1225 | - * only one process per CPU. | |
1226 | + * Make sure nothing changed since we checked the | |
1227 | + * runqueue length. | |
1228 | */ | |
1229 | - sched_data = & aligned_data[this_cpu].schedule_data; | |
1230 | - | |
1231 | - spin_lock_irq(&runqueue_lock); | |
1232 | - | |
1233 | - /* move an exhausted RR process to be last.. */ | |
1234 | - if (unlikely(prev->policy == SCHED_RR)) | |
1235 | - if (!prev->counter) { | |
1236 | - prev->counter = NICE_TO_TICKS(prev->nice); | |
1237 | - move_last_runqueue(prev); | |
1238 | - } | |
1239 | - | |
1240 | - switch (prev->state) { | |
1241 | - case TASK_INTERRUPTIBLE: | |
1242 | - if (signal_pending(prev)) { | |
1243 | - prev->state = TASK_RUNNING; | |
1244 | - break; | |
1245 | - } | |
1246 | - default: | |
1247 | - del_from_runqueue(prev); | |
1248 | - case TASK_RUNNING:; | |
1249 | - } | |
1250 | - prev->need_resched = 0; | |
1251 | + if (busiest->nr_running <= this_rq->nr_running + 1) | |
1252 | + goto out_unlock; | |
1253 | ||
1254 | /* | |
1255 | - * this is the scheduler proper: | |
1256 | + * We first consider expired tasks. Those will likely not be | |
1257 | + * executed in the near future, and they are most likely to | |
1258 | + * be cache-cold, thus switching CPUs has the least effect | |
1259 | + * on them. | |
1260 | */ | |
1261 | + if (busiest->expired->nr_active) | |
1262 | + array = busiest->expired; | |
1263 | + else | |
1264 | + array = busiest->active; | |
1265 | ||
1266 | -repeat_schedule: | |
1267 | +new_array: | |
1268 | /* | |
1269 | - * Default process to select.. | |
1270 | + * Load-balancing does not affect RT tasks, so we start the | |
1271 | + * searching at priority 128. | |
1272 | */ | |
1273 | - next = idle_task(this_cpu); | |
1274 | - c = -1000; | |
1275 | - list_for_each(tmp, &runqueue_head) { | |
1276 | - p = list_entry(tmp, struct task_struct, run_list); | |
1277 | - if (can_schedule(p, this_cpu)) { | |
1278 | - int weight = goodness(p, this_cpu, prev->active_mm); | |
1279 | - if (weight > c) | |
1280 | - c = weight, next = p; | |
1281 | + idx = MAX_RT_PRIO; | |
1282 | +skip_bitmap: | |
1283 | + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | |
1284 | + if (idx == MAX_PRIO) { | |
1285 | + if (array == busiest->expired) { | |
1286 | + array = busiest->active; | |
1287 | + goto new_array; | |
1288 | } | |
1289 | + goto out_unlock; | |
1290 | } | |
1291 | ||
1292 | - /* Do we need to re-calculate counters? */ | |
1293 | - if (unlikely(!c)) { | |
1294 | - struct task_struct *p; | |
1295 | - | |
1296 | - spin_unlock_irq(&runqueue_lock); | |
1297 | - read_lock(&tasklist_lock); | |
1298 | - for_each_task(p) | |
1299 | - p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); | |
1300 | - read_unlock(&tasklist_lock); | |
1301 | - spin_lock_irq(&runqueue_lock); | |
1302 | - goto repeat_schedule; | |
1303 | + head = array->queue + idx; | |
1304 | + curr = head->prev; | |
1305 | +skip_queue: | |
1306 | + tmp = list_entry(curr, task_t, run_list); | |
1307 | + | |
1308 | + /* | |
1309 | + * We do not migrate tasks that are: | |
1310 | + * 1) running (obviously), or | |
1311 | + * 2) cannot be migrated to this CPU due to cpus_allowed, or | |
1312 | + * 3) are cache-hot on their current CPU. | |
1313 | + */ | |
1314 | + | |
1315 | +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ | |
1316 | + ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ | |
1317 | + ((p) != (rq)->curr) && \ | |
1318 | + (tmp->cpus_allowed & (1 << (this_cpu)))) | |
1319 | + | |
1320 | + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { | |
1321 | + curr = curr->next; | |
1322 | + if (curr != head) | |
1323 | + goto skip_queue; | |
1324 | + idx++; | |
1325 | + goto skip_bitmap; | |
1326 | + } | |
1327 | + next = tmp; | |
1328 | + /* | |
1329 | + * take the task out of the other runqueue and | |
1330 | + * put it into this one: | |
1331 | + */ | |
1332 | + dequeue_task(next, array); | |
1333 | + busiest->nr_running--; | |
1334 | + next->cpu = this_cpu; | |
1335 | + this_rq->nr_running++; | |
1336 | + enqueue_task(next, this_rq->active); | |
1337 | + if (next->prio < current->prio) | |
1338 | + current->need_resched = 1; | |
1339 | + if (!idle && --imbalance) { | |
1340 | + if (array == busiest->expired) { | |
1341 | + array = busiest->active; | |
1342 | + goto new_array; | |
1343 | + } | |
1344 | } | |
1345 | +out_unlock: | |
1346 | + spin_unlock(&busiest->lock); | |
1347 | +} | |
1348 | + | |
1349 | +/* | |
1350 | + * One of the idle_cpu_tick() or the busy_cpu_tick() functions will | |
1351 | + * get called every timer tick, on every CPU. Our balancing action | |
1352 | + * frequency and balancing aggressiveness depend on whether the CPU is | |
1353 | + * idle or not. | |
1354 | + * | |
1355 | + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on | |
1356 | + * systems with HZ=100, every 10 msecs.) | |
1357 | + */ | |
1358 | +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) | |
1359 | +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) | |
1360 | + | |
1361 | +static inline void idle_tick(void) | |
1362 | +{ | |
1363 | + if (jiffies % IDLE_REBALANCE_TICK) | |
1364 | + return; | |
1365 | + spin_lock(&this_rq()->lock); | |
1366 | + load_balance(this_rq(), 1); | |
1367 | + spin_unlock(&this_rq()->lock); | |
1368 | +} | |
1369 | + | |
1370 | +#endif | |
1371 | + | |
1372 | +/* | |
1373 | + * This function gets called by the timer code, with HZ frequency. | |
1374 | + * We call it with interrupts disabled. | |
1375 | + */ | |
1376 | +void scheduler_tick(task_t *p) | |
1377 | +{ | |
1378 | + runqueue_t *rq = this_rq(); | |
1379 | +#if CONFIG_SMP | |
1380 | + unsigned long now = jiffies; | |
1381 | ||
1382 | + if (p == rq->idle) | |
1383 | + return idle_tick(); | |
1384 | +#endif | |
1385 | + /* Task might have expired already, but not scheduled off yet */ | |
1386 | + if (p->array != rq->active) { | |
1387 | + p->need_resched = 1; | |
1388 | + return; | |
1389 | + } | |
1390 | + spin_lock(&rq->lock); | |
1391 | + if (unlikely(rt_task(p))) { | |
1392 | + /* | |
1393 | + * RR tasks need a special form of timeslice management. | |
1394 | + * FIFO tasks have no timeslices. | |
1395 | + */ | |
1396 | + if ((p->policy == SCHED_RR) && !--p->time_slice) { | |
1397 | + p->time_slice = NICE_TO_TIMESLICE(p->__nice); | |
1398 | + p->need_resched = 1; | |
1399 | + | |
1400 | + /* put it at the end of the queue: */ | |
1401 | + dequeue_task(p, rq->active); | |
1402 | + enqueue_task(p, rq->active); | |
1403 | + } | |
1404 | + goto out; | |
1405 | + } | |
1406 | /* | |
1407 | - * from this point on nothing can prevent us from | |
1408 | - * switching to the next task, save this fact in | |
1409 | - * sched_data. | |
1410 | - */ | |
1411 | - sched_data->curr = next; | |
1412 | - task_set_cpu(next, this_cpu); | |
1413 | - spin_unlock_irq(&runqueue_lock); | |
1414 | - | |
1415 | - if (unlikely(prev == next)) { | |
1416 | - /* We won't go through the normal tail, so do this by hand */ | |
1417 | - prev->policy &= ~SCHED_YIELD; | |
1418 | - goto same_process; | |
1419 | + * The task was running during this tick - update the | |
1420 | + * time slice counter and the sleep average. Note: we | |
1421 | + * do not update a process's priority until it either | |
1422 | + * goes to sleep or uses up its timeslice. This makes | |
1423 | + * it possible for interactive tasks to use up their | |
1424 | + * timeslices at their highest priority levels. | |
1425 | + */ | |
1426 | + if (p->sleep_avg) | |
1427 | + p->sleep_avg--; | |
1428 | + if (!--p->time_slice) { | |
1429 | + dequeue_task(p, rq->active); | |
1430 | + p->need_resched = 1; | |
1431 | + p->prio = effective_prio(p); | |
1432 | + p->time_slice = NICE_TO_TIMESLICE(p->__nice); | |
1433 | + | |
1434 | + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | |
1435 | + if (!rq->expired_timestamp) | |
1436 | + rq->expired_timestamp = jiffies; | |
1437 | + enqueue_task(p, rq->expired); | |
1438 | + } else | |
1439 | + enqueue_task(p, rq->active); | |
1440 | } | |
1441 | +out: | |
1442 | +#if CONFIG_SMP | |
1443 | + if (!(now % BUSY_REBALANCE_TICK)) | |
1444 | + load_balance(rq, 0); | |
1445 | +#endif | |
1446 | + spin_unlock(&rq->lock); | |
1447 | +} | |
1448 | ||
1449 | -#ifdef CONFIG_SMP | |
1450 | - /* | |
1451 | - * maintain the per-process 'last schedule' value. | |
1452 | - * (this has to be recalculated even if we reschedule to | |
1453 | - * the same process) Currently this is only used on SMP, | |
1454 | - * and it's approximate, so we do not have to maintain | |
1455 | - * it while holding the runqueue spinlock. | |
1456 | - */ | |
1457 | - sched_data->last_schedule = get_cycles(); | |
1458 | +void scheduling_functions_start_here(void) { } | |
1459 | ||
1460 | - /* | |
1461 | - * We drop the scheduler lock early (it's a global spinlock), | |
1462 | - * thus we have to lock the previous process from getting | |
1463 | - * rescheduled during switch_to(). | |
1464 | - */ | |
1465 | +/* | |
1466 | + * 'schedule()' is the main scheduler function. | |
1467 | + */ | |
1468 | +asmlinkage void schedule(void) | |
1469 | +{ | |
1470 | + task_t *prev = current, *next; | |
1471 | + runqueue_t *rq = this_rq(); | |
1472 | + prio_array_t *array; | |
1473 | + list_t *queue; | |
1474 | + int idx; | |
1475 | ||
1476 | -#endif /* CONFIG_SMP */ | |
1477 | + if (unlikely(in_interrupt())) | |
1478 | + BUG(); | |
1479 | + release_kernel_lock(prev, smp_processor_id()); | |
1480 | + spin_lock_irq(&rq->lock); | |
1481 | ||
1482 | - kstat.context_swtch++; | |
1483 | - /* | |
1484 | - * there are 3 processes which are affected by a context switch: | |
1485 | - * | |
1486 | - * prev == .... ==> (last => next) | |
1487 | - * | |
1488 | - * It's the 'much more previous' 'prev' that is on next's stack, | |
1489 | - * but prev is set to (the just run) 'last' process by switch_to(). | |
1490 | - * This might sound slightly confusing but makes tons of sense. | |
1491 | - */ | |
1492 | - prepare_to_switch(); | |
1493 | - { | |
1494 | - struct mm_struct *mm = next->mm; | |
1495 | - struct mm_struct *oldmm = prev->active_mm; | |
1496 | - if (!mm) { | |
1497 | - if (next->active_mm) BUG(); | |
1498 | - next->active_mm = oldmm; | |
1499 | - atomic_inc(&oldmm->mm_count); | |
1500 | - enter_lazy_tlb(oldmm, next, this_cpu); | |
1501 | - } else { | |
1502 | - if (next->active_mm != mm) BUG(); | |
1503 | - switch_mm(oldmm, mm, next, this_cpu); | |
1504 | + switch (prev->state) { | |
1505 | + case TASK_RUNNING: | |
1506 | + prev->sleep_timestamp = jiffies; | |
1507 | + break; | |
1508 | + case TASK_INTERRUPTIBLE: | |
1509 | + if (unlikely(signal_pending(prev))) { | |
1510 | + prev->state = TASK_RUNNING; | |
1511 | + prev->sleep_timestamp = jiffies; | |
1512 | + break; | |
1513 | } | |
1514 | + default: | |
1515 | + deactivate_task(prev, rq); | |
1516 | + } | |
1517 | +#if CONFIG_SMP | |
1518 | +pick_next_task: | |
1519 | +#endif | |
1520 | + if (unlikely(!rq->nr_running)) { | |
1521 | +#if CONFIG_SMP | |
1522 | + load_balance(rq, 1); | |
1523 | + if (rq->nr_running) | |
1524 | + goto pick_next_task; | |
1525 | +#endif | |
1526 | + next = rq->idle; | |
1527 | + rq->expired_timestamp = 0; | |
1528 | + goto switch_tasks; | |
1529 | + } | |
1530 | ||
1531 | - if (!prev->mm) { | |
1532 | - prev->active_mm = NULL; | |
1533 | - mmdrop(oldmm); | |
1534 | - } | |
1535 | + array = rq->active; | |
1536 | + if (unlikely(!array->nr_active)) { | |
1537 | + /* | |
1538 | + * Switch the active and expired arrays. | |
1539 | + */ | |
1540 | + rq->active = rq->expired; | |
1541 | + rq->expired = array; | |
1542 | + array = rq->active; | |
1543 | + rq->expired_timestamp = 0; | |
1544 | } | |
1545 | ||
1546 | - /* | |
1547 | - * This just switches the register state and the | |
1548 | - * stack. | |
1549 | - */ | |
1550 | - switch_to(prev, next, prev); | |
1551 | - __schedule_tail(prev); | |
1552 | + idx = sched_find_first_bit(array->bitmap); | |
1553 | + queue = array->queue + idx; | |
1554 | + next = list_entry(queue->next, task_t, run_list); | |
1555 | + | |
1556 | +switch_tasks: | |
1557 | + prefetch(next); | |
1558 | + prev->need_resched = 0; | |
1559 | + | |
1560 | + if (likely(prev != next)) { | |
1561 | + rq->nr_switches++; | |
1562 | + rq->curr = next; | |
1563 | + context_switch(prev, next); | |
1564 | + /* | |
1565 | + * The runqueue pointer might be from another CPU | |
1566 | + * if the new task was last running on a different | |
1567 | + * CPU - thus re-load it. | |
1568 | + */ | |
1569 | + barrier(); | |
1570 | + rq = this_rq(); | |
1571 | + } | |
1572 | + spin_unlock_irq(&rq->lock); | |
1573 | ||
1574 | -same_process: | |
1575 | reacquire_kernel_lock(current); | |
1576 | - if (current->need_resched) | |
1577 | - goto need_resched_back; | |
1578 | return; | |
1579 | } | |
1580 | ||
1581 | /* | |
1582 | - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything | |
1583 | - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the | |
1584 | - * non-exclusive tasks and one exclusive task. | |
1585 | + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | |
1586 | + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | |
1587 | + * number) then we wake all the non-exclusive tasks and one exclusive task. | |
1588 | * | |
1589 | * There are circumstances in which we can try to wake a task which has already | |
1590 | - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero | |
1591 | - * in this (rare) case, and we handle it by contonuing to scan the queue. | |
1592 | + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | |
1593 | + * zero in this (rare) case, and we handle it by continuing to scan the queue. | |
1594 | */ | |
1595 | static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, | |
1596 | int nr_exclusive, const int sync) | |
1597 | { | |
1598 | struct list_head *tmp; | |
1599 | - struct task_struct *p; | |
1600 | + task_t *p; | |
1601 | ||
1602 | - CHECK_MAGIC_WQHEAD(q); | |
1603 | - WQ_CHECK_LIST_HEAD(&q->task_list); | |
1604 | - | |
1605 | list_for_each(tmp,&q->task_list) { | |
1606 | unsigned int state; | |
1607 | - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | |
1608 | + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | |
1609 | ||
1610 | - CHECK_MAGIC(curr->__magic); | |
1611 | p = curr->task; | |
1612 | state = p->state; | |
1613 | - if (state & mode) { | |
1614 | - WQ_NOTE_WAKER(curr); | |
1615 | - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | |
1616 | - break; | |
1617 | - } | |
1618 | + if ((state & mode) && | |
1619 | + try_to_wake_up(p, sync) && | |
1620 | + ((curr->flags & WQ_FLAG_EXCLUSIVE) && | |
1621 | + !--nr_exclusive)) | |
1622 | + break; | |
1623 | } | |
1624 | } | |
1625 | ||
1626 | @@ -850,8 +846,70 @@ | |
1627 | return timeout; | |
1628 | } | |
1629 | ||
1630 | +/* | |
1631 | + * Change the current task's CPU affinity. Migrate the process to a | |
1632 | + * proper CPU and schedule away if the current CPU is removed from | |
1633 | + * the allowed bitmask. | |
1634 | + */ | |
1635 | +void set_cpus_allowed(task_t *p, unsigned long new_mask) | |
1636 | +{ | |
1637 | + new_mask &= cpu_online_map; | |
1638 | + if (!new_mask) | |
1639 | + BUG(); | |
1640 | + | |
1641 | + p->cpus_allowed = new_mask; | |
1642 | + /* | |
1643 | + * Can the task run on the current CPU? If not then | |
1644 | + * migrate the process off to a proper CPU. | |
1645 | + */ | |
1646 | + if (new_mask & (1UL << smp_processor_id())) | |
1647 | + return; | |
1648 | +#if CONFIG_SMP | |
1649 | + current->state = TASK_UNINTERRUPTIBLE; | |
1650 | + smp_migrate_task(__ffs(new_mask), current); | |
1651 | + | |
1652 | + schedule(); | |
1653 | +#endif | |
1654 | +} | |
1655 | + | |
1656 | void scheduling_functions_end_here(void) { } | |
1657 | ||
1658 | +/* Set p's nice value; a queued non-RT task is requeued at the new priority. */ | |
1659 | +void set_user_nice(task_t *p, long nice) | |
1660 | +{ | |
1661 | + unsigned long flags; | |
1662 | + prio_array_t *array; | |
1663 | + runqueue_t *rq; | |
1664 | + long old_nice; | |
1665 | + | |
1666 | + if (p->__nice == nice) | |
1667 | + return; | |
1668 | + /* | |
1669 | + * We have to be careful, if called from sys_setpriority(), | |
1670 | + * the task might be in the middle of scheduling on another CPU. | |
1671 | + */ | |
1672 | + rq = lock_task_rq(p, &flags); | |
1673 | + /* Save the old value: the resched test below must compare against it. */ | |
1674 | + old_nice = p->__nice; | |
1675 | + if (rt_task(p)) { | |
1676 | + p->__nice = nice; | |
1677 | + goto out_unlock; | |
1678 | + } | |
1679 | + array = p->array; | |
1680 | + if (array) | |
1681 | + dequeue_task(p, array); | |
1682 | + p->__nice = nice; | |
1683 | + p->prio = NICE_TO_PRIO(nice); | |
1684 | + if (array) { | |
1685 | + enqueue_task(p, array); | |
1686 | + /* Resched if nice dropped, or rose while p is the running task. */ | |
1687 | + if (nice < old_nice || (old_nice < nice && p == rq->curr)) | |
1688 | + resched_task(rq->curr); | |
1689 | + } | |
1690 | +out_unlock: | |
1691 | + unlock_task_rq(rq, &flags); | |
1692 | +} | |
1693 | + | |
1694 | #ifndef __alpha__ | |
1695 | ||
1696 | /* | |
1697 | @@ -862,7 +920,7 @@ | |
1698 | ||
1699 | asmlinkage long sys_nice(int increment) | |
1700 | { | |
1701 | - long newprio; | |
1702 | + long nice; | |
1703 | ||
1704 | /* | |
1705 | * Setpriority might change our priority at the same moment. | |
1706 | @@ -878,32 +936,30 @@ | |
1707 | if (increment > 40) | |
1708 | increment = 40; | |
1709 | ||
1710 | - newprio = current->nice + increment; | |
1711 | - if (newprio < -20) | |
1712 | - newprio = -20; | |
1713 | - if (newprio > 19) | |
1714 | - newprio = 19; | |
1715 | - current->nice = newprio; | |
1716 | + nice = current->__nice + increment; | |
1717 | + if (nice < -20) | |
1718 | + nice = -20; | |
1719 | + if (nice > 19) | |
1720 | + nice = 19; | |
1721 | + set_user_nice(current, nice); | |
1722 | return 0; | |
1723 | } | |
1724 | ||
1725 | #endif | |
1726 | ||
1727 | -static inline struct task_struct *find_process_by_pid(pid_t pid) | |
1728 | +static inline task_t *find_process_by_pid(pid_t pid) | |
1729 | { | |
1730 | - struct task_struct *tsk = current; | |
1731 | - | |
1732 | - if (pid) | |
1733 | - tsk = find_task_by_pid(pid); | |
1734 | - return tsk; | |
1735 | + return pid ? find_task_by_pid(pid) : current; | |
1736 | } | |
1737 | ||
1738 | -static int setscheduler(pid_t pid, int policy, | |
1739 | - struct sched_param *param) | |
1740 | +static int setscheduler(pid_t pid, int policy, struct sched_param *param) | |
1741 | { | |
1742 | struct sched_param lp; | |
1743 | - struct task_struct *p; | |
1744 | + prio_array_t *array; | |
1745 | + unsigned long flags; | |
1746 | + runqueue_t *rq; | |
1747 | int retval; | |
1748 | + task_t *p; | |
1749 | ||
1750 | retval = -EINVAL; | |
1751 | if (!param || pid < 0) | |
1752 | @@ -917,14 +973,19 @@ | |
1753 | * We play safe to avoid deadlocks. | |
1754 | */ | |
1755 | read_lock_irq(&tasklist_lock); | |
1756 | - spin_lock(&runqueue_lock); | |
1757 | ||
1758 | p = find_process_by_pid(pid); | |
1759 | ||
1760 | retval = -ESRCH; | |
1761 | if (!p) | |
1762 | - goto out_unlock; | |
1763 | - | |
1764 | + goto out_unlock_tasklist; | |
1765 | + | |
1766 | + /* | |
1767 | + * To be able to change p->policy safely, the appropriate | |
1768 | + * runqueue lock must be held. | |
1769 | + */ | |
1770 | + rq = lock_task_rq(p, &flags); | |
1771 | + | |
1772 | if (policy < 0) | |
1773 | policy = p->policy; | |
1774 | else { | |
1775 | @@ -945,30 +1006,36 @@ | |
1776 | goto out_unlock; | |
1777 | ||
1778 | retval = -EPERM; | |
1779 | - if ((policy == SCHED_FIFO || policy == SCHED_RR) && | |
1780 | + if ((policy == SCHED_FIFO || policy == SCHED_RR) && | |
1781 | !capable(CAP_SYS_NICE)) | |
1782 | goto out_unlock; | |
1783 | if ((current->euid != p->euid) && (current->euid != p->uid) && | |
1784 | !capable(CAP_SYS_NICE)) | |
1785 | goto out_unlock; | |
1786 | ||
1787 | + array = p->array; | |
1788 | + if (array) | |
1789 | + deactivate_task(p, task_rq(p)); | |
1790 | retval = 0; | |
1791 | p->policy = policy; | |
1792 | p->rt_priority = lp.sched_priority; | |
1793 | - if (task_on_runqueue(p)) | |
1794 | - move_first_runqueue(p); | |
1795 | - | |
1796 | - current->need_resched = 1; | |
1797 | + if (rt_task(p)) | |
1798 | + p->prio = 99-p->rt_priority; | |
1799 | + else | |
1800 | + p->prio = NICE_TO_PRIO(p->__nice); | |
1801 | + if (array) | |
1802 | + activate_task(p, task_rq(p)); | |
1803 | ||
1804 | out_unlock: | |
1805 | - spin_unlock(&runqueue_lock); | |
1806 | + unlock_task_rq(rq, &flags); | |
1807 | +out_unlock_tasklist: | |
1808 | read_unlock_irq(&tasklist_lock); | |
1809 | ||
1810 | out_nounlock: | |
1811 | return retval; | |
1812 | } | |
1813 | ||
1814 | -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | |
1815 | +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | |
1816 | struct sched_param *param) | |
1817 | { | |
1818 | return setscheduler(pid, policy, param); | |
1819 | @@ -981,7 +1048,7 @@ | |
1820 | ||
1821 | asmlinkage long sys_sched_getscheduler(pid_t pid) | |
1822 | { | |
1823 | - struct task_struct *p; | |
1824 | + task_t *p; | |
1825 | int retval; | |
1826 | ||
1827 | retval = -EINVAL; | |
1828 | @@ -992,7 +1059,7 @@ | |
1829 | read_lock(&tasklist_lock); | |
1830 | p = find_process_by_pid(pid); | |
1831 | if (p) | |
1832 | - retval = p->policy & ~SCHED_YIELD; | |
1833 | + retval = p->policy; | |
1834 | read_unlock(&tasklist_lock); | |
1835 | ||
1836 | out_nounlock: | |
1837 | @@ -1001,7 +1068,7 @@ | |
1838 | ||
1839 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) | |
1840 | { | |
1841 | - struct task_struct *p; | |
1842 | + task_t *p; | |
1843 | struct sched_param lp; | |
1844 | int retval; | |
1845 | ||
1846 | @@ -1032,42 +1099,38 @@ | |
1847 | ||
1848 | asmlinkage long sys_sched_yield(void) | |
1849 | { | |
1850 | + runqueue_t *rq = this_rq(); | |
1851 | + prio_array_t *array; | |
1852 | + | |
1853 | /* | |
1854 | - * Trick. sched_yield() first counts the number of truly | |
1855 | - * 'pending' runnable processes, then returns if it's | |
1856 | - * only the current processes. (This test does not have | |
1857 | - * to be atomic.) In threaded applications this optimization | |
1858 | - * gets triggered quite often. | |
1859 | + * Decrease the yielding task's priority by one, to avoid | |
1860 | + * livelocks. This priority loss is temporary, it's recovered | |
1861 | + * once the current timeslice expires. | |
1862 | + * | |
1863 | + * If priority is already MAX_PRIO-1 then we still | |
1864 | + * roundrobin the task within the runlist. | |
1865 | */ | |
1866 | - | |
1867 | - int nr_pending = nr_running; | |
1868 | - | |
1869 | -#if CONFIG_SMP | |
1870 | - int i; | |
1871 | - | |
1872 | - // Subtract non-idle processes running on other CPUs. | |
1873 | - for (i = 0; i < smp_num_cpus; i++) { | |
1874 | - int cpu = cpu_logical_map(i); | |
1875 | - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) | |
1876 | - nr_pending--; | |
1877 | + spin_lock_irq(&rq->lock); | |
1878 | + array = current->array; | |
1879 | + /* | |
1880 | + * If the task has reached maximum priority (or is a RT task) | |
1881 | + * then just requeue the task to the end of the runqueue: | |
1882 | + */ | |
1883 | + if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) { | |
1884 | + list_del(&current->run_list); | |
1885 | + list_add_tail(&current->run_list, array->queue + current->prio); | |
1886 | + } else { | |
1887 | + list_del(&current->run_list); | |
1888 | + if (list_empty(array->queue + current->prio)) | |
1889 | + __clear_bit(current->prio, array->bitmap); | |
1890 | + current->prio++; | |
1891 | + list_add_tail(&current->run_list, array->queue + current->prio); | |
1892 | + __set_bit(current->prio, array->bitmap); | |
1893 | } | |
1894 | -#else | |
1895 | - // on UP this process is on the runqueue as well | |
1896 | - nr_pending--; | |
1897 | -#endif | |
1898 | - if (nr_pending) { | |
1899 | - /* | |
1900 | - * This process can only be rescheduled by us, | |
1901 | - * so this is safe without any locking. | |
1902 | - */ | |
1903 | - if (current->policy == SCHED_OTHER) | |
1904 | - current->policy |= SCHED_YIELD; | |
1905 | - current->need_resched = 1; | |
1906 | + spin_unlock(&rq->lock); | |
1907 | + | |
1908 | + schedule(); | |
1909 | ||
1910 | - spin_lock_irq(&runqueue_lock); | |
1911 | - move_last_runqueue(current); | |
1912 | - spin_unlock_irq(&runqueue_lock); | |
1913 | - } | |
1914 | return 0; | |
1915 | } | |
1916 | ||
1917 | @@ -1105,7 +1168,7 @@ | |
1918 | asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) | |
1919 | { | |
1920 | struct timespec t; | |
1921 | - struct task_struct *p; | |
1922 | + task_t *p; | |
1923 | int retval = -EINVAL; | |
1924 | ||
1925 | if (pid < 0) | |
1926 | @@ -1115,8 +1178,8 @@ | |
1927 | read_lock(&tasklist_lock); | |
1928 | p = find_process_by_pid(pid); | |
1929 | if (p) | |
1930 | - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice), | |
1931 | - &t); | |
1932 | + jiffies_to_timespec(p->policy & SCHED_FIFO ? | |
1933 | + 0 : NICE_TO_TIMESLICE(p->__nice), &t); | |
1934 | read_unlock(&tasklist_lock); | |
1935 | if (p) | |
1936 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | |
1937 | @@ -1124,14 +1187,14 @@ | |
1938 | return retval; | |
1939 | } | |
1940 | ||
1941 | -static void show_task(struct task_struct * p) | |
1942 | +static void show_task(task_t * p) | |
1943 | { | |
1944 | unsigned long free = 0; | |
1945 | int state; | |
1946 | static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; | |
1947 | ||
1948 | printk("%-13.13s ", p->comm); | |
1949 | - state = p->state ? ffz(~p->state) + 1 : 0; | |
1950 | + state = p->state ? __ffs(p->state) + 1 : 0; | |
1951 | if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) | |
1952 | printk(stat_nam[state]); | |
1953 | else | |
1954 | @@ -1172,7 +1235,7 @@ | |
1955 | printk(" (NOTLB)\n"); | |
1956 | ||
1957 | { | |
1958 | - extern void show_trace_task(struct task_struct *tsk); | |
1959 | + extern void show_trace_task(task_t *tsk); | |
1960 | show_trace_task(p); | |
1961 | } | |
1962 | } | |
1963 | @@ -1194,7 +1257,7 @@ | |
1964 | ||
1965 | void show_state(void) | |
1966 | { | |
1967 | - struct task_struct *p; | |
1968 | + task_t *p; | |
1969 | ||
1970 | #if (BITS_PER_LONG == 32) | |
1971 | printk("\n" | |
1972 | @@ -1217,121 +1280,88 @@ | |
1973 | read_unlock(&tasklist_lock); | |
1974 | } | |
1975 | ||
1976 | -/** | |
1977 | - * reparent_to_init() - Reparent the calling kernel thread to the init task. | |
1978 | - * | |
1979 | - * If a kernel thread is launched as a result of a system call, or if | |
1980 | - * it ever exits, it should generally reparent itself to init so that | |
1981 | - * it is correctly cleaned up on exit. | |
1982 | - * | |
1983 | - * The various task state such as scheduling policy and priority may have | |
1984 | - * been inherited fro a user process, so we reset them to sane values here. | |
1985 | - * | |
1986 | - * NOTE that reparent_to_init() gives the caller full capabilities. | |
1987 | - */ | |
1988 | -void reparent_to_init(void) | |
1989 | +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |
1990 | { | |
1991 | - struct task_struct *this_task = current; | |
1992 | - | |
1993 | - write_lock_irq(&tasklist_lock); | |
1994 | - | |
1995 | - /* Reparent to init */ | |
1996 | - REMOVE_LINKS(this_task); | |
1997 | - this_task->p_pptr = child_reaper; | |
1998 | - this_task->p_opptr = child_reaper; | |
1999 | - SET_LINKS(this_task); | |
2000 | - | |
2001 | - /* Set the exit signal to SIGCHLD so we signal init on exit */ | |
2002 | - this_task->exit_signal = SIGCHLD; | |
2003 | - | |
2004 | - /* We also take the runqueue_lock while altering task fields | |
2005 | - * which affect scheduling decisions */ | |
2006 | - spin_lock(&runqueue_lock); | |
2007 | - | |
2008 | - this_task->ptrace = 0; | |
2009 | - this_task->nice = DEF_NICE; | |
2010 | - this_task->policy = SCHED_OTHER; | |
2011 | - /* cpus_allowed? */ | |
2012 | - /* rt_priority? */ | |
2013 | - /* signals? */ | |
2014 | - this_task->cap_effective = CAP_INIT_EFF_SET; | |
2015 | - this_task->cap_inheritable = CAP_INIT_INH_SET; | |
2016 | - this_task->cap_permitted = CAP_FULL_SET; | |
2017 | - this_task->keep_capabilities = 0; | |
2018 | - memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); | |
2019 | - this_task->user = INIT_USER; | |
2020 | - | |
2021 | - spin_unlock(&runqueue_lock); | |
2022 | - write_unlock_irq(&tasklist_lock); | |
2023 | + if (rq1 == rq2) | |
2024 | + spin_lock(&rq1->lock); | |
2025 | + else { | |
2026 | + if (rq1 < rq2) { | |
2027 | + spin_lock(&rq1->lock); | |
2028 | + spin_lock(&rq2->lock); | |
2029 | + } else { | |
2030 | + spin_lock(&rq2->lock); | |
2031 | + spin_lock(&rq1->lock); | |
2032 | + } | |
2033 | + } | |
2034 | } | |
2035 | ||
2036 | -/* | |
2037 | - * Put all the gunge required to become a kernel thread without | |
2038 | - * attached user resources in one place where it belongs. | |
2039 | - */ | |
2040 | - | |
2041 | -void daemonize(void) | |
2042 | +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | |
2043 | { | |
2044 | - struct fs_struct *fs; | |
2045 | - | |
2046 | - | |
2047 | - /* | |
2048 | - * If we were started as result of loading a module, close all of the | |
2049 | - * user space pages. We don't need them, and if we didn't close them | |
2050 | - * they would be locked into memory. | |
2051 | - */ | |
2052 | - exit_mm(current); | |
2053 | - | |
2054 | - current->session = 1; | |
2055 | - current->pgrp = 1; | |
2056 | - current->tty = NULL; | |
2057 | - | |
2058 | - /* Become as one with the init task */ | |
2059 | - | |
2060 | - exit_fs(current); /* current->fs->count--; */ | |
2061 | - fs = init_task.fs; | |
2062 | - current->fs = fs; | |
2063 | - atomic_inc(&fs->count); | |
2064 | - exit_files(current); | |
2065 | - current->files = init_task.files; | |
2066 | - atomic_inc(&current->files->count); | |
2067 | + spin_unlock(&rq1->lock); | |
2068 | + if (rq1 != rq2) | |
2069 | + spin_unlock(&rq2->lock); | |
2070 | } | |
2071 | ||
2072 | -extern unsigned long wait_init_idle; | |
2073 | - | |
2074 | -void __init init_idle(void) | |
2075 | +void __init init_idle(task_t *idle, int cpu) | |
2076 | { | |
2077 | - struct schedule_data * sched_data; | |
2078 | - sched_data = &aligned_data[smp_processor_id()].schedule_data; | |
2079 | + runqueue_t *idle_rq = cpu_rq(cpu), *rq = idle->array->rq; | |
2080 | + unsigned long flags; | |
2081 | ||
2082 | - if (current != &init_task && task_on_runqueue(current)) { | |
2083 | - printk("UGH! (%d:%d) was on the runqueue, removing.\n", | |
2084 | - smp_processor_id(), current->pid); | |
2085 | - del_from_runqueue(current); | |
2086 | - } | |
2087 | - sched_data->curr = current; | |
2088 | - sched_data->last_schedule = get_cycles(); | |
2089 | - clear_bit(current->processor, &wait_init_idle); | |
2090 | + __save_flags(flags); | |
2091 | + __cli(); | |
2092 | + double_rq_lock(idle_rq, rq); | |
2093 | + | |
2094 | + idle_rq->curr = idle_rq->idle = idle; | |
2095 | + deactivate_task(idle, rq); | |
2096 | + idle->array = NULL; | |
2097 | + idle->prio = MAX_PRIO; | |
2098 | + idle->state = TASK_RUNNING; | |
2099 | + idle->cpu = cpu; | |
2100 | + double_rq_unlock(idle_rq, rq); | |
2101 | + idle->need_resched = 1; | |
2102 | + __restore_flags(flags); | |
2103 | } | |
2104 | ||
2105 | -extern void init_timervecs (void); | |
2106 | +extern void init_timervecs(void); | |
2107 | +extern void timer_bh(void); | |
2108 | +extern void tqueue_bh(void); | |
2109 | +extern void immediate_bh(void); | |
2110 | ||
2111 | void __init sched_init(void) | |
2112 | { | |
2113 | + runqueue_t *rq; | |
2114 | + int i, j, k; | |
2115 | + | |
2116 | + for (i = 0; i < NR_CPUS; i++) { | |
2117 | + runqueue_t *rq = cpu_rq(i); | |
2118 | + prio_array_t *array; | |
2119 | + | |
2120 | + rq->active = rq->arrays + 0; | |
2121 | + rq->expired = rq->arrays + 1; | |
2122 | + spin_lock_init(&rq->lock); | |
2123 | + | |
2124 | + for (j = 0; j < 2; j++) { | |
2125 | + array = rq->arrays + j; | |
2126 | + array->rq = rq; | |
2127 | + array->lock = &rq->lock; | |
2128 | + for (k = 0; k < MAX_PRIO; k++) { | |
2129 | + INIT_LIST_HEAD(array->queue + k); | |
2130 | + __clear_bit(k, array->bitmap); | |
2131 | + } | |
2132 | + // delimiter for bitsearch | |
2133 | + __set_bit(MAX_PRIO, array->bitmap); | |
2134 | + } | |
2135 | + } | |
2136 | /* | |
2137 | * We have to do a little magic to get the first | |
2138 | * process right in SMP mode. | |
2139 | */ | |
2140 | - int cpu = smp_processor_id(); | |
2141 | - int nr; | |
2142 | - | |
2143 | - init_task.processor = cpu; | |
2144 | - | |
2145 | - for(nr = 0; nr < PIDHASH_SZ; nr++) | |
2146 | - pidhash[nr] = NULL; | |
2147 | + rq = this_rq(); | |
2148 | + rq->curr = current; | |
2149 | + rq->idle = current; | |
2150 | + wake_up_process(current); | |
2151 | ||
2152 | init_timervecs(); | |
2153 | - | |
2154 | init_bh(TIMER_BH, timer_bh); | |
2155 | init_bh(TQUEUE_BH, tqueue_bh); | |
2156 | init_bh(IMMEDIATE_BH, immediate_bh); | |
2157 | @@ -1340,5 +1370,5 @@ | |
2158 | * The boot idle thread does lazy MMU switching as well: | |
2159 | */ | |
2160 | atomic_inc(&init_mm.mm_count); | |
2161 | - enter_lazy_tlb(&init_mm, current, cpu); | |
2162 | + enter_lazy_tlb(&init_mm, current, smp_processor_id()); | |
2163 | } | |
2164 | --- linux/kernel/exit.c.orig Sun Jan 6 13:55:56 2002 | |
2165 | +++ linux/kernel/exit.c Mon Jan 28 18:01:36 2002 | |
2166 | @@ -27,49 +27,42 @@ | |
2167 | ||
2168 | static void release_task(struct task_struct * p) | |
2169 | { | |
2170 | - if (p != current) { | |
2171 | + unsigned long flags; | |
2172 | + | |
2173 | + if (p == current) | |
2174 | + BUG(); | |
2175 | #ifdef CONFIG_SMP | |
2176 | - /* | |
2177 | - * Wait to make sure the process isn't on the | |
2178 | - * runqueue (active on some other CPU still) | |
2179 | - */ | |
2180 | - for (;;) { | |
2181 | - task_lock(p); | |
2182 | - if (!task_has_cpu(p)) | |
2183 | - break; | |
2184 | - task_unlock(p); | |
2185 | - do { | |
2186 | - cpu_relax(); | |
2187 | - barrier(); | |
2188 | - } while (task_has_cpu(p)); | |
2189 | - } | |
2190 | - task_unlock(p); | |
2191 | + wait_task_inactive(p); | |
2192 | #endif | |
2193 | - atomic_dec(&p->user->processes); | |
2194 | - free_uid(p->user); | |
2195 | - unhash_process(p); | |
2196 | - | |
2197 | - release_thread(p); | |
2198 | - current->cmin_flt += p->min_flt + p->cmin_flt; | |
2199 | - current->cmaj_flt += p->maj_flt + p->cmaj_flt; | |
2200 | - current->cnswap += p->nswap + p->cnswap; | |
2201 | - /* | |
2202 | - * Potentially available timeslices are retrieved | |
2203 | - * here - this way the parent does not get penalized | |
2204 | - * for creating too many processes. | |
2205 | - * | |
2206 | - * (this cannot be used to artificially 'generate' | |
2207 | - * timeslices, because any timeslice recovered here | |
2208 | - * was given away by the parent in the first place.) | |
2209 | - */ | |
2210 | - current->counter += p->counter; | |
2211 | - if (current->counter >= MAX_COUNTER) | |
2212 | - current->counter = MAX_COUNTER; | |
2213 | - p->pid = 0; | |
2214 | - free_task_struct(p); | |
2215 | - } else { | |
2216 | - printk("task releasing itself\n"); | |
2217 | - } | |
2218 | + atomic_dec(&p->user->processes); | |
2219 | + free_uid(p->user); | |
2220 | + unhash_process(p); | |
2221 | + | |
2222 | + release_thread(p); | |
2223 | + current->cmin_flt += p->min_flt + p->cmin_flt; | |
2224 | + current->cmaj_flt += p->maj_flt + p->cmaj_flt; | |
2225 | + current->cnswap += p->nswap + p->cnswap; | |
2226 | + /* | |
2227 | + * Potentially available timeslices are retrieved | |
2228 | + * here - this way the parent does not get penalized | |
2229 | + * for creating too many processes. | |
2230 | + * | |
2231 | + * (this cannot be used to artificially 'generate' | |
2232 | + * timeslices, because any timeslice recovered here | |
2233 | + * was given away by the parent in the first place.) | |
2234 | + */ | |
2235 | + __save_flags(flags); | |
2236 | + __cli(); | |
2237 | + current->time_slice += p->time_slice; | |
2238 | + if (current->time_slice > MAX_TIMESLICE) | |
2239 | + current->time_slice = MAX_TIMESLICE; | |
2240 | + if (p->sleep_avg < current->sleep_avg) | |
2241 | + current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT + | |
2242 | + p->sleep_avg) / (EXIT_WEIGHT + 1); | |
2243 | + __restore_flags(flags); | |
2244 | + | |
2245 | + p->pid = 0; | |
2246 | + free_task_struct(p); | |
2247 | } | |
2248 | ||
2249 | /* | |
2250 | @@ -147,6 +140,79 @@ | |
2251 | } | |
2252 | read_unlock(&tasklist_lock); | |
2253 | return retval; | |
2254 | +} | |
2255 | + | |
2256 | +/** | |
2257 | + * reparent_to_init() - Reparent the calling kernel thread to the init task. | |
2258 | + * | |
2259 | + * If a kernel thread is launched as a result of a system call, or if | |
2260 | + * it ever exits, it should generally reparent itself to init so that | |
2261 | + * it is correctly cleaned up on exit. | |
2262 | + * | |
2263 | + * The various task state such as scheduling policy and priority may have | |
2264 | + * been inherited from a user process, so we reset them to sane values here. | |
2265 | + * | |
2266 | + * NOTE that reparent_to_init() gives the caller full capabilities. | |
2267 | + */ | |
2268 | +void reparent_to_init(void) | |
2269 | +{ | |
2270 | + write_lock_irq(&tasklist_lock); | |
2271 | + | |
2272 | + /* Reparent to init */ | |
2273 | + REMOVE_LINKS(current); | |
2274 | + current->p_pptr = child_reaper; | |
2275 | + current->p_opptr = child_reaper; | |
2276 | + SET_LINKS(current); | |
2277 | + | |
2278 | + /* Set the exit signal to SIGCHLD so we signal init on exit */ | |
2279 | + current->exit_signal = SIGCHLD; | |
2280 | + | |
2281 | + current->ptrace = 0; | |
2282 | + if ((current->policy == SCHED_OTHER) && (current->__nice < DEF_USER_NICE)) | |
2283 | + set_user_nice(current, DEF_USER_NICE); | |
2284 | + /* cpus_allowed? */ | |
2285 | + /* rt_priority? */ | |
2286 | + /* signals? */ | |
2287 | + current->cap_effective = CAP_INIT_EFF_SET; | |
2288 | + current->cap_inheritable = CAP_INIT_INH_SET; | |
2289 | + current->cap_permitted = CAP_FULL_SET; | |
2290 | + current->keep_capabilities = 0; | |
2291 | + memcpy(current->rlim, init_task.rlim, sizeof(current->rlim)); /* all limits */ | |
2292 | + current->user = INIT_USER; | |
2293 | + | |
2294 | + write_unlock_irq(&tasklist_lock); | |
2295 | +} | |
2296 | + | |
2297 | +/* | |
2298 | + * Put all the gunge required to become a kernel thread without | |
2299 | + * attached user resources in one place where it belongs. | |
2300 | + */ | |
2301 | + | |
2302 | +void daemonize(void) | |
2303 | +{ | |
2304 | + struct fs_struct *fs; | |
2305 | + | |
2306 | + | |
2307 | + /* | |
2308 | + * If we were started as result of loading a module, close all of the | |
2309 | + * user space pages. We don't need them, and if we didn't close them | |
2310 | + * they would be locked into memory. | |
2311 | + */ | |
2312 | + exit_mm(current); | |
2313 | + | |
2314 | + current->session = 1; | |
2315 | + current->pgrp = 1; | |
2316 | + current->tty = NULL; | |
2317 | + | |
2318 | + /* Become as one with the init task */ | |
2319 | + | |
2320 | + exit_fs(current); /* current->fs->count--; */ | |
2321 | + fs = init_task.fs; | |
2322 | + current->fs = fs; | |
2323 | + atomic_inc(&fs->count); | |
2324 | + exit_files(current); | |
2325 | + current->files = init_task.files; | |
2326 | + atomic_inc(&current->files->count); | |
2327 | } | |
2328 | ||
2329 | /* | |
2330 | --- linux/kernel/capability.c.orig Sat Jun 24 06:06:37 2000 | |
2331 | +++ linux/kernel/capability.c Sun Jan 6 13:56:25 2002 | |
2332 | @@ -8,6 +8,8 @@ | |
2333 | #include <linux/mm.h> | |
2334 | #include <asm/uaccess.h> | |
2335 | ||
2336 | +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ | |
2337 | + | |
2338 | kernel_cap_t cap_bset = CAP_INIT_EFF_SET; | |
2339 | ||
2340 | /* Note: never hold tasklist_lock while spinning for this one */ | |
2341 | --- linux/kernel/timer.c.orig Sun Jan 6 13:55:49 2002 | |
2342 | +++ linux/kernel/timer.c Mon Jan 21 12:53:05 2002 | |
2343 | @@ -25,6 +25,8 @@ | |
2344 | ||
2345 | #include <asm/uaccess.h> | |
2346 | ||
2347 | +struct kernel_stat kstat; | |
2348 | + | |
2349 | /* | |
2350 | * Timekeeping variables | |
2351 | */ | |
2352 | @@ -583,17 +585,16 @@ | |
2353 | ||
2354 | update_one_process(p, user_tick, system, cpu); | |
2355 | if (p->pid) { | |
2356 | - if (--p->counter <= 0) { | |
2357 | - p->counter = 0; | |
2358 | - p->need_resched = 1; | |
2359 | - } | |
2360 | - if (p->nice > 0) | |
2361 | + if (p->__nice > 0) | |
2362 | kstat.per_cpu_nice[cpu] += user_tick; | |
2363 | else | |
2364 | kstat.per_cpu_user[cpu] += user_tick; | |
2365 | kstat.per_cpu_system[cpu] += system; | |
2366 | - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) | |
2367 | - kstat.per_cpu_system[cpu] += system; | |
2368 | + } else { | |
2369 | + if (local_bh_count(cpu) || local_irq_count(cpu) > 1) | |
2370 | + kstat.per_cpu_system[cpu] += system; | |
2371 | + } | |
2372 | + scheduler_tick(p); | |
2373 | } | |
2374 | ||
2375 | /* | |
2376 | @@ -794,6 +795,89 @@ | |
2377 | ||
2378 | #endif | |
2379 | ||
2380 | +static void process_timeout(unsigned long __data) | |
2381 | +{ | |
2382 | + wake_up_process((task_t *)__data); | |
2383 | +} | |
2384 | + | |
2385 | +/** | |
2386 | + * schedule_timeout - sleep until timeout | |
2387 | + * @timeout: timeout value in jiffies | |
2388 | + * | |
2389 | + * Make the current task sleep until @timeout jiffies have | |
2390 | + * elapsed. The routine will return immediately unless | |
2391 | + * the current task state has been set (see set_current_state()). | |
2392 | + * | |
2393 | + * You can set the task state as follows - | |
2394 | + * | |
2395 | + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to | |
2396 | + * pass before the routine returns. The routine will return 0 | |
2397 | + * | |
2398 | + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | |
2399 | + * delivered to the current task. In this case the remaining time | |
2400 | + * in jiffies will be returned, or 0 if the timer expired in time | |
2401 | + * | |
2402 | + * The current task state is guaranteed to be TASK_RUNNING when this | |
2403 | + * routine returns. | |
2404 | + * | |
2405 | + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule | |
2406 | + * the CPU away without a bound on the timeout. In this case the return | |
2407 | + * value will be %MAX_SCHEDULE_TIMEOUT. | |
2408 | + * | |
2409 | + * In all cases the return value is guaranteed to be non-negative. | |
2410 | + */ | |
2411 | +signed long schedule_timeout(signed long timeout) | |
2412 | +{ | |
2413 | + struct timer_list timer; | |
2414 | + unsigned long expire; | |
2415 | + | |
2416 | + switch (timeout) | |
2417 | + { | |
2418 | + case MAX_SCHEDULE_TIMEOUT: | |
2419 | + /* | |
2420 | + * These two special cases are useful to be comfortable | |
2421 | + * in the caller. Nothing more. We could take | |
2422 | + * MAX_SCHEDULE_TIMEOUT from one of the negative value | |
2423 | + * but I'd like to return a valid offset (>=0) to allow | |
2424 | + * the caller to do everything it wants with the retval. | |
2425 | + */ | |
2426 | + schedule(); | |
2427 | + goto out; | |
2428 | + default: | |
2429 | + /* | |
2430 | + * Another bit of PARANOID. Note that the retval will be | |
2431 | + * 0 since no piece of kernel is supposed to do a check | |
2432 | + * for a negative retval of schedule_timeout() (since it | |
2433 | + * should never happen anyway). You just have the printk() | |
2434 | + * that will tell you if something has gone wrong and where. | |
2435 | + */ | |
2436 | + if (timeout < 0) | |
2437 | + { | |
2438 | + printk(KERN_ERR "schedule_timeout: wrong timeout " | |
2439 | + "value %lx from %p\n", timeout, | |
2440 | + __builtin_return_address(0)); | |
2441 | + current->state = TASK_RUNNING; | |
2442 | + goto out; | |
2443 | + } | |
2444 | + } | |
2445 | + | |
2446 | + expire = timeout + jiffies; | |
2447 | + | |
2448 | + init_timer(&timer); | |
2449 | + timer.expires = expire; | |
2450 | + timer.data = (unsigned long) current; | |
2451 | + timer.function = process_timeout; | |
2452 | + | |
2453 | + add_timer(&timer); | |
2454 | + schedule(); | |
2455 | + del_timer_sync(&timer); | |
2456 | + | |
2457 | + timeout = expire - jiffies; | |
2458 | + | |
2459 | + out: | |
2460 | + return timeout < 0 ? 0 : timeout; | |
2461 | +} | |
2462 | + | |
2463 | /* Thread ID - the internal kernel "pid" */ | |
2464 | asmlinkage long sys_gettid(void) | |
2465 | { | |
2466 | @@ -840,4 +924,3 @@ | |
2467 | } | |
2468 | return 0; | |
2469 | } | |
2470 | - | |
2471 | --- linux/kernel/fork.c.orig Sun Jan 6 13:55:56 2002 | |
2472 | +++ linux/kernel/fork.c Thu Jan 24 13:45:09 2002 | |
2473 | @@ -28,7 +28,6 @@ | |
2474 | ||
2475 | /* The idle threads do not count.. */ | |
2476 | int nr_threads; | |
2477 | -int nr_running; | |
2478 | ||
2479 | int max_threads; | |
2480 | unsigned long total_forks; /* Handle normal Linux uptimes. */ | |
2481 | @@ -36,6 +35,8 @@ | |
2482 | ||
2483 | struct task_struct *pidhash[PIDHASH_SZ]; | |
2484 | ||
2485 | +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ | |
2486 | + | |
2487 | void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) | |
2488 | { | |
2489 | unsigned long flags; | |
2490 | @@ -563,6 +564,7 @@ | |
2491 | struct pt_regs *regs, unsigned long stack_size) | |
2492 | { | |
2493 | int retval; | |
2494 | + unsigned long flags; | |
2495 | struct task_struct *p; | |
2496 | struct completion vfork; | |
2497 | ||
2498 | @@ -611,8 +613,7 @@ | |
2499 | copy_flags(clone_flags, p); | |
2500 | p->pid = get_pid(clone_flags); | |
2501 | ||
2502 | - p->run_list.next = NULL; | |
2503 | - p->run_list.prev = NULL; | |
2504 | + INIT_LIST_HEAD(&p->run_list); | |
2505 | ||
2506 | p->p_cptr = NULL; | |
2507 | init_waitqueue_head(&p->wait_chldexit); | |
2508 | @@ -638,14 +639,15 @@ | |
2509 | #ifdef CONFIG_SMP | |
2510 | { | |
2511 | int i; | |
2512 | - p->cpus_runnable = ~0UL; | |
2513 | - p->processor = current->processor; | |
2514 | + | |
2515 | /* ?? should we just memset this ?? */ | |
2516 | for(i = 0; i < smp_num_cpus; i++) | |
2517 | - p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; | |
2518 | + p->per_cpu_utime[cpu_logical_map(i)] = | |
2519 | + p->per_cpu_stime[cpu_logical_map(i)] = 0; | |
2520 | spin_lock_init(&p->sigmask_lock); | |
2521 | } | |
2522 | #endif | |
2523 | + p->array = NULL; | |
2524 | p->lock_depth = -1; /* -1 = no lock */ | |
2525 | p->start_time = jiffies; | |
2526 | ||
2527 | @@ -677,15 +679,27 @@ | |
2528 | p->pdeath_signal = 0; | |
2529 | ||
2530 | /* | |
2531 | - * "share" dynamic priority between parent and child, thus the | |
2532 | - * total amount of dynamic priorities in the system doesnt change, | |
2533 | - * more scheduling fairness. This is only important in the first | |
2534 | - * timeslice, on the long run the scheduling behaviour is unchanged. | |
2535 | + * Share the timeslice between parent and child, thus the | |
2536 | + * total amount of pending timeslices in the system doesn't change, | |
2537 | + * resulting in more scheduling fairness. | |
2538 | */ | |
2539 | - p->counter = (current->counter + 1) >> 1; | |
2540 | - current->counter >>= 1; | |
2541 | - if (!current->counter) | |
2542 | - current->need_resched = 1; | |
2543 | + __save_flags(flags); | |
2544 | + __cli(); | |
2545 | + if (!current->time_slice) | |
2546 | + BUG(); | |
2547 | + p->time_slice = (current->time_slice + 1) >> 1; | |
2548 | + current->time_slice >>= 1; | |
2549 | + if (!current->time_slice) { | |
2550 | + /* | |
2551 | + * This case is rare, it happens when the parent has only | |
2552 | + * a single jiffy left from its timeslice. Taking the | |
2553 | + * runqueue lock is not a problem. | |
2554 | + */ | |
2555 | + current->time_slice = 1; | |
2556 | + scheduler_tick(current); | |
2557 | + } | |
2558 | + p->sleep_timestamp = jiffies; | |
2559 | + __restore_flags(flags); | |
2560 | ||
2561 | /* | |
2562 | * Ok, add it to the run-queues and make it | |
2563 | @@ -722,10 +736,23 @@ | |
2564 | if (p->ptrace & PT_PTRACED) | |
2565 | send_sig(SIGSTOP, p, 1); | |
2566 | ||
2567 | +#define RUN_CHILD_FIRST 1 | |
2568 | +#if RUN_CHILD_FIRST | |
2569 | + wake_up_forked_process(p); /* do this last */ | |
2570 | +#else | |
2571 | wake_up_process(p); /* do this last */ | |
2572 | +#endif | |
2573 | ++total_forks; | |
2574 | if (clone_flags & CLONE_VFORK) | |
2575 | wait_for_completion(&vfork); | |
2576 | +#if RUN_CHILD_FIRST | |
2577 | + else | |
2578 | + /* | |
2579 | + * Let the child process run first, to avoid most of the | |
2580 | + * COW overhead when the child exec()s afterwards. | |
2581 | + */ | |
2582 | + current->need_resched = 1; | |
2583 | +#endif | |
2584 | ||
2585 | fork_out: | |
2586 | return retval; | |
2587 | --- linux/kernel/softirq.c.orig Sun Jan 6 13:55:53 2002 | |
2588 | +++ linux/kernel/softirq.c Wed Jan 16 00:52:11 2002 | |
2589 | @@ -259,10 +259,9 @@ | |
2590 | ||
2591 | while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { | |
2592 | current->state = TASK_RUNNING; | |
2593 | - do { | |
2594 | - current->policy |= SCHED_YIELD; | |
2595 | - schedule(); | |
2596 | - } while (test_bit(TASKLET_STATE_SCHED, &t->state)); | |
2597 | + do | |
2598 | + sys_sched_yield(); | |
2599 | + while (test_bit(TASKLET_STATE_SCHED, &t->state)); | |
2600 | } | |
2601 | tasklet_unlock_wait(t); | |
2602 | clear_bit(TASKLET_STATE_SCHED, &t->state); | |
2603 | @@ -365,13 +364,13 @@ | |
2604 | int cpu = cpu_logical_map(bind_cpu); | |
2605 | ||
2606 | daemonize(); | |
2607 | - current->nice = 19; | |
2608 | + set_user_nice(current, 19); | |
2609 | sigfillset(&current->blocked); | |
2610 | ||
2611 | /* Migrate to the right CPU */ | |
2612 | - current->cpus_allowed = 1UL << cpu; | |
2613 | - while (smp_processor_id() != cpu) | |
2614 | - schedule(); | |
2615 | + set_cpus_allowed(current, 1UL << cpu); | |
2616 | + if (cpu() != cpu) | |
2617 | + BUG(); | |
2618 | ||
2619 | sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu); | |
2620 | ||
2621 | @@ -396,7 +395,7 @@ | |
2622 | } | |
2623 | } | |
2624 | ||
2625 | -static __init int spawn_ksoftirqd(void) | |
2626 | +__init int spawn_ksoftirqd(void) | |
2627 | { | |
2628 | int cpu; | |
2629 | ||
2630 | @@ -405,14 +404,12 @@ | |
2631 | CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0) | |
2632 | printk("spawn_ksoftirqd() failed for cpu %d\n", cpu); | |
2633 | else { | |
2634 | - while (!ksoftirqd_task(cpu_logical_map(cpu))) { | |
2635 | - current->policy |= SCHED_YIELD; | |
2636 | - schedule(); | |
2637 | - } | |
2638 | + while (!ksoftirqd_task(cpu_logical_map(cpu))) | |
2639 | + sys_sched_yield(); | |
2640 | } | |
2641 | } | |
2642 | ||
2643 | return 0; | |
2644 | } | |
2645 | ||
2646 | -__initcall(spawn_ksoftirqd); | |
2647 | +__initcall(spawn_ksoftirqd); | |
2648 | --- linux/kernel/ptrace.c.orig Sun Jan 6 13:55:57 2002 | |
2649 | +++ linux/kernel/ptrace.c Sun Jan 6 13:56:25 2002 | |
2650 | @@ -31,20 +31,7 @@ | |
2651 | if (child->state != TASK_STOPPED) | |
2652 | return -ESRCH; | |
2653 | #ifdef CONFIG_SMP | |
2654 | - /* Make sure the child gets off its CPU.. */ | |
2655 | - for (;;) { | |
2656 | - task_lock(child); | |
2657 | - if (!task_has_cpu(child)) | |
2658 | - break; | |
2659 | - task_unlock(child); | |
2660 | - do { | |
2661 | - if (child->state != TASK_STOPPED) | |
2662 | - return -ESRCH; | |
2663 | - barrier(); | |
2664 | - cpu_relax(); | |
2665 | - } while (task_has_cpu(child)); | |
2666 | - } | |
2667 | - task_unlock(child); | |
2668 | + wait_task_inactive(child); | |
2669 | #endif | |
2670 | } | |
2671 | ||
2672 | --- linux/kernel/sys.c.orig Sun Jan 6 13:55:47 2002 | |
2673 | +++ linux/kernel/sys.c Sun Jan 6 13:56:25 2002 | |
2674 | @@ -220,10 +220,10 @@ | |
2675 | } | |
2676 | if (error == -ESRCH) | |
2677 | error = 0; | |
2678 | - if (niceval < p->nice && !capable(CAP_SYS_NICE)) | |
2679 | + if (niceval < p->__nice && !capable(CAP_SYS_NICE)) | |
2680 | error = -EACCES; | |
2681 | else | |
2682 | - p->nice = niceval; | |
2683 | + set_user_nice(p, niceval); | |
2684 | } | |
2685 | read_unlock(&tasklist_lock); | |
2686 | ||
2687 | @@ -249,7 +249,7 @@ | |
2688 | long niceval; | |
2689 | if (!proc_sel(p, which, who)) | |
2690 | continue; | |
2691 | - niceval = 20 - p->nice; | |
2692 | + niceval = 20 - p->__nice; | |
2693 | if (niceval > retval) | |
2694 | retval = niceval; | |
2695 | } | |
2696 | --- linux/kernel/signal.c.orig Sun Jan 6 13:55:56 2002 | |
2697 | +++ linux/kernel/signal.c Sun Jan 6 13:56:25 2002 | |
2698 | @@ -478,12 +478,9 @@ | |
2699 | * process of changing - but no harm is done by that | |
2700 | * other than doing an extra (lightweight) IPI interrupt. | |
2701 | */ | |
2702 | - spin_lock(&runqueue_lock); | |
2703 | - if (task_has_cpu(t) && t->processor != smp_processor_id()) | |
2704 | - smp_send_reschedule(t->processor); | |
2705 | - spin_unlock(&runqueue_lock); | |
2706 | -#endif /* CONFIG_SMP */ | |
2707 | - | |
2708 | + if ((t->state == TASK_RUNNING) && (t->cpu != cpu())) | |
2709 | + kick_if_running(t); | |
2710 | +#endif | |
2711 | if (t->state & TASK_INTERRUPTIBLE) { | |
2712 | wake_up_process(t); | |
2713 | return; | |
2714 | --- linux/kernel/printk.c.orig Sun Jan 6 13:55:57 2002 | |
2715 | +++ linux/kernel/printk.c Sun Jan 6 13:56:25 2002 | |
2716 | @@ -25,6 +25,7 @@ | |
2717 | #include <linux/module.h> | |
2718 | #include <linux/interrupt.h> /* For in_interrupt() */ | |
2719 | #include <linux/config.h> | |
2720 | +#include <linux/delay.h> | |
2721 | ||
2722 | #include <asm/uaccess.h> | |
2723 | ||
2724 | --- linux/kernel/ksyms.c.orig Sun Jan 6 13:55:57 2002 | |
2725 | +++ linux/kernel/ksyms.c Thu Jan 10 22:55:43 2002 | |
2726 | @@ -437,6 +437,9 @@ | |
2727 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | |
2728 | EXPORT_SYMBOL(schedule); | |
2729 | EXPORT_SYMBOL(schedule_timeout); | |
2730 | +EXPORT_SYMBOL(sys_sched_yield); | |
2731 | +EXPORT_SYMBOL(set_user_nice); | |
2732 | +EXPORT_SYMBOL(set_cpus_allowed); | |
2733 | EXPORT_SYMBOL(jiffies); | |
2734 | EXPORT_SYMBOL(xtime); | |
2735 | EXPORT_SYMBOL(do_gettimeofday); | |
2736 | @@ -448,6 +451,7 @@ | |
2737 | ||
2738 | EXPORT_SYMBOL(kstat); | |
2739 | EXPORT_SYMBOL(nr_running); | |
2740 | +EXPORT_SYMBOL(nr_context_switches); | |
2741 | ||
2742 | /* misc */ | |
2743 | EXPORT_SYMBOL(panic); | |
2744 | --- linux/mm/oom_kill.c.orig Sun Jan 6 13:55:53 2002 | |
2745 | +++ linux/mm/oom_kill.c Sun Jan 6 13:56:25 2002 | |
2746 | @@ -82,7 +82,7 @@ | |
2747 | * Niced processes are most likely less important, so double | |
2748 | * their badness points. | |
2749 | */ | |
2750 | - if (p->nice > 0) | |
2751 | + if (p->__nice > 0) | |
2752 | points *= 2; | |
2753 | ||
2754 | /* | |
2755 | @@ -149,7 +149,7 @@ | |
2756 | * all the memory it needs. That way it should be able to | |
2757 | * exit() and clear out its resources quickly... | |
2758 | */ | |
2759 | - p->counter = 5 * HZ; | |
2760 | + p->time_slice = 2 * MAX_TIMESLICE; | |
2761 | p->flags |= PF_MEMALLOC | PF_MEMDIE; | |
2762 | ||
2763 | /* This process has hardware access, be more careful. */ | |
2764 | @@ -188,8 +188,7 @@ | |
2765 | * killing itself before someone else gets the chance to ask | |
2766 | * for more memory. | |
2767 | */ | |
2768 | - current->policy |= SCHED_YIELD; | |
2769 | - schedule(); | |
2770 | + yield(); | |
2771 | return; | |
2772 | } | |
2773 | ||
2774 | --- linux/mm/page_alloc.c.orig Sun Jan 6 13:55:56 2002 | |
2775 | +++ linux/mm/page_alloc.c Fri Jan 25 14:26:36 2002 | |
2776 | @@ -394,9 +394,8 @@ | |
2777 | return NULL; | |
2778 | ||
2779 | /* Yield for kswapd, and try again */ | |
2780 | - current->policy |= SCHED_YIELD; | |
2781 | __set_current_state(TASK_RUNNING); | |
2782 | - schedule(); | |
2783 | + yield(); | |
2784 | goto rebalance; | |
2785 | } | |
2786 | ||
2787 | --- linux/mm/highmem.c.orig Sun Jan 6 13:55:57 2002 | |
2788 | +++ linux/mm/highmem.c Fri Jan 25 14:26:56 2002 | |
2789 | @@ -354,9 +354,8 @@ | |
2790 | /* we need to wait I/O completion */ | |
2791 | run_task_queue(&tq_disk); | |
2792 | ||
2793 | - current->policy |= SCHED_YIELD; | |
2794 | __set_current_state(TASK_RUNNING); | |
2795 | - schedule(); | |
2796 | + yield(); | |
2797 | goto repeat_alloc; | |
2798 | } | |
2799 | ||
2800 | @@ -392,9 +391,8 @@ | |
2801 | /* we need to wait I/O completion */ | |
2802 | run_task_queue(&tq_disk); | |
2803 | ||
2804 | - current->policy |= SCHED_YIELD; | |
2805 | __set_current_state(TASK_RUNNING); | |
2806 | - schedule(); | |
2807 | + yield(); | |
2808 | goto repeat_alloc; | |
2809 | } | |
2810 | ||
2811 | --- linux/include/linux/sched.h.orig Sun Jan 6 13:55:57 2002 | |
2812 | +++ linux/include/linux/sched.h Mon Jan 28 18:48:01 2002 | |
2813 | @@ -6,6 +6,7 @@ | |
2814 | extern unsigned long event; | |
2815 | ||
2816 | #include <linux/config.h> | |
2817 | +#include <linux/compiler.h> | |
2818 | #include <linux/binfmts.h> | |
2819 | #include <linux/threads.h> | |
2820 | #include <linux/kernel.h> | |
2821 | @@ -42,6 +43,7 @@ | |
2822 | #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ | |
2823 | #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ | |
2824 | #define CLONE_THREAD 0x00010000 /* Same thread group? */ | |
2825 | +#define CLONE_NEWNS 0x00020000 /* New namespace group? */ | |
2826 | ||
2827 | #define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) | |
2828 | ||
2829 | @@ -72,8 +74,9 @@ | |
2830 | #define CT_TO_SECS(x) ((x) / HZ) | |
2831 | #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) | |
2832 | ||
2833 | -extern int nr_running, nr_threads; | |
2834 | +extern int nr_threads; | |
2835 | extern int last_pid; | |
2836 | +extern unsigned long nr_running(void); | |
2837 | ||
2838 | #include <linux/fs.h> | |
2839 | #include <linux/time.h> | |
2840 | @@ -116,12 +119,6 @@ | |
2841 | #define SCHED_FIFO 1 | |
2842 | #define SCHED_RR 2 | |
2843 | ||
2844 | -/* | |
2845 | - * This is an additional bit set when we want to | |
2846 | - * yield the CPU for one re-schedule.. | |
2847 | - */ | |
2848 | -#define SCHED_YIELD 0x10 | |
2849 | - | |
2850 | struct sched_param { | |
2851 | int sched_priority; | |
2852 | }; | |
2853 | @@ -139,17 +136,22 @@ | |
2854 | * a separate lock). | |
2855 | */ | |
2856 | extern rwlock_t tasklist_lock; | |
2857 | -extern spinlock_t runqueue_lock; | |
2858 | extern spinlock_t mmlist_lock; | |
2859 | ||
2860 | +typedef struct task_struct task_t; | |
2861 | + | |
2862 | extern void sched_init(void); | |
2863 | -extern void init_idle(void); | |
2864 | +extern void init_idle(task_t *idle, int cpu); | |
2865 | extern void show_state(void); | |
2866 | extern void cpu_init (void); | |
2867 | extern void trap_init(void); | |
2868 | extern void update_process_times(int user); | |
2869 | -extern void update_one_process(struct task_struct *p, unsigned long user, | |
2870 | +extern void update_one_process(task_t *p, unsigned long user, | |
2871 | unsigned long system, int cpu); | |
2872 | +extern void scheduler_tick(task_t *p); | |
2873 | +extern void sched_task_migrated(task_t *p); | |
2874 | +extern void smp_migrate_task(int cpu, task_t *task); | |
2875 | +extern unsigned long cache_decay_ticks; | |
2876 | ||
2877 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX | |
2878 | extern signed long FASTCALL(schedule_timeout(signed long timeout)); | |
2879 | @@ -166,6 +168,7 @@ | |
2880 | */ | |
2881 | #define NR_OPEN_DEFAULT BITS_PER_LONG | |
2882 | ||
2883 | +struct namespace; | |
2884 | /* | |
2885 | * Open file table structure | |
2886 | */ | |
2887 | @@ -278,6 +281,8 @@ | |
2888 | extern struct user_struct root_user; | |
2889 | #define INIT_USER (&root_user) | |
2890 | ||
2891 | +typedef struct prio_array prio_array_t; | |
2892 | + | |
2893 | struct task_struct { | |
2894 | /* | |
2895 | * offsets of these are hardcoded elsewhere - touch with care | |
2896 | @@ -295,35 +300,28 @@ | |
2897 | ||
2898 | int lock_depth; /* Lock depth */ | |
2899 | ||
2900 | -/* | |
2901 | - * offset 32 begins here on 32-bit platforms. We keep | |
2902 | - * all fields in a single cacheline that are needed for | |
2903 | - * the goodness() loop in schedule(). | |
2904 | - */ | |
2905 | - long counter; | |
2906 | - long nice; | |
2907 | - unsigned long policy; | |
2908 | - struct mm_struct *mm; | |
2909 | - int processor; | |
2910 | /* | |
2911 | - * cpus_runnable is ~0 if the process is not running on any | |
2912 | - * CPU. It's (1 << cpu) if it's running on a CPU. This mask | |
2913 | - * is updated under the runqueue lock. | |
2914 | - * | |
2915 | - * To determine whether a process might run on a CPU, this | |
2916 | - * mask is AND-ed with cpus_allowed. | |
2917 | + * offset 32 begins here on 32-bit platforms. | |
2918 | */ | |
2919 | - unsigned long cpus_runnable, cpus_allowed; | |
2920 | - /* | |
2921 | - * (only the 'next' pointer fits into the cacheline, but | |
2922 | - * that's just fine.) | |
2923 | - */ | |
2924 | - struct list_head run_list; | |
2925 | - unsigned long sleep_time; | |
2926 | + unsigned int cpu; | |
2927 | + int prio; | |
2928 | + long __nice; | |
2929 | + list_t run_list; | |
2930 | + prio_array_t *array; | |
2931 | + | |
2932 | + unsigned int time_slice; | |
2933 | ||
2934 | - struct task_struct *next_task, *prev_task; | |
2935 | - struct mm_struct *active_mm; | |
2936 | + unsigned long sleep_avg; | |
2937 | + unsigned long sleep_timestamp; | |
2938 | + | |
2939 | + unsigned long policy; | |
2940 | + unsigned long cpus_allowed; | |
2941 | + | |
2942 | + task_t *next_task, *prev_task; | |
2943 | + | |
2944 | + struct mm_struct *mm, *active_mm; | |
2945 | struct list_head local_pages; | |
2946 | + | |
2947 | unsigned int allocation_order, nr_local_pages; | |
2948 | ||
2949 | /* task state */ | |
2950 | @@ -345,12 +343,12 @@ | |
2951 | * older sibling, respectively. (p->father can be replaced with | |
2952 | * p->p_pptr->pid) | |
2953 | */ | |
2954 | - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; | |
2955 | + task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; | |
2956 | struct list_head thread_group; | |
2957 | ||
2958 | /* PID hash table linkage. */ | |
2959 | - struct task_struct *pidhash_next; | |
2960 | - struct task_struct **pidhash_pprev; | |
2961 | + task_t *pidhash_next; | |
2962 | + task_t **pidhash_pprev; | |
2963 | ||
2964 | wait_queue_head_t wait_chldexit; /* for wait4() */ | |
2965 | struct completion *vfork_done; /* for vfork() */ | |
2966 | @@ -389,6 +387,8 @@ | |
2967 | struct fs_struct *fs; | |
2968 | /* open file information */ | |
2969 | struct files_struct *files; | |
2970 | +/* namespace */ | |
2971 | + struct namespace *namespace; | |
2972 | /* signal handlers */ | |
2973 | spinlock_t sigmask_lock; /* Protects signal and blocked */ | |
2974 | struct signal_struct *sig; | |
2975 | @@ -446,10 +446,66 @@ | |
2976 | */ | |
2977 | #define _STK_LIM (8*1024*1024) | |
2978 | ||
2979 | -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ | |
2980 | -#define MAX_COUNTER (20*HZ/100) | |
2981 | -#define DEF_NICE (0) | |
2982 | +/* | |
2983 | + * RT priorities go from 0 to 99, but internally we max | |
2984 | + * them out at 128 to make it easier to search the | |
2985 | + * scheduler bitmap. | |
2986 | + */ | |
2987 | +#define MAX_RT_PRIO 128 | |
2988 | +/* | |
2989 | + * The lower the priority value of a process, the more likely it is | |
2990 | + * to run. Priority of a process goes from 0 to 167. The 0-99 | |
2991 | + * priority range is allocated to RT tasks, the 128-167 range | |
2992 | + * is for SCHED_OTHER tasks. | |
2993 | + */ | |
2994 | +#define MAX_PRIO (MAX_RT_PRIO + 40) | |
2995 | + | |
2996 | +/* | |
2997 | + * Scales user-nice values [ -20 ... 0 ... 19 ] | |
2998 | + * to static priority [ 128 ... 167 (MAX_PRIO-1) ] | |
2999 | + * | |
3000 | + * User-nice value of -20 == static priority 128, and | |
3001 | + * user-nice value 19 == static priority 167. The lower | |
3002 | + * the priority value, the higher the task's priority. | |
3003 | + */ | |
3004 | +#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20) | |
3005 | +#define DEF_USER_NICE 0 | |
3006 | + | |
3007 | +/* | |
3008 | + * Minimum timeslice is 10 msecs, maximum is 300 msecs. | |
3009 | + * A nice-0 task gets roughly 150 msecs (see NICE_TO_TIMESLICE). | |
3010 | + */ | |
3011 | +#define MIN_TIMESLICE ( 10 * HZ / 1000) | |
3012 | +#define MAX_TIMESLICE (300 * HZ / 1000) | |
3013 | +#define CHILD_FORK_PENALTY 95 | |
3014 | +#define PARENT_FORK_PENALTY 100 | |
3015 | +#define EXIT_WEIGHT 3 | |
3016 | +#define PRIO_INTERACTIVE_RATIO 20 | |
3017 | +#define PRIO_CPU_HOG_RATIO 60 | |
3018 | +#define PRIO_BONUS_RATIO 70 | |
3019 | +#define INTERACTIVE_DELTA 3 | |
3020 | +#define MAX_SLEEP_AVG (2*HZ) | |
3021 | +#define STARVATION_LIMIT (2*HZ) | |
3022 | + | |
3023 | +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) | |
3024 | +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | |
3025 | + | |
3026 | +/* | |
3027 | + * NICE_TO_TIMESLICE scales nice values [ -20 ... 19 ] | |
3028 | + * to time slice values. | |
3029 | + * | |
3030 | + * The higher a process's priority, the bigger timeslices | |
3031 | + * it gets during one round of execution. But even the lowest | |
3032 | + * priority process gets MIN_TIMESLICE worth of execution time. | |
3033 | + */ | |
3034 | ||
3035 | +#define NICE_TO_TIMESLICE(n) (MIN_TIMESLICE + \ | |
3036 | + ((MAX_TIMESLICE - MIN_TIMESLICE) * (19-(n))) / 39) | |
3037 | + | |
3038 | +extern void set_cpus_allowed(task_t *p, unsigned long new_mask); | |
3039 | +extern void set_user_nice(task_t *p, long nice); | |
3040 | +asmlinkage long sys_sched_yield(void); | |
3041 | +#define yield() sys_sched_yield() | |
3042 | ||
3043 | /* | |
3044 | * The default (Linux) execution domain. | |
3045 | @@ -468,14 +524,13 @@ | |
3046 | addr_limit: KERNEL_DS, \ | |
3047 | exec_domain: &default_exec_domain, \ | |
3048 | lock_depth: -1, \ | |
3049 | - counter: DEF_COUNTER, \ | |
3050 | - nice: DEF_NICE, \ | |
3051 | + __nice: DEF_USER_NICE, \ | |
3052 | policy: SCHED_OTHER, \ | |
3053 | + cpus_allowed: -1, \ | |
3054 | mm: NULL, \ | |
3055 | active_mm: &init_mm, \ | |
3056 | - cpus_runnable: -1, \ | |
3057 | - cpus_allowed: -1, \ | |
3058 | run_list: LIST_HEAD_INIT(tsk.run_list), \ | |
3059 | + time_slice: HZ, \ | |
3060 | next_task: &tsk, \ | |
3061 | prev_task: &tsk, \ | |
3062 | p_opptr: &tsk, \ | |
3063 | @@ -509,24 +564,24 @@ | |
3064 | #endif | |
3065 | ||
3066 | union task_union { | |
3067 | - struct task_struct task; | |
3068 | + task_t task; | |
3069 | unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; | |
3070 | }; | |
3071 | ||
3072 | extern union task_union init_task_union; | |
3073 | ||
3074 | extern struct mm_struct init_mm; | |
3075 | -extern struct task_struct *init_tasks[NR_CPUS]; | |
3076 | +extern task_t *init_tasks[NR_CPUS]; | |
3077 | ||
3078 | /* PID hashing. (shouldnt this be dynamic?) */ | |
3079 | #define PIDHASH_SZ (4096 >> 2) | |
3080 | -extern struct task_struct *pidhash[PIDHASH_SZ]; | |
3081 | +extern task_t *pidhash[PIDHASH_SZ]; | |
3082 | ||
3083 | #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) | |
3084 | ||
3085 | -static inline void hash_pid(struct task_struct *p) | |
3086 | +static inline void hash_pid(task_t *p) | |
3087 | { | |
3088 | - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; | |
3089 | + task_t **htable = &pidhash[pid_hashfn(p->pid)]; | |
3090 | ||
3091 | if((p->pidhash_next = *htable) != NULL) | |
3092 | (*htable)->pidhash_pprev = &p->pidhash_next; | |
3093 | @@ -534,16 +589,16 @@ | |
3094 | p->pidhash_pprev = htable; | |
3095 | } | |
3096 | ||
3097 | -static inline void unhash_pid(struct task_struct *p) | |
3098 | +static inline void unhash_pid(task_t *p) | |
3099 | { | |
3100 | if(p->pidhash_next) | |
3101 | p->pidhash_next->pidhash_pprev = p->pidhash_pprev; | |
3102 | *p->pidhash_pprev = p->pidhash_next; | |
3103 | } | |
3104 | ||
3105 | -static inline struct task_struct *find_task_by_pid(int pid) | |
3106 | +static inline task_t *find_task_by_pid(int pid) | |
3107 | { | |
3108 | - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; | |
3109 | + task_t *p, **htable = &pidhash[pid_hashfn(pid)]; | |
3110 | ||
3111 | for(p = *htable; p && p->pid != pid; p = p->pidhash_next) | |
3112 | ; | |
3113 | @@ -551,19 +606,6 @@ | |
3114 | return p; | |
3115 | } | |
3116 | ||
3117 | -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) | |
3118 | - | |
3119 | -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) | |
3120 | -{ | |
3121 | - tsk->processor = cpu; | |
3122 | - tsk->cpus_runnable = 1UL << cpu; | |
3123 | -} | |
3124 | - | |
3125 | -static inline void task_release_cpu(struct task_struct *tsk) | |
3126 | -{ | |
3127 | - tsk->cpus_runnable = ~0UL; | |
3128 | -} | |
3129 | - | |
3130 | /* per-UID process charging. */ | |
3131 | extern struct user_struct * alloc_uid(uid_t); | |
3132 | extern void free_uid(struct user_struct *); | |
3133 | @@ -590,7 +632,8 @@ | |
3134 | extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); | |
3135 | extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, | |
3136 | signed long timeout)); | |
3137 | -extern int FASTCALL(wake_up_process(struct task_struct * tsk)); | |
3138 | +extern int FASTCALL(wake_up_process(task_t * tsk)); | |
3139 | +extern void FASTCALL(wake_up_forked_process(task_t * tsk)); | |
3140 | ||
3141 | #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) | |
3142 | #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) | |
3143 | @@ -608,28 +651,28 @@ | |
3144 | extern int in_egroup_p(gid_t); | |
3145 | ||
3146 | extern void proc_caches_init(void); | |
3147 | -extern void flush_signals(struct task_struct *); | |
3148 | -extern void flush_signal_handlers(struct task_struct *); | |
3149 | +extern void flush_signals(task_t *); | |
3150 | +extern void flush_signal_handlers(task_t *); | |
3151 | extern int dequeue_signal(sigset_t *, siginfo_t *); | |
3152 | extern void block_all_signals(int (*notifier)(void *priv), void *priv, | |
3153 | sigset_t *mask); | |
3154 | extern void unblock_all_signals(void); | |
3155 | -extern int send_sig_info(int, struct siginfo *, struct task_struct *); | |
3156 | -extern int force_sig_info(int, struct siginfo *, struct task_struct *); | |
3157 | +extern int send_sig_info(int, struct siginfo *, task_t *); | |
3158 | +extern int force_sig_info(int, struct siginfo *, task_t *); | |
3159 | extern int kill_pg_info(int, struct siginfo *, pid_t); | |
3160 | extern int kill_sl_info(int, struct siginfo *, pid_t); | |
3161 | extern int kill_proc_info(int, struct siginfo *, pid_t); | |
3162 | -extern void notify_parent(struct task_struct *, int); | |
3163 | -extern void do_notify_parent(struct task_struct *, int); | |
3164 | -extern void force_sig(int, struct task_struct *); | |
3165 | -extern int send_sig(int, struct task_struct *, int); | |
3166 | +extern void notify_parent(task_t *, int); | |
3167 | +extern void do_notify_parent(task_t *, int); | |
3168 | +extern void force_sig(int, task_t *); | |
3169 | +extern int send_sig(int, task_t *, int); | |
3170 | extern int kill_pg(pid_t, int, int); | |
3171 | extern int kill_sl(pid_t, int, int); | |
3172 | extern int kill_proc(pid_t, int, int); | |
3173 | extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); | |
3174 | extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); | |
3175 | ||
3176 | -static inline int signal_pending(struct task_struct *p) | |
3177 | +static inline int signal_pending(task_t *p) | |
3178 | { | |
3179 | return (p->sigpending != 0); | |
3180 | } | |
3181 | @@ -668,7 +711,7 @@ | |
3182 | This is required every time the blocked sigset_t changes. | |
3183 | All callers should have t->sigmask_lock. */ | |
3184 | ||
3185 | -static inline void recalc_sigpending(struct task_struct *t) | |
3186 | +static inline void recalc_sigpending(task_t *t) | |
3187 | { | |
3188 | t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); | |
3189 | } | |
3190 | @@ -775,16 +818,17 @@ | |
3191 | extern int expand_fdset(struct files_struct *, int nr); | |
3192 | extern void free_fdset(fd_set *, int); | |
3193 | ||
3194 | -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); | |
3195 | +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *); | |
3196 | extern void flush_thread(void); | |
3197 | extern void exit_thread(void); | |
3198 | ||
3199 | -extern void exit_mm(struct task_struct *); | |
3200 | -extern void exit_files(struct task_struct *); | |
3201 | -extern void exit_sighand(struct task_struct *); | |
3202 | +extern void exit_mm(task_t *); | |
3203 | +extern void exit_files(task_t *); | |
3204 | +extern void exit_sighand(task_t *); | |
3205 | ||
3206 | extern void reparent_to_init(void); | |
3207 | extern void daemonize(void); | |
3208 | +extern task_t *child_reaper; | |
3209 | ||
3210 | extern int do_execve(char *, char **, char **, struct pt_regs *); | |
3211 | extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); | |
3212 | @@ -793,6 +837,9 @@ | |
3213 | extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); | |
3214 | extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); | |
3215 | ||
3216 | +extern void wait_task_inactive(task_t * p); | |
3217 | +extern void kick_if_running(task_t * p); | |
3218 | + | |
3219 | #define __wait_event(wq, condition) \ | |
3220 | do { \ | |
3221 | wait_queue_t __wait; \ | |
3222 | @@ -871,24 +918,10 @@ | |
3223 | for (p = &init_task ; (p = p->next_task) != &init_task ; ) | |
3224 | ||
3225 | #define next_thread(p) \ | |
3226 | - list_entry((p)->thread_group.next, struct task_struct, thread_group) | |
3227 | - | |
3228 | -static inline void del_from_runqueue(struct task_struct * p) | |
3229 | -{ | |
3230 | - nr_running--; | |
3231 | - p->sleep_time = jiffies; | |
3232 | - list_del(&p->run_list); | |
3233 | - p->run_list.next = NULL; | |
3234 | -} | |
3235 | - | |
3236 | -static inline int task_on_runqueue(struct task_struct *p) | |
3237 | -{ | |
3238 | - return (p->run_list.next != NULL); | |
3239 | -} | |
3240 | + list_entry((p)->thread_group.next, task_t, thread_group) | |
3241 | ||
3242 | -static inline void unhash_process(struct task_struct *p) | |
3243 | +static inline void unhash_process(task_t *p) | |
3244 | { | |
3245 | - if (task_on_runqueue(p)) BUG(); | |
3246 | write_lock_irq(&tasklist_lock); | |
3247 | nr_threads--; | |
3248 | unhash_pid(p); | |
3249 | @@ -898,12 +931,12 @@ | |
3250 | } | |
3251 | ||
3252 | /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ | |
3253 | -static inline void task_lock(struct task_struct *p) | |
3254 | +static inline void task_lock(task_t *p) | |
3255 | { | |
3256 | spin_lock(&p->alloc_lock); | |
3257 | } | |
3258 | ||
3259 | -static inline void task_unlock(struct task_struct *p) | |
3260 | +static inline void task_unlock(task_t *p) | |
3261 | { | |
3262 | spin_unlock(&p->alloc_lock); | |
3263 | } | |
3264 | --- linux/include/linux/list.h.orig Sun Jan 6 13:55:57 2002 | |
3265 | +++ linux/include/linux/list.h Mon Jan 28 18:48:00 2002 | |
3266 | @@ -19,6 +19,8 @@ | |
3267 | struct list_head *next, *prev; | |
3268 | }; | |
3269 | ||
3270 | +typedef struct list_head list_t; | |
3271 | + | |
3272 | #define LIST_HEAD_INIT(name) { &(name), &(name) } | |
3273 | ||
3274 | #define LIST_HEAD(name) \ | |
3275 | --- linux/include/linux/kernel_stat.h.orig Tue Aug 21 14:26:23 2001 | |
3276 | +++ linux/include/linux/kernel_stat.h Mon Jan 28 18:48:00 2002 | |
3277 | @@ -32,10 +32,11 @@ | |
3278 | unsigned int ipackets, opackets; | |
3279 | unsigned int ierrors, oerrors; | |
3280 | unsigned int collisions; | |
3281 | - unsigned int context_swtch; | |
3282 | }; | |
3283 | ||
3284 | extern struct kernel_stat kstat; | |
3285 | + | |
3286 | +extern unsigned long nr_context_switches(void); | |
3287 | ||
3288 | #if !defined(CONFIG_ARCH_S390) | |
3289 | /* | |
3290 | --- linux/include/linux/smp.h.orig Sun Dec 31 20:10:17 2000 | |
3291 | +++ linux/include/linux/smp.h Mon Jan 28 18:48:00 2002 | |
3292 | @@ -86,6 +86,14 @@ | |
3293 | #define cpu_number_map(cpu) 0 | |
3294 | #define smp_call_function(func,info,retry,wait) ({ 0; }) | |
3295 | #define cpu_online_map 1 | |
3296 | +static inline void smp_send_reschedule(int cpu) { } | |
3297 | +static inline void smp_send_reschedule_all(void) { } | |
3298 | ||
3299 | #endif | |
3300 | + | |
3301 | +/* | |
3302 | + * Common definitions: | |
3303 | + */ | |
3304 | +#define cpu() smp_processor_id() | |
3305 | + | |
3306 | #endif | |
3307 | --- linux/include/asm-i386/smp.h.orig Sun Jan 6 13:55:57 2002 | |
3308 | +++ linux/include/asm-i386/smp.h Mon Jan 28 18:48:00 2002 | |
3309 | @@ -63,6 +63,7 @@ | |
3310 | extern void smp_flush_tlb(void); | |
3311 | extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); | |
3312 | extern void smp_send_reschedule(int cpu); | |
3313 | +extern void smp_send_reschedule_all(void); | |
3314 | extern void smp_invalidate_rcv(void); /* Process an NMI */ | |
3315 | extern void (*mtrr_hook) (void); | |
3316 | extern void zap_low_mappings (void); | |
3317 | @@ -104,7 +105,7 @@ | |
3318 | * so this is correct in the x86 case. | |
3319 | */ | |
3320 | ||
3321 | -#define smp_processor_id() (current->processor) | |
3322 | +#define smp_processor_id() (current->cpu) | |
3323 | ||
3324 | static __inline int hard_smp_processor_id(void) | |
3325 | { | |
3326 | @@ -121,18 +122,6 @@ | |
3327 | #endif /* !__ASSEMBLY__ */ | |
3328 | ||
3329 | #define NO_PROC_ID 0xFF /* No processor magic marker */ | |
3330 | - | |
3331 | -/* | |
3332 | - * This magic constant controls our willingness to transfer | |
3333 | - * a process across CPUs. Such a transfer incurs misses on the L1 | |
3334 | - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My | |
3335 | - * gut feeling is this will vary by board in value. For a board | |
3336 | - * with separate L2 cache it probably depends also on the RSS, and | |
3337 | - * for a board with shared L2 cache it ought to decay fast as other | |
3338 | - * processes are run. | |
3339 | - */ | |
3340 | - | |
3341 | -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */ | |
3342 | ||
3343 | #endif | |
3344 | #endif | |
3345 | --- linux/include/asm-i386/bitops.h.orig Tue Aug 21 14:26:16 2001 | |
3346 | +++ linux/include/asm-i386/bitops.h Mon Jan 28 18:48:00 2002 | |
3347 | @@ -75,6 +75,14 @@ | |
3348 | :"=m" (ADDR) | |
3349 | :"Ir" (nr)); | |
3350 | } | |
3351 | + | |
3352 | +static __inline__ void __clear_bit(int nr, volatile void * addr) | |
3353 | +{ | |
3354 | + __asm__ __volatile__( | |
3355 | + "btrl %1,%0" | |
3356 | + :"=m" (ADDR) | |
3357 | + :"Ir" (nr)); | |
3358 | +} | |
3359 | #define smp_mb__before_clear_bit() barrier() | |
3360 | #define smp_mb__after_clear_bit() barrier() | |
3361 | ||
3362 | @@ -284,6 +292,34 @@ | |
3363 | } | |
3364 | ||
3365 | /** | |
3366 | + * find_first_bit - find the first set bit in a memory region | |
3367 | + * @addr: The address to start the search at | |
3368 | + * @size: The maximum size to search | |
3369 | + * | |
3370 | + * Returns the bit-number of the first set bit, not the number of the byte | |
3371 | + * containing a bit. | |
3372 | + */ | |
3373 | +static __inline__ int find_first_bit(void * addr, unsigned size) | |
3374 | +{ | |
3375 | + int d0, d1; | |
3376 | + int res; | |
3377 | + | |
3378 | + /* This looks at memory. Mark it volatile to tell gcc not to move it around */ | |
3379 | + __asm__ __volatile__( | |
3380 | + "xorl %%eax,%%eax\n\t" | |
3381 | + "repe; scasl\n\t" | |
3382 | + "jz 1f\n\t" | |
3383 | + "leal -4(%%edi),%%edi\n\t" | |
3384 | + "bsfl (%%edi),%%eax\n" | |
3385 | + "1:\tsubl %%ebx,%%edi\n\t" | |
3386 | + "shll $3,%%edi\n\t" | |
3387 | + "addl %%edi,%%eax" | |
3388 | + :"=a" (res), "=&c" (d0), "=&D" (d1) | |
3389 | + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr)); | |
3390 | + return res; | |
3391 | +} | |
3392 | + | |
3393 | +/** | |
3394 | * find_next_zero_bit - find the first zero bit in a memory region | |
3395 | * @addr: The address to base the search on | |
3396 | * @offset: The bitnumber to start searching at | |
3397 | @@ -296,7 +332,7 @@ | |
3398 | ||
3399 | if (bit) { | |
3400 | /* | |
3401 | - * Look for zero in first byte | |
3402 | + * Look for zero in the first 32 bits. | |
3403 | */ | |
3404 | __asm__("bsfl %1,%0\n\t" | |
3405 | "jne 1f\n\t" | |
3406 | @@ -317,6 +353,39 @@ | |
3407 | } | |
3408 | ||
3409 | /** | |
3410 | + * find_next_bit - find the next set bit in a memory region | |
3411 | + * @addr: The address to base the search on | |
3412 | + * @offset: The bitnumber to start searching at | |
3413 | + * @size: The maximum size to search | |
3414 | + */ | |
3415 | +static __inline__ int find_next_bit (void * addr, int size, int offset) | |
3416 | +{ | |
3417 | + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); | |
3418 | + int set = 0, bit = offset & 31, res; | |
3419 | + | |
3420 | + if (bit) { | |
3421 | + /* | |
3422 | + * Look for nonzero in the first 32 bits: | |
3423 | + */ | |
3424 | + __asm__("bsfl %1,%0\n\t" | |
3425 | + "jne 1f\n\t" | |
3426 | + "movl $32, %0\n" | |
3427 | + "1:" | |
3428 | + : "=r" (set) | |
3429 | + : "r" (*p >> bit)); | |
3430 | + if (set < (32 - bit)) | |
3431 | + return set + offset; | |
3432 | + set = 32 - bit; | |
3433 | + p++; | |
3434 | + } | |
3435 | + /* | |
3436 | + * No set bit yet, search remaining full words for a bit | |
3437 | + */ | |
3438 | + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr)); | |
3439 | + return (offset + set + res); | |
3440 | +} | |
3441 | + | |
3442 | +/** | |
3443 | * ffz - find first zero in word. | |
3444 | * @word: The word to search | |
3445 | * | |
3446 | @@ -327,6 +396,20 @@ | |
3447 | __asm__("bsfl %1,%0" | |
3448 | :"=r" (word) | |
3449 | :"r" (~word)); | |
3450 | + return word; | |
3451 | +} | |
3452 | + | |
3453 | +/** | |
3454 | + * __ffs - find first set bit in word. | |
3455 | + * @word: The word to search | |
3456 | + * | |
3457 | + * Undefined if no bit is set, so code should check against 0 first. | |
3458 | + */ | |
3459 | +static __inline__ unsigned long __ffs(unsigned long word) | |
3460 | +{ | |
3461 | + __asm__("bsfl %1,%0" | |
3462 | + :"=r" (word) | |
3463 | + :"rm" (word)); | |
3464 | return word; | |
3465 | } | |
3466 | ||
3467 | --- linux/include/asm-i386/pgalloc.h.orig Sun Jan 6 13:55:57 2002 | |
3468 | +++ linux/include/asm-i386/pgalloc.h Mon Jan 28 18:48:00 2002 | |
3469 | @@ -224,6 +224,7 @@ | |
3470 | { | |
3471 | struct mm_struct *active_mm; | |
3472 | int state; | |
3473 | + char __cacheline_padding[24]; | |
3474 | }; | |
3475 | extern struct tlb_state cpu_tlbstate[NR_CPUS]; | |
3476 | ||
3477 | --- linux/include/asm-i386/mmu_context.h.orig Tue Aug 21 14:26:23 2001 | |
3478 | +++ linux/include/asm-i386/mmu_context.h Mon Jan 28 18:48:00 2002 | |
3479 | @@ -7,6 +7,31 @@ | |
3480 | #include <asm/pgalloc.h> | |
3481 | ||
3482 | /* | |
3483 | + * Every architecture must define this function. It's the fastest | |
3484 | + * way of searching a 168-bit bitmap where the first 128 bits are | |
3485 | + * unlikely to be set. It's guaranteed that at least one of the 168 | |
3486 | + * bits is set. | |
3487 | + */ | |
3488 | +#if MAX_RT_PRIO != 128 || MAX_PRIO != 168 | |
3489 | +# error update this function. | |
3490 | +#endif | |
3491 | + | |
3492 | +static inline int sched_find_first_bit(unsigned long *b) | |
3493 | +{ | |
3494 | + if (unlikely(b[0])) | |
3495 | + return __ffs(b[0]); | |
3496 | + if (unlikely(b[1])) | |
3497 | + return __ffs(b[1]) + 32; | |
3498 | + if (unlikely(b[2])) | |
3499 | + return __ffs(b[2]) + 64; | |
3500 | + if (unlikely(b[3])) | |
3501 | + return __ffs(b[3]) + 96; | |
3502 | + if (b[4]) | |
3503 | + return __ffs(b[4]) + 128; | |
3504 | + return __ffs(b[5]) + 32 + 128; | |
3505 | +} | |
3506 | + | |
3507 | +/* | |
3508 | * possibly do the LDT unload here? | |
3509 | */ | |
3510 | #define destroy_context(mm) do { } while(0) | |
3511 | @@ -27,13 +52,13 @@ | |
3512 | ||
3513 | static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) | |
3514 | { | |
3515 | - if (prev != next) { | |
3516 | + if (likely(prev != next)) { | |
3517 | /* stop flush ipis for the previous mm */ | |
3518 | clear_bit(cpu, &prev->cpu_vm_mask); | |
3519 | /* | |
3520 | * Re-load LDT if necessary | |
3521 | */ | |
3522 | - if (prev->context.segments != next->context.segments) | |
3523 | + if (unlikely(prev->context.segments != next->context.segments)) | |
3524 | load_LDT(next); | |
3525 | #ifdef CONFIG_SMP | |
3526 | cpu_tlbstate[cpu].state = TLBSTATE_OK; | |
3527 | --- linux/include/asm-i386/hw_irq.h.orig Wed Jan 16 21:44:01 2002 | |
3528 | +++ linux/include/asm-i386/hw_irq.h Mon Jan 28 18:48:00 2002 | |
3529 | @@ -41,8 +41,9 @@ | |
3530 | #define ERROR_APIC_VECTOR 0xfe | |
3531 | #define INVALIDATE_TLB_VECTOR 0xfd | |
3532 | #define RESCHEDULE_VECTOR 0xfc | |
3533 | -#define CALL_FUNCTION_VECTOR 0xfb | |
3534 | -#define KDB_VECTOR 0xfa | |
3535 | +#define TASK_MIGRATION_VECTOR 0xfb | |
3536 | +#define CALL_FUNCTION_VECTOR 0xfa | |
3537 | +#define KDB_VECTOR 0xf9 | |
3538 | ||
3539 | /* | |
3540 | * Local APIC timer IRQ vector is on a different priority level, | |
3541 | --- linux/include/asm-i386/apic.h.orig Mon Jan 28 18:05:10 2002 | |
3542 | +++ linux/include/asm-i386/apic.h Mon Jan 28 18:48:00 2002 | |
3543 | @@ -79,6 +79,8 @@ | |
3544 | extern void setup_apic_nmi_watchdog (void); | |
3545 | extern inline void nmi_watchdog_tick (struct pt_regs * regs); | |
3546 | extern int APIC_init_uniprocessor (void); | |
3547 | +extern void disable_APIC_timer(void); | |
3548 | +extern void enable_APIC_timer(void); | |
3549 | ||
3550 | extern struct pm_dev *apic_pm_register(pm_dev_t, unsigned long, pm_callback); | |
3551 | extern void apic_pm_unregister(struct pm_dev*); | |
3552 | --- linux/net/unix/af_unix.c.orig Sun Jan 6 13:55:58 2002 | |
3553 | +++ linux/net/unix/af_unix.c Sun Jan 6 13:56:25 2002 | |
3554 | @@ -564,10 +564,8 @@ | |
3555 | addr->hash)) { | |
3556 | write_unlock(&unix_table_lock); | |
3557 | /* Sanity yield. It is unusual case, but yet... */ | |
3558 | - if (!(ordernum&0xFF)) { | |
3559 | - current->policy |= SCHED_YIELD; | |
3560 | - schedule(); | |
3561 | - } | |
3562 | + if (!(ordernum&0xFF)) | |
3563 | + yield(); | |
3564 | goto retry; | |
3565 | } | |
3566 | addr->hash ^= sk->type; | |
3567 | --- linux/net/ipv4/tcp_output.c.orig Sun Jan 6 13:55:57 2002 | |
3568 | +++ linux/net/ipv4/tcp_output.c Sun Jan 6 13:56:25 2002 | |
3569 | @@ -1009,8 +1009,7 @@ | |
3570 | skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); | |
3571 | if (skb) | |
3572 | break; | |
3573 | - current->policy |= SCHED_YIELD; | |
3574 | - schedule(); | |
3575 | + yield(); | |
3576 | } | |
3577 | ||
3578 | /* Reserve space for headers and prepare control bits. */ | |
3579 | --- linux/net/sunrpc/sched.c.orig Sun Jan 6 13:55:52 2002 | |
3580 | +++ linux/net/sunrpc/sched.c Sun Jan 6 13:56:25 2002 | |
3581 | @@ -772,8 +772,7 @@ | |
3582 | } | |
3583 | if (flags & RPC_TASK_ASYNC) | |
3584 | return NULL; | |
3585 | - current->policy |= SCHED_YIELD; | |
3586 | - schedule(); | |
3587 | + yield(); | |
3588 | } while (!signalled()); | |
3589 | ||
3590 | return NULL; | |
3591 | @@ -1114,8 +1113,7 @@ | |
3592 | __rpc_schedule(); | |
3593 | if (all_tasks) { | |
3594 | dprintk("rpciod_killall: waiting for tasks to exit\n"); | |
3595 | - current->policy |= SCHED_YIELD; | |
3596 | - schedule(); | |
3597 | + yield(); | |
3598 | } | |
3599 | } | |
3600 | ||
3601 | @@ -1185,8 +1183,7 @@ | |
3602 | * wait briefly before checking the process id. | |
3603 | */ | |
3604 | current->sigpending = 0; | |
3605 | - current->policy |= SCHED_YIELD; | |
3606 | - schedule(); | |
3607 | + yield(); | |
3608 | /* | |
3609 | * Display a message if we're going to wait longer. | |
3610 | */ | |
3611 | --- linux/net/sched/sch_generic.c.orig Fri Aug 18 19:26:25 2000 | |
3612 | +++ linux/net/sched/sch_generic.c Sun Jan 6 13:56:25 2002 | |
3613 | @@ -475,10 +475,8 @@ | |
3614 | ||
3615 | dev_watchdog_down(dev); | |
3616 | ||
3617 | - while (test_bit(__LINK_STATE_SCHED, &dev->state)) { | |
3618 | - current->policy |= SCHED_YIELD; | |
3619 | - schedule(); | |
3620 | - } | |
3621 | + while (test_bit(__LINK_STATE_SCHED, &dev->state)) | |
3622 | + yield(); | |
3623 | ||
3624 | spin_unlock_wait(&dev->xmit_lock); | |
3625 | } | |
3626 | --- linux/net/socket.c.orig Sun Jan 6 13:55:58 2002 | |
3627 | +++ linux/net/socket.c Sun Jan 6 13:56:25 2002 | |
3628 | @@ -148,8 +148,7 @@ | |
3629 | while (atomic_read(&net_family_lockct) != 0) { | |
3630 | spin_unlock(&net_family_lock); | |
3631 | ||
3632 | - current->policy |= SCHED_YIELD; | |
3633 | - schedule(); | |
3634 | + yield(); | |
3635 | ||
3636 | spin_lock(&net_family_lock); | |
3637 | } | |
3638 | --- linux/drivers/net/slip.c.orig Sun Jan 6 13:55:48 2002 | |
3639 | +++ linux/drivers/net/slip.c Sun Jan 6 13:56:25 2002 | |
3640 | @@ -1393,10 +1393,8 @@ | |
3641 | /* First of all: check for active disciplines and hangup them. | |
3642 | */ | |
3643 | do { | |
3644 | - if (busy) { | |
3645 | - current->counter = 0; | |
3646 | - schedule(); | |
3647 | - } | |
3648 | + if (busy) | |
3649 | + sys_sched_yield(); | |
3650 | ||
3651 | busy = 0; | |
3652 | local_bh_disable(); | |
3653 | --- linux/drivers/block/loop.c.orig Sun Jan 6 13:55:56 2002 | |
3654 | +++ linux/drivers/block/loop.c Sun Jan 6 13:56:25 2002 | |
3655 | @@ -570,9 +570,6 @@ | |
3656 | flush_signals(current); | |
3657 | spin_unlock_irq(¤t->sigmask_lock); | |
3658 | ||
3659 | - current->policy = SCHED_OTHER; | |
3660 | - current->nice = -20; | |
3661 | - | |
3662 | spin_lock_irq(&lo->lo_lock); | |
3663 | lo->lo_state = Lo_bound; | |
3664 | atomic_inc(&lo->lo_pending); | |
3665 | --- linux/drivers/char/mwave/mwavedd.c.orig Sun Jan 13 16:27:41 2002 | |
3666 | +++ linux/drivers/char/mwave/mwavedd.c Sun Jan 13 16:28:05 2002 | |
3667 | @@ -279,7 +279,6 @@ | |
3668 | pDrvData->IPCs[ipcnum].bIsHere = FALSE; | |
3669 | pDrvData->IPCs[ipcnum].bIsEnabled = TRUE; | |
3670 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) | |
3671 | - current->nice = -20; /* boost to provide priority timing */ | |
3672 | #else | |
3673 | current->priority = 0x28; /* boost to provide priority timing */ | |
3674 | #endif | |
3675 | --- linux/drivers/ide/ataraid.c.orig Sun Jan 6 13:55:52 2002 | |
3676 | +++ linux/drivers/ide/ataraid.c Fri Jan 25 14:27:38 2002 | |
3677 | @@ -123,8 +123,7 @@ | |
3678 | ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO); | |
3679 | if (!ptr) { | |
3680 | __set_current_state(TASK_RUNNING); | |
3681 | - current->policy |= SCHED_YIELD; | |
3682 | - schedule(); | |
3683 | + yield(); | |
3684 | } | |
3685 | } | |
3686 | return ptr; | |
3687 | @@ -139,8 +138,7 @@ | |
3688 | ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO); | |
3689 | if (!ptr) { | |
3690 | __set_current_state(TASK_RUNNING); | |
3691 | - current->policy |= SCHED_YIELD; | |
3692 | - schedule(); | |
3693 | + yield(); | |
3694 | } | |
3695 | } | |
3696 | return ptr; | |
3697 | --- linux/drivers/md/md.c.orig Sun Jan 6 13:55:56 2002 | |
3698 | +++ linux/drivers/md/md.c Sun Jan 6 13:56:25 2002 | |
3699 | @@ -2930,8 +2930,6 @@ | |
3700 | * bdflush, otherwise bdflush will deadlock if there are too | |
3701 | * many dirty RAID5 blocks. | |
3702 | */ | |
3703 | - current->policy = SCHED_OTHER; | |
3704 | - current->nice = -20; | |
3705 | md_unlock_kernel(); | |
3706 | ||
3707 | complete(thread->event); | |
3708 | @@ -3381,11 +3379,6 @@ | |
3709 | "(but not more than %d KB/sec) for reconstruction.\n", | |
3710 | sysctl_speed_limit_max); | |
3711 | ||
3712 | - /* | |
3713 | - * Resync has low priority. | |
3714 | - */ | |
3715 | - current->nice = 19; | |
3716 | - | |
3717 | is_mddev_idle(mddev); /* this also initializes IO event counters */ | |
3718 | for (m = 0; m < SYNC_MARKS; m++) { | |
3719 | mark[m] = jiffies; | |
3720 | @@ -3463,16 +3456,13 @@ | |
3721 | currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; | |
3722 | ||
3723 | if (currspeed > sysctl_speed_limit_min) { | |
3724 | - current->nice = 19; | |
3725 | - | |
3726 | if ((currspeed > sysctl_speed_limit_max) || | |
3727 | !is_mddev_idle(mddev)) { | |
3728 | current->state = TASK_INTERRUPTIBLE; | |
3729 | md_schedule_timeout(HZ/4); | |
3730 | goto repeat; | |
3731 | } | |
3732 | - } else | |
3733 | - current->nice = -20; | |
3734 | + } | |
3735 | } | |
3736 | printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); | |
3737 | err = 0; | |
3738 | --- linux/arch/i386/mm/fault.c.orig Sun Jan 6 13:55:47 2002 | |
3739 | +++ linux/arch/i386/mm/fault.c Sun Jan 6 13:56:25 2002 | |
3740 | @@ -88,8 +88,7 @@ | |
3741 | ||
3742 | out_of_memory: | |
3743 | if (current->pid == 1) { | |
3744 | - current->policy |= SCHED_YIELD; | |
3745 | - schedule(); | |
3746 | + yield(); | |
3747 | goto survive; | |
3748 | } | |
3749 | goto bad_area; | |
3750 | @@ -344,8 +343,7 @@ | |
3751 | out_of_memory: | |
3752 | up_read(&mm->mmap_sem); | |
3753 | if (tsk->pid == 1) { | |
3754 | - tsk->policy |= SCHED_YIELD; | |
3755 | - schedule(); | |
3756 | + yield(); | |
3757 | down_read(&mm->mmap_sem); | |
3758 | goto survive; | |
3759 | } | |
3760 | --- linux/arch/i386/kernel/smpboot.c.orig Sun Jan 6 13:55:56 2002 | |
3761 | +++ linux/arch/i386/kernel/smpboot.c Mon Jan 28 18:12:21 2002 | |
3762 | @@ -308,14 +308,14 @@ | |
3763 | if (tsc_values[i] < avg) | |
3764 | realdelta = -realdelta; | |
3765 | ||
3766 | - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", | |
3767 | - i, realdelta); | |
3768 | + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta); | |
3769 | } | |
3770 | ||
3771 | sum += delta; | |
3772 | } | |
3773 | if (!buggy) | |
3774 | printk("passed.\n"); | |
3775 | + ; | |
3776 | } | |
3777 | ||
3778 | static void __init synchronize_tsc_ap (void) | |
3779 | @@ -365,7 +365,7 @@ | |
3780 | * (This works even if the APIC is not enabled.) | |
3781 | */ | |
3782 | phys_id = GET_APIC_ID(apic_read(APIC_ID)); | |
3783 | - cpuid = current->processor; | |
3784 | + cpuid = cpu(); | |
3785 | if (test_and_set_bit(cpuid, &cpu_online_map)) { | |
3786 | printk("huh, phys CPU#%d, CPU#%d already present??\n", | |
3787 | phys_id, cpuid); | |
3788 | @@ -435,6 +435,7 @@ | |
3789 | */ | |
3790 | smp_store_cpu_info(cpuid); | |
3791 | ||
3792 | + disable_APIC_timer(); | |
3793 | /* | |
3794 | * Allow the master to continue. | |
3795 | */ | |
3796 | @@ -465,6 +466,7 @@ | |
3797 | smp_callin(); | |
3798 | while (!atomic_read(&smp_commenced)) | |
3799 | rep_nop(); | |
3800 | + enable_APIC_timer(); | |
3801 | /* | |
3802 | * low-memory mappings have been cleared, flush them from | |
3803 | * the local TLBs too. | |
3804 | @@ -803,16 +805,13 @@ | |
3805 | if (!idle) | |
3806 | panic("No idle process for CPU %d", cpu); | |
3807 | ||
3808 | - idle->processor = cpu; | |
3809 | - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ | |
3810 | + init_idle(idle, cpu); | |
3811 | ||
3812 | map_cpu_to_boot_apicid(cpu, apicid); | |
3813 | ||
3814 | idle->thread.eip = (unsigned long) start_secondary; | |
3815 | ||
3816 | - del_from_runqueue(idle); | |
3817 | unhash_process(idle); | |
3818 | - init_tasks[cpu] = idle; | |
3819 | ||
3820 | /* start_eip had better be page-aligned! */ | |
3821 | start_eip = setup_trampoline(); | |
3822 | @@ -925,6 +924,7 @@ | |
3823 | } | |
3824 | ||
3825 | cycles_t cacheflush_time; | |
3826 | +unsigned long cache_decay_ticks; | |
3827 | ||
3828 | static void smp_tune_scheduling (void) | |
3829 | { | |
3830 | @@ -958,9 +958,13 @@ | |
3831 | cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; | |
3832 | } | |
3833 | ||
3834 | + cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; | |
3835 | + | |
3836 | printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", | |
3837 | (long)cacheflush_time/(cpu_khz/1000), | |
3838 | ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); | |
3839 | + printk("task migration cache decay timeout: %ld msecs.\n", | |
3840 | + (cache_decay_ticks + 1) * 1000 / HZ); | |
3841 | } | |
3842 | ||
3843 | /* | |
3844 | @@ -1020,8 +1024,7 @@ | |
3845 | map_cpu_to_boot_apicid(0, boot_cpu_apicid); | |
3846 | ||
3847 | global_irq_holder = 0; | |
3848 | - current->processor = 0; | |
3849 | - init_idle(); | |
3850 | + current->cpu = 0; | |
3851 | smp_tune_scheduling(); | |
3852 | ||
3853 | /* | |
3854 | --- linux/arch/i386/kernel/process.c.orig Mon Jan 28 18:09:58 2002 | |
3855 | +++ linux/arch/i386/kernel/process.c Mon Jan 28 18:09:53 2002 | |
3856 | @@ -123,15 +123,12 @@ | |
3857 | void cpu_idle (void) | |
3858 | { | |
3859 | /* endless idle loop with no priority at all */ | |
3860 | - init_idle(); | |
3861 | - current->nice = 20; | |
3862 | - current->counter = -100; | |
3863 | ||
3864 | while (1) { | |
3865 | void (*idle)(void) = pm_idle; | |
3866 | if (!idle) | |
3867 | idle = default_idle; | |
3868 | - while (!current->need_resched) | |
3869 | + if (!current->need_resched) | |
3870 | idle(); | |
3871 | schedule(); | |
3872 | check_pgt_cache(); | |
3873 | @@ -694,15 +691,17 @@ | |
3874 | asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); | |
3875 | ||
3876 | /* | |
3877 | - * Restore %fs and %gs. | |
3878 | + * Restore %fs and %gs if needed. | |
3879 | */ | |
3880 | - loadsegment(fs, next->fs); | |
3881 | - loadsegment(gs, next->gs); | |
3882 | + if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) { | |
3883 | + loadsegment(fs, next->fs); | |
3884 | + loadsegment(gs, next->gs); | |
3885 | + } | |
3886 | ||
3887 | /* | |
3888 | * Now maybe reload the debug registers | |
3889 | */ | |
3890 | - if (next->debugreg[7]){ | |
3891 | + if (unlikely(next->debugreg[7])) { | |
3892 | loaddebug(next, 0); | |
3893 | loaddebug(next, 1); | |
3894 | loaddebug(next, 2); | |
3895 | @@ -712,7 +711,7 @@ | |
3896 | loaddebug(next, 7); | |
3897 | } | |
3898 | ||
3899 | - if (prev->ioperm || next->ioperm) { | |
3900 | + if (unlikely(prev->ioperm || next->ioperm)) { | |
3901 | if (next->ioperm) { | |
3902 | /* | |
3903 | * 4 cachelines copy ... not good, but not that | |
3904 | --- linux/arch/i386/kernel/apic.c.orig Sun Jan 6 13:55:54 2002 | |
3905 | +++ linux/arch/i386/kernel/apic.c Mon Jan 28 18:07:11 2002 | |
3906 | @@ -785,8 +785,7 @@ | |
3907 | */ | |
3908 | ||
3909 | slice = clocks / (smp_num_cpus+1); | |
3910 | - printk("cpu: %d, clocks: %d, slice: %d\n", | |
3911 | - smp_processor_id(), clocks, slice); | |
3912 | + printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice); | |
3913 | ||
3914 | /* | |
3915 | * Wait for IRQ0's slice: | |
3916 | @@ -809,8 +808,7 @@ | |
3917 | ||
3918 | __setup_APIC_LVTT(clocks); | |
3919 | ||
3920 | - printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", | |
3921 | - smp_processor_id(), t0, t1, delta, slice, clocks); | |
3922 | + printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", smp_processor_id(), t0, t1, delta, slice, clocks); | |
3923 | ||
3924 | __restore_flags(flags); | |
3925 | } | |
3926 | @@ -911,6 +909,26 @@ | |
3927 | ||
3928 | /* and update all other cpus */ | |
3929 | smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1); | |
3930 | +} | |
3931 | + | |
3932 | +void __init disable_APIC_timer(void) | |
3933 | +{ | |
3934 | + if (using_apic_timer) { | |
3935 | + unsigned long v; | |
3936 | + | |
3937 | + v = apic_read(APIC_LVTT); | |
3938 | + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); | |
3939 | + } | |
3940 | +} | |
3941 | + | |
3942 | +void enable_APIC_timer(void) | |
3943 | +{ | |
3944 | + if (using_apic_timer) { | |
3945 | + unsigned long v; | |
3946 | + | |
3947 | + v = apic_read(APIC_LVTT); | |
3948 | + apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); | |
3949 | + } | |
3950 | } | |
3951 | ||
3952 | /* | |
3953 | --- linux/arch/i386/kernel/nmi.c.orig Sun Jan 6 13:55:43 2002 | |
3954 | +++ linux/arch/i386/kernel/nmi.c Sun Jan 6 13:56:25 2002 | |
3955 | @@ -283,7 +283,7 @@ | |
3956 | * to get a message out. | |
3957 | */ | |
3958 | bust_spinlocks(1); | |
3959 | - printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu); | |
3960 | + printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); | |
3961 | show_registers(regs); | |
3962 | printk("console shuts up ...\n"); | |
3963 | console_silent(); | |
3964 | --- linux/arch/i386/kernel/smp.c.orig Sun Jan 6 13:55:56 2002 | |
3965 | +++ linux/arch/i386/kernel/smp.c Wed Jan 16 21:42:45 2002 | |
3966 | @@ -105,7 +105,7 @@ | |
3967 | /* The 'big kernel lock' */ | |
3968 | spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; | |
3969 | ||
3970 | -struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }}; | |
3971 | +struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }}; | |
3972 | ||
3973 | /* | |
3974 | * the following functions deal with sending IPIs between CPUs. | |
3975 | @@ -485,15 +485,54 @@ | |
3976 | do_flush_tlb_all_local(); | |
3977 | } | |
3978 | ||
3979 | +static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED; | |
3980 | +static task_t *new_task; | |
3981 | + | |
3982 | +/* | |
3983 | + * This function sends a 'task migration' IPI to another CPU. | |
3984 | + * Must be called from syscall contexts, with interrupts *enabled*. | |
3985 | + */ | |
3986 | +void smp_migrate_task(int cpu, task_t *p) | |
3987 | +{ | |
3988 | + /* | |
3989 | + * The target CPU will unlock the migration spinlock: | |
3990 | + */ | |
3991 | + spin_lock(&migration_lock); | |
3992 | + new_task = p; | |
3993 | + send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR); | |
3994 | +} | |
3995 | + | |
3996 | +/* | |
3997 | + * Task migration callback. | |
3998 | + */ | |
3999 | +asmlinkage void smp_task_migration_interrupt(void) | |
4000 | +{ | |
4001 | + task_t *p; | |
4002 | + | |
4003 | + ack_APIC_irq(); | |
4004 | + p = new_task; | |
4005 | + spin_unlock(&migration_lock); | |
4006 | + sched_task_migrated(p); | |
4007 | +} | |
4008 | /* | |
4009 | * this function sends a 'reschedule' IPI to another CPU. | |
4010 | * it goes straight through and wastes no time serializing | |
4011 | * anything. Worst case is that we lose a reschedule ... | |
4012 | */ | |
4013 | - | |
4014 | void smp_send_reschedule(int cpu) | |
4015 | { | |
4016 | send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR); | |
4017 | +} | |
4018 | + | |
4019 | +/* | |
4020 | + * this function sends a reschedule IPI to all (other) CPUs. | |
4021 | + * This should only be used if some 'global' task became runnable, | |
4022 | + * such as a RT task, that must be handled now. The first CPU | |
4023 | + * that manages to grab the task will run it. | |
4024 | + */ | |
4025 | +void smp_send_reschedule_all(void) | |
4026 | +{ | |
4027 | + send_IPI_allbutself(RESCHEDULE_VECTOR); | |
4028 | } | |
4029 | ||
4030 | /* | |
4031 | --- linux/arch/i386/kernel/i8259.c.orig Wed Jan 16 21:43:09 2002 | |
4032 | +++ linux/arch/i386/kernel/i8259.c Fri Jan 18 15:36:35 2002 | |
4033 | @@ -79,6 +79,7 @@ | |
4034 | * through the ICC by us (IPIs) | |
4035 | */ | |
4036 | #ifdef CONFIG_SMP | |
4037 | +BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR) | |
4038 | BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) | |
4039 | BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) | |
4040 | BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) | |
4041 | @@ -472,6 +473,9 @@ | |
4042 | * IPI, driven by wakeup. | |
4043 | */ | |
4044 | set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | |
4045 | + | |
4046 | + /* IPI for task migration */ | |
4047 | + set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt); | |
4048 | ||
4049 | /* IPI for invalidation */ | |
4050 | set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); | |
4051 | --- linux/arch/i386/kernel/entry.S.orig Fri Jan 25 14:30:36 2002 | |
4052 | +++ linux/arch/i386/kernel/entry.S Fri Jan 25 14:30:50 2002 | |
4053 | @@ -77,7 +77,7 @@ | |
4054 | exec_domain = 16 | |
4055 | need_resched = 20 | |
4056 | tsk_ptrace = 24 | |
4057 | -processor = 52 | |
4058 | +cpu = 32 | |
4059 | ||
4060 | ENOSYS = 38 | |
4061 | ||
4062 | --- linux/arch/i386/kernel/setup.c.orig Mon Jan 28 18:10:23 2002 | |
4063 | +++ linux/arch/i386/kernel/setup.c Mon Jan 28 18:10:48 2002 | |
4064 | @@ -2922,9 +2922,10 @@ | |
4065 | load_TR(nr); | |
4066 | load_LDT(&init_mm); | |
4067 | ||
4068 | - /* | |
4069 | - * Clear all 6 debug registers: | |
4070 | - */ | |
4071 | + /* Clear %fs and %gs. */ | |
4072 | + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); | |
4073 | + | |
4074 | + /* Clear all 6 debug registers: */ | |
4075 | ||
4076 | #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); | |
4077 |