Commit | Line | Data |
---|---|---|
109f8d43 | 1 | --- linux/fs/proc/proc_misc.c.orig Tue Feb 5 13:51:49 2002 |
2 | +++ linux/fs/proc/proc_misc.c Tue Feb 5 13:52:12 2002 | |
3 | @@ -85,11 +85,11 @@ | |
4 | a = avenrun[0] + (FIXED_1/200); | |
5 | b = avenrun[1] + (FIXED_1/200); | |
6 | c = avenrun[2] + (FIXED_1/200); | |
7 | - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n", | |
8 | + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", | |
9 | LOAD_INT(a), LOAD_FRAC(a), | |
10 | LOAD_INT(b), LOAD_FRAC(b), | |
11 | LOAD_INT(c), LOAD_FRAC(c), | |
12 | - nr_running, nr_threads, last_pid); | |
13 | + nr_running(), nr_threads, last_pid); | |
14 | return proc_calc_metrics(page, start, off, count, eof, len); | |
15 | } | |
16 | ||
17 | @@ -101,7 +101,7 @@ | |
18 | int len; | |
19 | ||
20 | uptime = jiffies; | |
21 | - idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime; | |
22 | + idle = init_task.times.tms_utime + init_task.times.tms_stime; | |
23 | ||
24 | /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but | |
25 | that would overflow about every five days at HZ == 100. | |
26 | @@ -303,10 +303,10 @@ | |
27 | } | |
28 | ||
29 | len += sprintf(page + len, | |
30 | - "\nctxt %u\n" | |
31 | + "\nctxt %lu\n" | |
32 | "btime %lu\n" | |
33 | "processes %lu\n", | |
34 | - kstat.context_swtch, | |
35 | + nr_context_switches(), | |
36 | xtime.tv_sec - jif / HZ, | |
37 | total_forks); | |
38 | ||
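The format-string tweaks above follow from a type change: with the new scheduler there is no single global run-queue counter any more, so nr_running and the context-switch count become functions that sum the per-CPU runqueues. The helpers are added in the kernel/sched.c part of this patch (quoted further down); for reference, nr_running() is essentially:

    unsigned long nr_running(void)
    {
            unsigned long i, sum = 0;

            /* sum the per-CPU runqueue counters instead of reading one global */
            for (i = 0; i < smp_num_cpus; i++)
                    sum += cpu_rq(cpu_logical_map(i))->nr_running;

            return sum;
    }

nr_context_switches() does the same over rq->nr_switches, which is why the ctxt line in /proc/stat moves to %lu.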
39 | --- linux/fs/proc/array.c.orig Tue Feb 5 13:51:45 2002 | |
40 | +++ linux/fs/proc/array.c Tue Feb 5 13:52:12 2002 | |
41 | @@ -335,9 +335,8 @@ | |
42 | ||
43 | /* scale priority and nice values from timeslices to -20..20 */ | |
44 | /* to make it look like a "normal" Unix priority/nice value */ | |
45 | - priority = task->counter; | |
46 | - priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER; | |
47 | - nice = task->nice; | |
48 | + priority = task_prio(task); | |
49 | + nice = task_nice(task); | |
50 | ||
51 | read_lock(&tasklist_lock); | |
52 | ppid = task->pid ? task->p_opptr->pid : 0; | |
53 | @@ -387,7 +386,7 @@ | |
54 | task->nswap, | |
55 | task->cnswap, | |
56 | task->exit_signal, | |
57 | - task->processor); | |
58 | + task->cpu); | |
59 | if(mm) | |
60 | mmput(mm); | |
61 | return res; | |
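/proc/<pid>/stat no longer reverse-engineers priority and nice from ->counter; it asks the scheduler through task_prio() and task_nice(). Their definitions live in the kernel/sched.c half of the patch, outside the hunks quoted here; given the TASK_NICE()/PRIO_TO_NICE() macros introduced there, task_nice() is presumably no more than a wrapper along these lines, and task_prio() similarly reports the task's dynamic ->prio in the old units:

    /* hedged sketch - the real helpers are defined in kernel/sched.c */
    int task_nice(task_t *p)
    {
            return TASK_NICE(p);    /* p->static_prio mapped back to -20 .. 19 */
    }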
62 | --- linux/fs/nfs/pagelist.c.orig Tue Feb 5 13:51:50 2002 | |
63 | +++ linux/fs/nfs/pagelist.c Tue Feb 5 13:52:12 2002 | |
64 | @@ -96,8 +96,7 @@ | |
65 | continue; | |
66 | if (signalled() && (server->flags & NFS_MOUNT_INTR)) | |
67 | return ERR_PTR(-ERESTARTSYS); | |
68 | - current->policy = SCHED_YIELD; | |
69 | - schedule(); | |
70 | + yield(); | |
71 | } | |
72 | ||
73 | /* Initialize the request struct. Initially, we assume a | |
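This is the pattern repeated throughout the patch: the old idiom of setting the SCHED_YIELD policy bit and calling schedule() disappears (the bit itself is gone from the new scheduler), and callers switch to the yield() helper. yield()'s body is not shown in this excerpt, but the fs/buffer.c hunk below open-codes what it presumably amounts to:

    /* minimal sketch of the new helper, assuming it wraps sys_sched_yield() */
    void yield(void)
    {
            set_current_state(TASK_RUNNING);
            sys_sched_yield();      /* requeue behind other runnable tasks */
    }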
74 | --- linux/fs/ufs/truncate.c.orig Tue Feb 5 13:51:53 2002 | |
75 | +++ linux/fs/ufs/truncate.c Tue Feb 5 13:52:12 2002 | |
76 | @@ -448,10 +448,7 @@ | |
77 | if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) | |
78 | ufs_sync_inode (inode); | |
79 | run_task_queue(&tq_disk); | |
80 | - current->policy |= SCHED_YIELD; | |
81 | - schedule (); | |
82 | - | |
83 | - | |
84 | + yield(); | |
85 | } | |
86 | offset = inode->i_size & uspi->s_fshift; | |
87 | if (offset) { | |
88 | --- linux/fs/reiserfs/buffer2.c.orig Tue Feb 5 13:51:51 2002 | |
89 | +++ linux/fs/reiserfs/buffer2.c Tue Feb 5 13:52:12 2002 | |
90 | @@ -33,8 +33,7 @@ | |
91 | buffer_journal_dirty(bh) ? ' ' : '!'); | |
92 | } | |
93 | run_task_queue(&tq_disk); | |
94 | - current->policy |= SCHED_YIELD; | |
95 | - schedule(); | |
96 | + yield(); | |
97 | } | |
98 | if (repeat_counter > 30000000) { | |
99 | reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ; | |
100 | @@ -52,11 +51,11 @@ | |
101 | struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size) | |
102 | { | |
103 | struct buffer_head *result; | |
104 | - PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); | |
105 | + PROC_EXP( unsigned int ctx_switches = nr_context_switches(); ); | |
106 | ||
107 | result = bread (super -> s_dev, n_block, n_size); | |
108 | PROC_INFO_INC( super, breads ); | |
109 | - PROC_EXP( if( kstat.context_swtch != ctx_switches ) | |
110 | + PROC_EXP( if( nr_context_switches() != ctx_switches ) | |
111 | PROC_INFO_INC( super, bread_miss ) ); | |
112 | return result; | |
113 | } | |
114 | --- linux/fs/reiserfs/journal.c.orig Tue Feb 5 13:51:53 2002 | |
115 | +++ linux/fs/reiserfs/journal.c Tue Feb 5 13:52:12 2002 | |
116 | @@ -149,8 +149,7 @@ | |
117 | } | |
118 | bn = allocate_bitmap_node(p_s_sb) ; | |
119 | if (!bn) { | |
120 | - current->policy |= SCHED_YIELD ; | |
121 | - schedule() ; | |
122 | + yield(); | |
123 | goto repeat ; | |
124 | } | |
125 | return bn ; | |
126 | --- linux/fs/jffs2/background.c.orig Tue Feb 5 13:51:47 2002 | |
127 | +++ linux/fs/jffs2/background.c Tue Feb 5 13:52:12 2002 | |
128 | @@ -106,9 +106,6 @@ | |
129 | ||
130 | sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index); | |
131 | ||
132 | - /* FIXME in the 2.2 backport */ | |
133 | - current->nice = 10; | |
134 | - | |
135 | for (;;) { | |
136 | spin_lock_irq(&current->sigmask_lock); |
137 | siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT)); |
138 | --- linux/fs/jbd/journal.c.orig Tue Feb 5 13:51:53 2002 | |
139 | +++ linux/fs/jbd/journal.c Tue Feb 5 13:52:12 2002 | |
140 | @@ -460,8 +460,7 @@ | |
141 | printk (KERN_NOTICE __FUNCTION__ | |
142 | ": ENOMEM at get_unused_buffer_head, " | |
143 | "trying again.\n"); | |
144 | - current->policy |= SCHED_YIELD; | |
145 | - schedule(); | |
146 | + yield(); | |
147 | } | |
148 | } while (!new_bh); | |
149 | /* keep subsequent assertions sane */ | |
150 | @@ -1541,8 +1540,7 @@ | |
151 | last_warning = jiffies; | |
152 | } | |
153 | ||
154 | - current->policy |= SCHED_YIELD; | |
155 | - schedule(); | |
156 | + yield(); | |
157 | } | |
158 | } | |
159 | ||
160 | @@ -1600,8 +1598,7 @@ | |
161 | last_warning = jiffies; | |
162 | } | |
163 | while (ret == 0) { | |
164 | - current->policy |= SCHED_YIELD; | |
165 | - schedule(); | |
166 | + yield(); | |
167 | ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); | |
168 | } | |
169 | } | |
170 | --- linux/fs/jbd/revoke.c.orig Tue Feb 5 13:51:53 2002 | |
171 | +++ linux/fs/jbd/revoke.c Tue Feb 5 13:52:12 2002 | |
172 | @@ -137,8 +137,7 @@ | |
173 | if (!journal_oom_retry) | |
174 | return -ENOMEM; | |
175 | jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n"); | |
176 | - current->policy |= SCHED_YIELD; | |
177 | - schedule(); | |
178 | + yield(); | |
179 | goto repeat; | |
180 | } | |
181 | ||
182 | --- linux/fs/jbd/transaction.c.orig Tue Feb 5 13:51:53 2002 | |
183 | +++ linux/fs/jbd/transaction.c Tue Feb 5 13:52:12 2002 | |
184 | @@ -1379,8 +1379,7 @@ | |
185 | do { | |
186 | old_handle_count = transaction->t_handle_count; | |
187 | set_current_state(TASK_RUNNING); | |
188 | - current->policy |= SCHED_YIELD; | |
189 | - schedule(); | |
190 | + yield(); | |
191 | } while (old_handle_count != transaction->t_handle_count); | |
192 | } | |
193 | ||
194 | --- linux/fs/binfmt_elf.c.orig Tue Feb 5 13:51:53 2002 | |
195 | +++ linux/fs/binfmt_elf.c Tue Feb 5 13:52:12 2002 | |
196 | @@ -1135,7 +1135,7 @@ | |
197 | psinfo.pr_state = i; | |
198 | psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i]; | |
199 | psinfo.pr_zomb = psinfo.pr_sname == 'Z'; | |
200 | - psinfo.pr_nice = current->nice; | |
201 | + psinfo.pr_nice = task_nice(current); | |
202 | psinfo.pr_flag = current->flags; | |
203 | psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); | |
204 | psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); | |
205 | --- linux/fs/buffer.c.orig Tue Feb 5 13:51:53 2002 | |
206 | +++ linux/fs/buffer.c Tue Feb 5 13:52:12 2002 | |
207 | @@ -735,9 +735,8 @@ | |
208 | wakeup_bdflush(); | |
209 | try_to_free_pages(zone, GFP_NOFS, 0); | |
210 | run_task_queue(&tq_disk); | |
211 | - current->policy |= SCHED_YIELD; | |
212 | __set_current_state(TASK_RUNNING); | |
213 | - schedule(); | |
214 | + sys_sched_yield(); | |
215 | } | |
216 | ||
217 | void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) | |
218 | --- linux/fs/locks.c.orig Tue Feb 5 13:51:45 2002 | |
219 | +++ linux/fs/locks.c Tue Feb 5 13:52:12 2002 | |
220 | @@ -445,8 +445,7 @@ | |
221 | /* Let the blocked process remove waiter from the | |
222 | * block list when it gets scheduled. | |
223 | */ | |
224 | - current->policy |= SCHED_YIELD; | |
225 | - schedule(); | |
226 | + yield(); | |
227 | } else { | |
228 | /* Remove waiter from the block list, because by the | |
229 | * time it wakes up blocker won't exist any more. | |
230 | --- linux/init/main.c.orig Tue Feb 5 13:51:53 2002 | |
231 | +++ linux/init/main.c Tue Feb 5 13:52:12 2002 | |
232 | @@ -485,8 +485,6 @@ | |
233 | extern void setup_arch(char **); | |
234 | extern void cpu_idle(void); | |
235 | ||
236 | -unsigned long wait_init_idle; | |
237 | - | |
238 | #ifndef CONFIG_SMP | |
239 | ||
240 | #ifdef CONFIG_X86_LOCAL_APIC | |
241 | @@ -495,34 +493,24 @@ | |
242 | APIC_init_uniprocessor(); | |
243 | } | |
244 | #else | |
245 | -#define smp_init() do { } while (0) | |
246 | +#define smp_init() do { } while (0) | |
247 | #endif | |
248 | ||
249 | #else | |
250 | ||
251 | - | |
252 | /* Called by boot processor to activate the rest. */ | |
253 | static void __init smp_init(void) | |
254 | { | |
255 | /* Get other processors into their bootup holding patterns. */ | |
256 | smp_boot_cpus(); | |
257 | - wait_init_idle = cpu_online_map; | |
258 | - clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */ | |
259 | ||
260 | smp_threads_ready=1; | |
261 | smp_commence(); | |
262 | - | |
263 | - /* Wait for the other cpus to set up their idle processes */ | |
264 | - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle); | |
265 | - while (wait_init_idle) { | |
266 | - cpu_relax(); | |
267 | - barrier(); | |
268 | - } | |
269 | - printk("All processors have done init_idle\n"); | |
270 | } | |
271 | ||
272 | #endif | |
273 | ||
274 | + | |
275 | /* | |
276 | * We need to finalize in a non-__init function or else race conditions | |
277 | * between the root thread and the init thread may cause start_kernel to | |
278 | @@ -534,9 +522,8 @@ | |
279 | { | |
280 | kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); | |
281 | unlock_kernel(); | |
282 | - current->need_resched = 1; | |
283 | - cpu_idle(); | |
284 | -} | |
285 | + cpu_idle(); | |
286 | +} | |
287 | ||
288 | /* | |
289 | * Activate the first processor. | |
290 | @@ -617,14 +604,18 @@ | |
291 | ipc_init(); | |
292 | #endif | |
293 | check_bugs(); | |
294 | + | |
295 | printk("POSIX conformance testing by UNIFIX\n"); | |
296 | ||
297 | - /* | |
298 | - * We count on the initial thread going ok | |
299 | - * Like idlers init is an unlocked kernel thread, which will | |
300 | - * make syscalls (and thus be locked). | |
301 | + init_idle(current, smp_processor_id()); | |
302 | + /* | |
303 | + * We count on the initial thread going ok | |
304 | + * Like idlers init is an unlocked kernel thread, which will | |
305 | + * make syscalls (and thus be locked). | |
306 | */ | |
307 | smp_init(); | |
308 | + | |
309 | + /* Do the rest non-__init'ed, we're now alive */ | |
310 | rest_init(); | |
311 | } | |
312 | ||
313 | @@ -785,12 +776,9 @@ | |
314 | int i, pid; | |
315 | ||
316 | pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD); | |
317 | - if (pid > 0) { | |
318 | - while (pid != wait(&i)) { | |
319 | - current->policy |= SCHED_YIELD; | |
320 | - schedule(); | |
321 | - } | |
322 | - } | |
323 | + if (pid > 0) | |
324 | + while (pid != wait(&i)) | |
325 | + yield(); | |
326 | if (MAJOR(real_root_dev) != RAMDISK_MAJOR | |
327 | || MINOR(real_root_dev) != 0) { | |
328 | error = change_root(real_root_dev,"/initrd"); | |
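The boot-time handshake changes shape in the hunks above: instead of every CPU clearing its bit in wait_init_idle while the boot CPU spins until the map empties, the boot CPU simply registers itself as its runqueue's idle thread via init_idle(current, smp_processor_id()) before smp_init(). init_idle() itself is added further down in sched.c, beyond this excerpt; a rough, hypothetical sketch of what it has to do:

    /* hypothetical sketch - see the full sched.c diff for the real init_idle() */
    void init_idle(task_t *idle, int cpu)
    {
            runqueue_t *rq = cpu_rq(cpu);

            rq->curr = rq->idle = idle;     /* this thread becomes the CPU's idle thread */
            idle->prio = MAX_PRIO;          /* below every other task */
            idle->array = NULL;             /* idle threads never sit on a priority array */
            idle->cpu = cpu;
            idle->need_resched = 1;
    }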
329 | --- linux/kernel/sched.c.orig Tue Feb 5 13:51:51 2002 | |
330 | +++ linux/kernel/sched.c Tue Feb 5 13:52:12 2002 | |
331 | @@ -12,333 +12,306 @@ | |
332 | * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar | |
333 | */ | |
334 | ||
335 | -/* | |
336 | - * 'sched.c' is the main kernel file. It contains scheduling primitives | |
337 | - * (sleep_on, wakeup, schedule etc) as well as a number of simple system | |
338 | - * call functions (type getpid()), which just extract a field from | |
339 | - * current-task | |
340 | - */ | |
341 | - | |
342 | -#include <linux/config.h> | |
343 | #include <linux/mm.h> | |
344 | +#include <linux/nmi.h> | |
345 | #include <linux/init.h> | |
346 | +#include <asm/uaccess.h> | |
347 | #include <linux/smp_lock.h> | |
348 | -#include <linux/nmi.h> | |
349 | #include <linux/interrupt.h> | |
350 | -#include <linux/kernel_stat.h> | |
351 | -#include <linux/completion.h> | |
352 | -#include <linux/prefetch.h> | |
353 | -#include <linux/compiler.h> | |
354 | - | |
355 | -#include <asm/uaccess.h> | |
356 | #include <asm/mmu_context.h> | |
357 | - | |
358 | -extern void timer_bh(void); | |
359 | -extern void tqueue_bh(void); | |
360 | -extern void immediate_bh(void); | |
361 | +#include <linux/kernel_stat.h> | |
362 | ||
363 | /* | |
364 | - * scheduler variables | |
365 | + * Priority of a process goes from 0 to 139. The 0-99 | |
366 | + * priority range is allocated to RT tasks, the 100-139 | |
367 | + * range is for SCHED_OTHER tasks. Priority values are | |
368 | + * inverted: lower p->prio value means higher priority. | |
369 | */ | |
370 | - | |
371 | -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ | |
372 | - | |
373 | -extern void mem_use(void); | |
374 | +#define MAX_RT_PRIO 100 | |
375 | +#define MAX_PRIO (MAX_RT_PRIO + 40) | |
376 | ||
377 | /* | |
378 | - * Scheduling quanta. | |
379 | - * | |
380 | - * NOTE! The unix "nice" value influences how long a process | |
381 | - * gets. The nice value ranges from -20 to +19, where a -20 | |
382 | - * is a "high-priority" task, and a "+10" is a low-priority | |
383 | - * task. | |
384 | - * | |
385 | - * We want the time-slice to be around 50ms or so, so this | |
386 | - * calculation depends on the value of HZ. | |
387 | + * Convert user-nice values [ -20 ... 0 ... 19 ] | |
388 | + * to static priority [ 100 ... 139 (MAX_PRIO-1) ], | |
389 | + * and back. | |
390 | */ | |
391 | -#if HZ < 200 | |
392 | -#define TICK_SCALE(x) ((x) >> 2) | |
393 | -#elif HZ < 400 | |
394 | -#define TICK_SCALE(x) ((x) >> 1) | |
395 | -#elif HZ < 800 | |
396 | -#define TICK_SCALE(x) (x) | |
397 | -#elif HZ < 1600 | |
398 | -#define TICK_SCALE(x) ((x) << 1) | |
399 | -#else | |
400 | -#define TICK_SCALE(x) ((x) << 2) | |
401 | -#endif | |
402 | - | |
403 | -#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1) | |
404 | +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | |
405 | +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | |
406 | +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | |
407 | ||
408 | +/* | |
409 | + * 'User priority' is the nice value converted to something we | |
410 | + * can work with better when scaling various scheduler parameters, | |
411 | + * it's a [ 0 ... 39 ] range. | |
412 | + */ | |
413 | +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) | |
414 | +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | |
415 | +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | |
416 | ||
417 | /* | |
418 | - * Init task must be ok at boot for the ix86 as we will check its signals | |
419 | - * via the SMP irq return path. | |
420 | + * These are the 'tuning knobs' of the scheduler: | |
421 | + * | |
422 | + * Minimum timeslice is 10 msecs, default timeslice is 150 msecs, | |
423 | + * maximum timeslice is 300 msecs. Timeslices get refilled after | |
424 | + * they expire. | |
425 | */ | |
426 | - | |
427 | -struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; | |
428 | +#define MIN_TIMESLICE ( 10 * HZ / 1000) | |
429 | +#define MAX_TIMESLICE (300 * HZ / 1000) | |
430 | +#define CHILD_PENALTY 95 | |
431 | +#define PARENT_PENALTY 100 | |
432 | +#define EXIT_WEIGHT 3 | |
433 | +#define PRIO_BONUS_RATIO 25 | |
434 | +#define INTERACTIVE_DELTA 2 | |
435 | +#define MAX_SLEEP_AVG (2*HZ) | |
436 | +#define STARVATION_LIMIT (2*HZ) | |
437 | ||
438 | /* | |
439 | - * The tasklist_lock protects the linked list of processes. | |
440 | + * If a task is 'interactive' then we reinsert it in the active | |
441 | + * array after it has expired its current timeslice. (it will not | |
442 | + * continue to run immediately, it will still roundrobin with | |
443 | + * other interactive tasks.) | |
444 | * | |
445 | - * The runqueue_lock locks the parts that actually access | |
446 | - * and change the run-queues, and have to be interrupt-safe. | |
447 | + * This part scales the interactivity limit depending on niceness. | |
448 | * | |
449 | - * If both locks are to be concurrently held, the runqueue_lock | |
450 | - * nests inside the tasklist_lock. | |
451 | + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. | |
452 | + * Here are a few examples of different nice levels: | |
453 | * | |
454 | - * task->alloc_lock nests inside tasklist_lock. | |
455 | + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] | |
456 | + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] | |
457 | + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] | |
458 | + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] | |
459 | + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] | |
460 | + * | |
461 | + * (the X axis represents the possible -5 ... 0 ... +5 dynamic | |
462 | + * priority range a task can explore, a value of '1' means the | |
463 | + * task is rated interactive.) | |
464 | + * | |
465 | + * Ie. nice +19 tasks can never get 'interactive' enough to be | |
466 | + * reinserted into the active array. And only heavily CPU-hog nice -20 | |
467 | + * tasks will be expired. Default nice 0 tasks are somewhere between, | |
468 | + * it takes some effort for them to get interactive, but it's not | |
469 | + * too hard. | |
470 | */ | |
471 | -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ | |
472 | -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ | |
473 | ||
474 | -static LIST_HEAD(runqueue_head); | |
475 | +#define SCALE(v1,v1_max,v2_max) \ | |
476 | + (v1) * (v2_max) / (v1_max) | |
477 | ||
478 | -/* | |
479 | - * We align per-CPU scheduling data on cacheline boundaries, | |
480 | - * to prevent cacheline ping-pong. | |
481 | - */ | |
482 | -static union { | |
483 | - struct schedule_data { | |
484 | - struct task_struct * curr; | |
485 | - cycles_t last_schedule; | |
486 | - } schedule_data; | |
487 | - char __pad [SMP_CACHE_BYTES]; | |
488 | -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; | |
489 | +#define DELTA(p) \ | |
490 | + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ | |
491 | + INTERACTIVE_DELTA) | |
492 | ||
493 | -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr | |
494 | -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule | |
495 | +#define TASK_INTERACTIVE(p) \ | |
496 | + ((p)->prio <= (p)->static_prio - DELTA(p)) | |
497 | ||
498 | -struct kernel_stat kstat; | |
499 | -extern struct task_struct *child_reaper; | |
500 | +/* | |
501 | + * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ] | |
502 | + * to time slice values. | |
503 | + * | |
504 | + * The higher a process's priority, the bigger timeslices | |
505 | + * it gets during one round of execution. But even the lowest | |
506 | + * priority process gets MIN_TIMESLICE worth of execution time. | |
507 | + */ | |
508 | ||
509 | -#ifdef CONFIG_SMP | |
510 | +#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \ | |
511 | + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39)) | |
512 | ||
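Plugging concrete nice values into NICE_TO_PRIO() and TASK_TIMESLICE() reproduces the 10/150/300 msec figures quoted in the tuning-knob comment above. A quick userspace check, not part of the patch (HZ fixed at 1000 here so ticks equal milliseconds; TIMESLICE() repeats the same arithmetic on a static_prio value):

    #include <stdio.h>

    #define HZ              1000
    #define MAX_RT_PRIO     100
    #define MAX_PRIO        (MAX_RT_PRIO + 40)
    #define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
    #define MIN_TIMESLICE   ( 10 * HZ / 1000)
    #define MAX_TIMESLICE   (300 * HZ / 1000)
    #define TIMESLICE(sp)   (MIN_TIMESLICE + \
            ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(sp))/39))

    int main(void)
    {
            int nice[] = { -20, 0, 19 }, i;

            for (i = 0; i < 3; i++)
                    printf("nice %3d -> static_prio %d -> %d msec slice\n",
                           nice[i], NICE_TO_PRIO(nice[i]),
                           TIMESLICE(NICE_TO_PRIO(nice[i])));
            return 0;       /* prints 300, 151 and 10 msecs respectively */
    }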
513 | -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) | |
514 | -#define can_schedule(p,cpu) \ | |
515 | - ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) | |
516 | +/* | |
517 | + * These are the runqueue data structures: | |
518 | + */ | |
519 | ||
520 | -#else | |
521 | +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | |
522 | ||
523 | -#define idle_task(cpu) (&init_task) | |
524 | -#define can_schedule(p,cpu) (1) | |
525 | +typedef struct runqueue runqueue_t; | |
526 | ||
527 | -#endif | |
528 | - | |
529 | -void scheduling_functions_start_here(void) { } | |
530 | +struct prio_array { | |
531 | + int nr_active; | |
532 | + spinlock_t *lock; | |
533 | + runqueue_t *rq; | |
534 | + unsigned long bitmap[BITMAP_SIZE]; | |
535 | + list_t queue[MAX_PRIO]; | |
536 | +}; | |
537 | ||
538 | /* | |
539 | - * This is the function that decides how desirable a process is.. | |
540 | - * You can weigh different processes against each other depending | |
541 | - * on what CPU they've run on lately etc to try to handle cache | |
542 | - * and TLB miss penalties. | |
543 | + * This is the main, per-CPU runqueue data structure. | |
544 | * | |
545 | - * Return values: | |
546 | - * -1000: never select this | |
547 | - * 0: out of time, recalculate counters (but it might still be | |
548 | - * selected) | |
549 | - * +ve: "goodness" value (the larger, the better) | |
550 | - * +1000: realtime process, select this. | |
551 | + * Locking rule: those places that want to lock multiple runqueues | |
552 | + * (such as the load balancing or the process migration code), lock | |
553 | + * acquire operations must be ordered by ascending &runqueue. | |
554 | */ | |
555 | +struct runqueue { | |
556 | + spinlock_t lock; | |
557 | + unsigned long nr_running, nr_switches, expired_timestamp; | |
558 | + task_t *curr, *idle; | |
559 | + prio_array_t *active, *expired, arrays[2]; | |
560 | + int prev_nr_running[NR_CPUS]; | |
561 | +} ____cacheline_aligned; | |
562 | ||
563 | -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) | |
564 | -{ | |
565 | - int weight; | |
566 | +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; | |
567 | ||
568 | - /* | |
569 | - * select the current process after every other | |
570 | - * runnable process, but before the idle thread. | |
571 | - * Also, dont trigger a counter recalculation. | |
572 | - */ | |
573 | - weight = -1; | |
574 | - if (p->policy & SCHED_YIELD) | |
575 | - goto out; | |
576 | +#define cpu_rq(cpu) (runqueues + (cpu)) | |
577 | +#define this_rq() cpu_rq(smp_processor_id()) | |
578 | +#define task_rq(p) cpu_rq((p)->cpu) | |
579 | +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) | |
580 | +#define rt_task(p) ((p)->prio < MAX_RT_PRIO) | |
581 | ||
582 | - /* | |
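With the per-priority lists and bitmap above, picking the next task no longer means walking the whole run-queue: find the first set bit in the active array's bitmap and take the head of that list. The schedule() rewrite itself lies past the end of this excerpt, but load_balance() below uses exactly these helpers; the selection step is presumably along these lines:

    /* hedged sketch of the O(1) pick-next step */
    prio_array_t *array = rq->active;
    int idx = sched_find_first_bit(array->bitmap);  /* highest-priority non-empty list */
    list_t *queue = array->queue + idx;
    task_t *next = list_entry(queue->next, task_t, run_list);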
583 | - * Non-RT process - normal case first. | |
584 | - */ | |
585 | - if (p->policy == SCHED_OTHER) { | |
586 | - /* | |
587 | - * Give the process a first-approximation goodness value | |
588 | - * according to the number of clock-ticks it has left. | |
589 | - * | |
590 | - * Don't do any other calculations if the time slice is | |
591 | - * over.. | |
592 | - */ | |
593 | - weight = p->counter; | |
594 | - if (!weight) | |
595 | - goto out; | |
596 | - | |
597 | -#ifdef CONFIG_SMP | |
598 | - /* Give a largish advantage to the same processor... */ | |
599 | - /* (this is equivalent to penalizing other processors) */ | |
600 | - if (p->processor == this_cpu) | |
601 | - weight += PROC_CHANGE_PENALTY; | |
602 | -#endif | |
603 | +static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags) | |
604 | +{ | |
605 | + struct runqueue *__rq; | |
606 | ||
607 | - /* .. and a slight advantage to the current MM */ | |
608 | - if (p->mm == this_mm || !p->mm) | |
609 | - weight += 1; | |
610 | - weight += 20 - p->nice; | |
611 | - goto out; | |
612 | +repeat_lock_task: | |
613 | + __rq = task_rq(p); | |
614 | + spin_lock_irqsave(&__rq->lock, *flags); | |
615 | + if (unlikely(__rq != task_rq(p))) { | |
616 | + spin_unlock_irqrestore(&__rq->lock, *flags); | |
617 | + goto repeat_lock_task; | |
618 | } | |
619 | + return __rq; | |
620 | +} | |
621 | ||
622 | - /* | |
623 | - * Realtime process, select the first one on the | |
624 | - * runqueue (taking priorities within processes | |
625 | - * into account). | |
626 | - */ | |
627 | - weight = 1000 + p->rt_priority; | |
628 | -out: | |
629 | - return weight; | |
630 | +static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags) | |
631 | +{ | |
632 | + spin_unlock_irqrestore(&rq->lock, *flags); | |
633 | } | |
634 | ||
635 | /* | |
636 | - * the 'goodness value' of replacing a process on a given CPU. | |
637 | - * positive value means 'replace', zero or negative means 'dont'. | |
638 | + * Adding/removing a task to/from a priority array: | |
639 | */ | |
640 | -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) | |
641 | +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) | |
642 | { | |
643 | - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); | |
644 | + array->nr_active--; | |
645 | + list_del_init(&p->run_list); | |
646 | + if (list_empty(array->queue + p->prio)) | |
647 | + __clear_bit(p->prio, array->bitmap); | |
648 | } | |
649 | ||
650 | -/* | |
651 | - * This is ugly, but reschedule_idle() is very timing-critical. | |
652 | - * We are called with the runqueue spinlock held and we must | |
653 | - * not claim the tasklist_lock. | |
654 | - */ | |
655 | -static FASTCALL(void reschedule_idle(struct task_struct * p)); | |
656 | +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) | |
657 | +{ | |
658 | + list_add_tail(&p->run_list, array->queue + p->prio); | |
659 | + __set_bit(p->prio, array->bitmap); | |
660 | + array->nr_active++; | |
661 | + p->array = array; | |
662 | +} | |
663 | ||
664 | -static void reschedule_idle(struct task_struct * p) | |
665 | +static inline int effective_prio(task_t *p) | |
666 | { | |
667 | -#ifdef CONFIG_SMP | |
668 | - int this_cpu = smp_processor_id(); | |
669 | - struct task_struct *tsk, *target_tsk; | |
670 | - int cpu, best_cpu, i, max_prio; | |
671 | - cycles_t oldest_idle; | |
672 | - | |
673 | - /* | |
674 | - * shortcut if the woken up task's last CPU is | |
675 | - * idle now. | |
676 | - */ | |
677 | - best_cpu = p->processor; | |
678 | - if (can_schedule(p, best_cpu)) { | |
679 | - tsk = idle_task(best_cpu); | |
680 | - if (cpu_curr(best_cpu) == tsk) { | |
681 | - int need_resched; | |
682 | -send_now_idle: | |
683 | - /* | |
684 | - * If need_resched == -1 then we can skip sending | |
685 | - * the IPI altogether, tsk->need_resched is | |
686 | - * actively watched by the idle thread. | |
687 | - */ | |
688 | - need_resched = tsk->need_resched; | |
689 | - tsk->need_resched = 1; | |
690 | - if ((best_cpu != this_cpu) && !need_resched) | |
691 | - smp_send_reschedule(best_cpu); | |
692 | - return; | |
693 | - } | |
694 | - } | |
695 | + int bonus, prio; | |
696 | ||
697 | /* | |
698 | - * We know that the preferred CPU has a cache-affine current | |
699 | - * process, lets try to find a new idle CPU for the woken-up | |
700 | - * process. Select the least recently active idle CPU. (that | |
701 | - * one will have the least active cache context.) Also find | |
702 | - * the executing process which has the least priority. | |
703 | - */ | |
704 | - oldest_idle = (cycles_t) -1; | |
705 | - target_tsk = NULL; | |
706 | - max_prio = 0; | |
707 | + * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG] | |
708 | + * into the -5 ... 0 ... +5 bonus/penalty range. | |
709 | + * | |
710 | + * We use 25% of the full 0...39 priority range so that: | |
711 | + * | |
712 | + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | |
713 | + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | |
714 | + * | |
715 | + * Both properties are important to certain workloads. | |
716 | + */ | |
717 | + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - | |
718 | + MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; | |
719 | ||
720 | - for (i = 0; i < smp_num_cpus; i++) { | |
721 | - cpu = cpu_logical_map(i); | |
722 | - if (!can_schedule(p, cpu)) | |
723 | - continue; | |
724 | - tsk = cpu_curr(cpu); | |
725 | + prio = p->static_prio - bonus; | |
726 | + if (prio < MAX_RT_PRIO) | |
727 | + prio = MAX_RT_PRIO; | |
728 | + if (prio > MAX_PRIO-1) | |
729 | + prio = MAX_PRIO-1; | |
730 | + return prio; | |
731 | +} | |
732 | + | |
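Substituting the constants defined earlier (MAX_USER_PRIO = 40, PRIO_BONUS_RATIO = 25, MAX_SLEEP_AVG = 2*HZ) shows how the bonus in effective_prio() sweeps the documented -5 ... +5 range:

    /*
     * bonus = 40*25 * sleep_avg / (2*HZ) / 100  -  40*25/100/2
     *       = 10 * sleep_avg / (2*HZ)  -  5
     *
     *   sleep_avg = 0     -> bonus = -5  (CPU hog:      prio = static_prio + 5)
     *   sleep_avg = HZ    -> bonus =  0  (neutral:      prio = static_prio)
     *   sleep_avg = 2*HZ  -> bonus = +5  (interactive:  prio = static_prio - 5)
     *
     * with the result clamped to the SCHED_OTHER range [MAX_RT_PRIO, MAX_PRIO-1].
     */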
733 | +static inline void activate_task(task_t *p, runqueue_t *rq) | |
734 | +{ | |
735 | + unsigned long sleep_time = jiffies - p->sleep_timestamp; | |
736 | + prio_array_t *array = rq->active; | |
737 | + | |
738 | + if (!rt_task(p) && sleep_time) { | |
739 | /* | |
740 | - * We use the first available idle CPU. This creates | |
741 | - * a priority list between idle CPUs, but this is not | |
742 | - * a problem. | |
743 | + * This code gives a bonus to interactive tasks. We update | |
744 | + * an 'average sleep time' value here, based on | |
745 | + * sleep_timestamp. The more time a task spends sleeping, | |
746 | + * the higher the average gets - and the higher the priority | |
747 | + * boost gets as well. | |
748 | */ | |
749 | - if (tsk == idle_task(cpu)) { | |
750 | -#if defined(__i386__) && defined(CONFIG_SMP) | |
751 | - /* | |
752 | - * Check if two siblings are idle in the same | |
753 | - * physical package. Use them if found. | |
754 | - */ | |
755 | - if (smp_num_siblings == 2) { | |
756 | - if (cpu_curr(cpu_sibling_map[cpu]) == | |
757 | - idle_task(cpu_sibling_map[cpu])) { | |
758 | - oldest_idle = last_schedule(cpu); | |
759 | - target_tsk = tsk; | |
760 | - break; | |
761 | - } | |
762 | - | |
763 | - } | |
764 | -#endif | |
765 | - if (last_schedule(cpu) < oldest_idle) { | |
766 | - oldest_idle = last_schedule(cpu); | |
767 | - target_tsk = tsk; | |
768 | - } | |
769 | - } else { | |
770 | - if (oldest_idle == -1ULL) { | |
771 | - int prio = preemption_goodness(tsk, p, cpu); | |
772 | - | |
773 | - if (prio > max_prio) { | |
774 | - max_prio = prio; | |
775 | - target_tsk = tsk; | |
776 | - } | |
777 | - } | |
778 | - } | |
779 | - } | |
780 | - tsk = target_tsk; | |
781 | - if (tsk) { | |
782 | - if (oldest_idle != -1ULL) { | |
783 | - best_cpu = tsk->processor; | |
784 | - goto send_now_idle; | |
785 | - } | |
786 | - tsk->need_resched = 1; | |
787 | - if (tsk->processor != this_cpu) | |
788 | - smp_send_reschedule(tsk->processor); | |
789 | + p->sleep_avg += sleep_time; | |
790 | + if (p->sleep_avg > MAX_SLEEP_AVG) | |
791 | + p->sleep_avg = MAX_SLEEP_AVG; | |
792 | + p->prio = effective_prio(p); | |
793 | } | |
794 | - return; | |
795 | - | |
796 | + enqueue_task(p, array); | |
797 | + rq->nr_running++; | |
798 | +} | |
799 | ||
800 | -#else /* UP */ | |
801 | - int this_cpu = smp_processor_id(); | |
802 | - struct task_struct *tsk; | |
803 | - | |
804 | - tsk = cpu_curr(this_cpu); | |
805 | - if (preemption_goodness(tsk, p, this_cpu) > 0) | |
806 | - tsk->need_resched = 1; | |
807 | -#endif | |
808 | +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) | |
809 | +{ | |
810 | + rq->nr_running--; | |
811 | + dequeue_task(p, p->array); | |
812 | + p->array = NULL; | |
813 | } | |
814 | ||
815 | +static inline void resched_task(task_t *p) | |
816 | +{ | |
817 | + int need_resched; | |
818 | + | |
819 | + need_resched = p->need_resched; | |
820 | + wmb(); | |
821 | + p->need_resched = 1; | |
822 | + if (!need_resched && (p->cpu != smp_processor_id())) | |
823 | + smp_send_reschedule(p->cpu); | |
824 | +} | |
825 | + | |
826 | +#ifdef CONFIG_SMP | |
827 | + | |
828 | /* | |
829 | - * Careful! | |
830 | - * | |
831 | - * This has to add the process to the _beginning_ of the | |
832 | - * run-queue, not the end. See the comment about "This is | |
833 | - * subtle" in the scheduler proper.. | |
834 | + * Wait for a process to unschedule. This is used by the exit() and | |
835 | + * ptrace() code. | |
836 | */ | |
837 | -static inline void add_to_runqueue(struct task_struct * p) | |
838 | +void wait_task_inactive(task_t * p) | |
839 | { | |
840 | - list_add(&p->run_list, &runqueue_head); | |
841 | - nr_running++; | |
842 | + unsigned long flags; | |
843 | + runqueue_t *rq; | |
844 | + | |
845 | +repeat: | |
846 | + rq = task_rq(p); | |
847 | + while (unlikely(rq->curr == p)) { | |
848 | + cpu_relax(); | |
849 | + barrier(); | |
850 | + } | |
851 | + rq = lock_task_rq(p, &flags); | |
852 | + if (unlikely(rq->curr == p)) { | |
853 | + unlock_task_rq(rq, &flags); | |
854 | + goto repeat; | |
855 | + } | |
856 | + unlock_task_rq(rq, &flags); | |
857 | } | |
858 | ||
859 | -static inline void move_last_runqueue(struct task_struct * p) | |
860 | +/* | |
861 | + * The SMP message passing code calls this function whenever | |
862 | + * the new task has arrived at the target CPU. We move the | |
863 | + * new task into the local runqueue. | |
864 | + * | |
865 | + * This function must be called with interrupts disabled. | |
866 | + */ | |
867 | +void sched_task_migrated(task_t *new_task) | |
868 | { | |
869 | - list_del(&p->run_list); | |
870 | - list_add_tail(&p->run_list, &runqueue_head); | |
871 | + wait_task_inactive(new_task); | |
872 | + new_task->cpu = smp_processor_id(); | |
873 | + wake_up_process(new_task); | |
874 | } | |
875 | ||
876 | -static inline void move_first_runqueue(struct task_struct * p) | |
877 | +/* | |
878 | + * Kick the remote CPU if the task is running currently, | |
879 | + * this code is used by the signal code to signal tasks | |
880 | + * which are in user-mode as quickly as possible. | |
881 | + * | |
882 | + * (Note that we do this lockless - if the task does anything | |
883 | + * while the message is in flight then it will notice the | |
884 | + * sigpending condition anyway.) | |
885 | + */ | |
886 | +void kick_if_running(task_t * p) | |
887 | { | |
888 | - list_del(&p->run_list); | |
889 | - list_add(&p->run_list, &runqueue_head); | |
890 | + if (p == task_rq(p)->curr) | |
891 | + resched_task(p); | |
892 | } | |
893 | +#endif | |
894 | ||
895 | /* | |
896 | * Wake up a process. Put it on the run-queue if it's not | |
897 | @@ -348,392 +321,528 @@ | |
898 | * "current->state = TASK_RUNNING" to mark yourself runnable | |
899 | * without the overhead of this. | |
900 | */ | |
901 | -static inline int try_to_wake_up(struct task_struct * p, int synchronous) | |
902 | +static int try_to_wake_up(task_t * p, int synchronous) | |
903 | { | |
904 | unsigned long flags; | |
905 | int success = 0; | |
906 | + runqueue_t *rq; | |
907 | ||
908 | - /* | |
909 | - * We want the common case fall through straight, thus the goto. | |
910 | - */ | |
911 | - spin_lock_irqsave(&runqueue_lock, flags); | |
912 | + rq = lock_task_rq(p, &flags); | |
913 | p->state = TASK_RUNNING; | |
914 | - if (task_on_runqueue(p)) | |
915 | - goto out; | |
916 | - add_to_runqueue(p); | |
917 | - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) | |
918 | - reschedule_idle(p); | |
919 | - success = 1; | |
920 | -out: | |
921 | - spin_unlock_irqrestore(&runqueue_lock, flags); | |
922 | + if (!p->array) { | |
923 | + activate_task(p, rq); | |
924 | + if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio)) | |
925 | + resched_task(rq->curr); | |
926 | + success = 1; | |
927 | + } | |
928 | + unlock_task_rq(rq, &flags); | |
929 | return success; | |
930 | } | |
931 | ||
932 | -inline int wake_up_process(struct task_struct * p) | |
933 | +int wake_up_process(task_t * p) | |
934 | { | |
935 | return try_to_wake_up(p, 0); | |
936 | } | |
937 | ||
938 | -static void process_timeout(unsigned long __data) | |
939 | +void wake_up_forked_process(task_t * p) | |
940 | { | |
941 | - struct task_struct * p = (struct task_struct *) __data; | |
942 | + runqueue_t *rq = this_rq(); | |
943 | ||
944 | - wake_up_process(p); | |
945 | + p->state = TASK_RUNNING; | |
946 | + if (!rt_task(p)) { | |
947 | + /* | |
948 | + * We decrease the sleep average of forking parents | |
949 | + * and children as well, to keep max-interactive tasks | |
950 | + * from forking tasks that are max-interactive. | |
951 | + */ | |
952 | + current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; | |
953 | + p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; | |
954 | + p->prio = effective_prio(p); | |
955 | + } | |
956 | + spin_lock_irq(&rq->lock); | |
957 | + p->cpu = smp_processor_id(); | |
958 | + activate_task(p, rq); | |
959 | + spin_unlock_irq(&rq->lock); | |
960 | } | |
961 | ||
962 | -/** | |
963 | - * schedule_timeout - sleep until timeout | |
964 | - * @timeout: timeout value in jiffies | |
965 | - * | |
966 | - * Make the current task sleep until @timeout jiffies have | |
967 | - * elapsed. The routine will return immediately unless | |
968 | - * the current task state has been set (see set_current_state()). | |
969 | - * | |
970 | - * You can set the task state as follows - | |
971 | - * | |
972 | - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to | |
973 | - * pass before the routine returns. The routine will return 0 | |
974 | - * | |
975 | - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | |
976 | - * delivered to the current task. In this case the remaining time | |
977 | - * in jiffies will be returned, or 0 if the timer expired in time | |
978 | - * | |
979 | - * The current task state is guaranteed to be TASK_RUNNING when this | |
980 | - * routine returns. | |
981 | - * | |
982 | - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule | |
983 | - * the CPU away without a bound on the timeout. In this case the return | |
984 | - * value will be %MAX_SCHEDULE_TIMEOUT. | |
985 | +/* | |
986 | + * Potentially available exiting-child timeslices are | |
987 | + * retrieved here - this way the parent does not get | |
988 | + * penalized for creating too many processes. | |
989 | * | |
990 | - * In all cases the return value is guaranteed to be non-negative. | |
991 | + * (this cannot be used to 'generate' timeslices | |
992 | + * artificially, because any timeslice recovered here | |
993 | + * was given away by the parent in the first place.) | |
994 | */ | |
995 | -signed long schedule_timeout(signed long timeout) | |
996 | +void sched_exit(task_t * p) | |
997 | { | |
998 | - struct timer_list timer; | |
999 | - unsigned long expire; | |
1000 | + __cli(); | |
1001 | + current->time_slice += p->time_slice; | |
1002 | + if (unlikely(current->time_slice > MAX_TIMESLICE)) | |
1003 | + current->time_slice = MAX_TIMESLICE; | |
1004 | + __sti(); | |
1005 | + /* | |
1006 | + * If the child was a (relative-) CPU hog then decrease | |
1007 | + * the sleep_avg of the parent as well. | |
1008 | + */ | |
1009 | + if (p->sleep_avg < current->sleep_avg) | |
1010 | + current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT + | |
1011 | + p->sleep_avg) / (EXIT_WEIGHT + 1); | |
1012 | +} | |
1013 | ||
1014 | - switch (timeout) | |
1015 | - { | |
1016 | - case MAX_SCHEDULE_TIMEOUT: | |
1017 | - /* | |
1018 | - * These two special cases are useful to be comfortable | |
1019 | - * in the caller. Nothing more. We could take | |
1020 | - * MAX_SCHEDULE_TIMEOUT from one of the negative value | |
1021 | - * but I' d like to return a valid offset (>=0) to allow | |
1022 | - * the caller to do everything it want with the retval. | |
1023 | - */ | |
1024 | - schedule(); | |
1025 | - goto out; | |
1026 | - default: | |
1027 | - /* | |
1028 | - * Another bit of PARANOID. Note that the retval will be | |
1029 | - * 0 since no piece of kernel is supposed to do a check | |
1030 | - * for a negative retval of schedule_timeout() (since it | |
1031 | - * should never happens anyway). You just have the printk() | |
1032 | - * that will tell you if something is gone wrong and where. | |
1033 | - */ | |
1034 | - if (timeout < 0) | |
1035 | - { | |
1036 | - printk(KERN_ERR "schedule_timeout: wrong timeout " | |
1037 | - "value %lx from %p\n", timeout, | |
1038 | - __builtin_return_address(0)); | |
1039 | - current->state = TASK_RUNNING; | |
1040 | - goto out; | |
1041 | - } | |
1042 | - } | |
1043 | +#if CONFIG_SMP | |
1044 | +asmlinkage void schedule_tail(task_t *prev) | |
1045 | +{ | |
1046 | + spin_unlock_irq(&this_rq()->lock); | |
1047 | +} | |
1048 | +#endif | |
1049 | ||
1050 | - expire = timeout + jiffies; | |
1051 | +static inline void context_switch(task_t *prev, task_t *next) | |
1052 | +{ | |
1053 | + struct mm_struct *mm = next->mm; | |
1054 | + struct mm_struct *oldmm = prev->active_mm; | |
1055 | ||
1056 | - init_timer(&timer); | |
1057 | - timer.expires = expire; | |
1058 | - timer.data = (unsigned long) current; | |
1059 | - timer.function = process_timeout; | |
1060 | + prepare_to_switch(); | |
1061 | ||
1062 | - add_timer(&timer); | |
1063 | - schedule(); | |
1064 | - del_timer_sync(&timer); | |
1065 | + if (unlikely(!mm)) { | |
1066 | + next->active_mm = oldmm; | |
1067 | + atomic_inc(&oldmm->mm_count); | |
1068 | + enter_lazy_tlb(oldmm, next, smp_processor_id()); | |
1069 | + } else | |
1070 | + switch_mm(oldmm, mm, next, smp_processor_id()); | |
1071 | ||
1072 | - timeout = expire - jiffies; | |
1073 | + if (unlikely(!prev->mm)) { | |
1074 | + prev->active_mm = NULL; | |
1075 | + mmdrop(oldmm); | |
1076 | + } | |
1077 | ||
1078 | - out: | |
1079 | - return timeout < 0 ? 0 : timeout; | |
1080 | + /* | |
1081 | + * Here we just switch the register state and the stack. There are | |
1082 | + * 3 processes affected by a context switch: | |
1083 | + * | |
1084 | + * prev ==> .... ==> (last => next) | |
1085 | + * | |
1086 | + * It's the 'much more previous' 'prev' that is on next's stack, | |
1087 | + * but prev is set to (the just run) 'last' process by switch_to(). | |
1088 | + * This might sound slightly confusing but makes tons of sense. | |
1089 | + */ | |
1090 | + switch_to(prev, next, prev); | |
1091 | } | |
1092 | ||
1093 | -/* | |
1094 | - * schedule_tail() is getting called from the fork return path. This | |
1095 | - * cleans up all remaining scheduler things, without impacting the | |
1096 | - * common case. | |
1097 | - */ | |
1098 | -static inline void __schedule_tail(struct task_struct *prev) | |
1099 | +unsigned long nr_running(void) | |
1100 | { | |
1101 | -#ifdef CONFIG_SMP | |
1102 | - int policy; | |
1103 | - | |
1104 | - /* | |
1105 | - * prev->policy can be written from here only before `prev' | |
1106 | - * can be scheduled (before setting prev->cpus_runnable to ~0UL). | |
1107 | - * Of course it must also be read before allowing prev | |
1108 | - * to be rescheduled, but since the write depends on the read | |
1109 | - * to complete, wmb() is enough. (the spin_lock() acquired | |
1110 | - * before setting cpus_runnable is not enough because the spin_lock() | |
1111 | - * common code semantics allows code outside the critical section | |
1112 | - * to enter inside the critical section) | |
1113 | - */ | |
1114 | - policy = prev->policy; | |
1115 | - prev->policy = policy & ~SCHED_YIELD; | |
1116 | - wmb(); | |
1117 | + unsigned long i, sum = 0; | |
1118 | ||
1119 | - /* | |
1120 | - * fast path falls through. We have to clear cpus_runnable before | |
1121 | - * checking prev->state to avoid a wakeup race. Protect against | |
1122 | - * the task exiting early. | |
1123 | - */ | |
1124 | - task_lock(prev); | |
1125 | - task_release_cpu(prev); | |
1126 | - mb(); | |
1127 | - if (prev->state == TASK_RUNNING) | |
1128 | - goto needs_resched; | |
1129 | + for (i = 0; i < smp_num_cpus; i++) | |
1130 | + sum += cpu_rq(cpu_logical_map(i))->nr_running; | |
1131 | ||
1132 | -out_unlock: | |
1133 | - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ | |
1134 | - return; | |
1135 | + return sum; | |
1136 | +} | |
1137 | ||
1138 | - /* | |
1139 | - * Slow path - we 'push' the previous process and | |
1140 | - * reschedule_idle() will attempt to find a new | |
1141 | - * processor for it. (but it might preempt the | |
1142 | - * current process as well.) We must take the runqueue | |
1143 | - * lock and re-check prev->state to be correct. It might | |
1144 | - * still happen that this process has a preemption | |
1145 | - * 'in progress' already - but this is not a problem and | |
1146 | - * might happen in other circumstances as well. | |
1147 | - */ | |
1148 | -needs_resched: | |
1149 | - { | |
1150 | - unsigned long flags; | |
1151 | +unsigned long nr_context_switches(void) | |
1152 | +{ | |
1153 | + unsigned long i, sum = 0; | |
1154 | ||
1155 | - /* | |
1156 | - * Avoid taking the runqueue lock in cases where | |
1157 | - * no preemption-check is necessery: | |
1158 | - */ | |
1159 | - if ((prev == idle_task(smp_processor_id())) || | |
1160 | - (policy & SCHED_YIELD)) | |
1161 | - goto out_unlock; | |
1162 | + for (i = 0; i < smp_num_cpus; i++) | |
1163 | + sum += cpu_rq(cpu_logical_map(i))->nr_switches; | |
1164 | ||
1165 | - spin_lock_irqsave(&runqueue_lock, flags); | |
1166 | - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) | |
1167 | - reschedule_idle(prev); | |
1168 | - spin_unlock_irqrestore(&runqueue_lock, flags); | |
1169 | - goto out_unlock; | |
1170 | - } | |
1171 | -#else | |
1172 | - prev->policy &= ~SCHED_YIELD; | |
1173 | -#endif /* CONFIG_SMP */ | |
1174 | + return sum; | |
1175 | } | |
1176 | ||
1177 | -asmlinkage void schedule_tail(struct task_struct *prev) | |
1178 | +#if CONFIG_SMP | |
1179 | +/* | |
1180 | + * Lock the busiest runqueue as well, this_rq is locked already. | |
1181 | + * Recalculate nr_running if we have to drop the runqueue lock. | |
1182 | + */ | |
1183 | +static inline unsigned int double_lock_balance(runqueue_t *this_rq, | |
1184 | + runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running) | |
1185 | { | |
1186 | - __schedule_tail(prev); | |
1187 | + if (unlikely(!spin_trylock(&busiest->lock))) { | |
1188 | + if (busiest < this_rq) { | |
1189 | + spin_unlock(&this_rq->lock); | |
1190 | + spin_lock(&busiest->lock); | |
1191 | + spin_lock(&this_rq->lock); | |
1192 | + /* Need to recalculate nr_running */ | |
1193 | + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) | |
1194 | + nr_running = this_rq->nr_running; | |
1195 | + else | |
1196 | + nr_running = this_rq->prev_nr_running[this_cpu]; | |
1197 | + } else | |
1198 | + spin_lock(&busiest->lock); | |
1199 | + } | |
1200 | + return nr_running; | |
1201 | } | |
1202 | ||
1203 | /* | |
1204 | - * 'schedule()' is the scheduler function. It's a very simple and nice | |
1205 | - * scheduler: it's not perfect, but certainly works for most things. | |
1206 | - * | |
1207 | - * The goto is "interesting". | |
1208 | + * Current runqueue is empty, or rebalance tick: if there is an | |
1209 | + * imbalance (current runqueue is too short) then pull from |
1210 | + * busiest runqueue(s). | |
1211 | * | |
1212 | - * NOTE!! Task 0 is the 'idle' task, which gets called when no other | |
1213 | - * tasks can run. It can not be killed, and it cannot sleep. The 'state' | |
1214 | - * information in task[0] is never used. | |
1215 | + * We call this with the current runqueue locked, | |
1216 | + * irqs disabled. | |
1217 | */ | |
1218 | -asmlinkage void schedule(void) | |
1219 | +static void load_balance(runqueue_t *this_rq, int idle) | |
1220 | { | |
1221 | - struct schedule_data * sched_data; | |
1222 | - struct task_struct *prev, *next, *p; | |
1223 | - struct list_head *tmp; | |
1224 | - int this_cpu, c; | |
1225 | + int imbalance, nr_running, load, max_load, | |
1226 | + idx, i, this_cpu = smp_processor_id(); | |
1227 | + task_t *next = this_rq->idle, *tmp; | |
1228 | + runqueue_t *busiest, *rq_src; | |
1229 | + prio_array_t *array; | |
1230 | + list_t *head, *curr; | |
1231 | ||
1232 | + /* | |
1233 | + * We search all runqueues to find the most busy one. | |
1234 | + * We do this lockless to reduce cache-bouncing overhead, | |
1235 | + * we re-check the 'best' source CPU later on again, with | |
1236 | + * the lock held. | |
1237 | + * | |
1238 | + * We fend off statistical fluctuations in runqueue lengths by | |
1239 | + * saving the runqueue length during the previous load-balancing | |
1240 | + * operation and using the smaller one of the current and saved lengths. |
1241 | + * If a runqueue is long enough for a longer amount of time then | |
1242 | + * we recognize it and pull tasks from it. | |
1243 | + * | |
1244 | + * The 'current runqueue length' is a statistical maximum variable, | |
1245 | + * for that one we take the longer one - to avoid fluctuations in | |
1246 | + * the other direction. So for a load-balance to happen it needs | |
1247 | + * stable long runqueue on the target CPU and stable short runqueue | |
1248 | + * on the local runqueue. | |
1249 | + * | |
1250 | + * We make an exception if this CPU is about to become idle - in | |
1251 | + * that case we are less picky about moving a task across CPUs and | |
1252 | + * take what can be taken. | |
1253 | + */ | |
1254 | + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) | |
1255 | + nr_running = this_rq->nr_running; | |
1256 | + else | |
1257 | + nr_running = this_rq->prev_nr_running[this_cpu]; | |
1258 | ||
1259 | - spin_lock_prefetch(&runqueue_lock); | |
1260 | + busiest = NULL; | |
1261 | + max_load = 1; | |
1262 | + for (i = 0; i < smp_num_cpus; i++) { | |
1263 | + rq_src = cpu_rq(cpu_logical_map(i)); | |
1264 | + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) | |
1265 | + load = rq_src->nr_running; | |
1266 | + else | |
1267 | + load = this_rq->prev_nr_running[i]; | |
1268 | + this_rq->prev_nr_running[i] = rq_src->nr_running; | |
1269 | + | |
1270 | + if ((load > max_load) && (rq_src != this_rq)) { | |
1271 | + busiest = rq_src; | |
1272 | + max_load = load; | |
1273 | + } | |
1274 | + } | |
1275 | ||
1276 | - if (!current->active_mm) BUG(); | |
1277 | -need_resched_back: | |
1278 | - prev = current; | |
1279 | - this_cpu = prev->processor; | |
1280 | + if (likely(!busiest)) | |
1281 | + return; | |
1282 | ||
1283 | - if (unlikely(in_interrupt())) { | |
1284 | - printk("Scheduling in interrupt\n"); | |
1285 | - BUG(); | |
1286 | - } | |
1287 | + imbalance = (max_load - nr_running) / 2; | |
1288 | ||
1289 | - release_kernel_lock(prev, this_cpu); | |
1290 | + /* It needs an at least ~25% imbalance to trigger balancing. */ | |
1291 | + if (!idle && (imbalance < (max_load + 3)/4)) | |
1292 | + return; | |
1293 | ||
1294 | + nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); | |
1295 | /* | |
1296 | - * 'sched_data' is protected by the fact that we can run | |
1297 | - * only one process per CPU. | |
1298 | + * Make sure nothing changed since we checked the | |
1299 | + * runqueue length. | |
1300 | */ | |
1301 | - sched_data = & aligned_data[this_cpu].schedule_data; | |
1302 | + if (busiest->nr_running <= this_rq->nr_running + 1) | |
1303 | + goto out_unlock; | |
1304 | ||
1305 | - spin_lock_irq(&runqueue_lock); | |
1306 | + /* | |
1307 | + * We first consider expired tasks. Those will likely not be | |
1308 | + * executed in the near future, and they are most likely to | |
1309 | + * be cache-cold, thus switching CPUs has the least effect | |
1310 | + * on them. | |
1311 | + */ | |
1312 | + if (busiest->expired->nr_active) | |
1313 | + array = busiest->expired; | |
1314 | + else | |
1315 | + array = busiest->active; | |
1316 | ||
1317 | - /* move an exhausted RR process to be last.. */ | |
1318 | - if (unlikely(prev->policy == SCHED_RR)) | |
1319 | - if (!prev->counter) { | |
1320 | - prev->counter = NICE_TO_TICKS(prev->nice); | |
1321 | - move_last_runqueue(prev); | |
1322 | +new_array: | |
1323 | + /* Start searching at priority 0: */ | |
1324 | + idx = 0; | |
1325 | +skip_bitmap: | |
1326 | + if (!idx) | |
1327 | + idx = sched_find_first_bit(array->bitmap); | |
1328 | + else | |
1329 | + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | |
1330 | + if (idx == MAX_PRIO) { | |
1331 | + if (array == busiest->expired) { | |
1332 | + array = busiest->active; | |
1333 | + goto new_array; | |
1334 | } | |
1335 | - | |
1336 | - switch (prev->state) { | |
1337 | - case TASK_INTERRUPTIBLE: | |
1338 | - if (signal_pending(prev)) { | |
1339 | - prev->state = TASK_RUNNING; | |
1340 | - break; | |
1341 | - } | |
1342 | - default: | |
1343 | - del_from_runqueue(prev); | |
1344 | - case TASK_RUNNING:; | |
1345 | + goto out_unlock; | |
1346 | } | |
1347 | - prev->need_resched = 0; | |
1348 | - | |
1349 | - /* | |
1350 | - * this is the scheduler proper: | |
1351 | - */ | |
1352 | ||
1353 | -repeat_schedule: | |
1354 | - /* | |
1355 | - * Default process to select.. | |
1356 | - */ | |
1357 | - next = idle_task(this_cpu); | |
1358 | - c = -1000; | |
1359 | - list_for_each(tmp, &runqueue_head) { | |
1360 | - p = list_entry(tmp, struct task_struct, run_list); | |
1361 | - if (can_schedule(p, this_cpu)) { | |
1362 | - int weight = goodness(p, this_cpu, prev->active_mm); | |
1363 | - if (weight > c) | |
1364 | - c = weight, next = p; | |
1365 | + head = array->queue + idx; | |
1366 | + curr = head->prev; | |
1367 | +skip_queue: | |
1368 | + tmp = list_entry(curr, task_t, run_list); | |
1369 | + | |
1370 | + /* | |
1371 | + * We do not migrate tasks that are: | |
1372 | + * 1) running (obviously), or | |
1373 | + * 2) cannot be migrated to this CPU due to cpus_allowed, or | |
1374 | + * 3) are cache-hot on their current CPU. | |
1375 | + */ | |
1376 | + | |
1377 | +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ | |
1378 | + ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ | |
1379 | + ((p) != (rq)->curr) && \ | |
1380 | + (tmp->cpus_allowed & (1 << (this_cpu)))) | |
1381 | + | |
1382 | + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { | |
1383 | + curr = curr->next; | |
1384 | + if (curr != head) | |
1385 | + goto skip_queue; | |
1386 | + idx++; | |
1387 | + goto skip_bitmap; | |
1388 | + } | |
1389 | + next = tmp; | |
1390 | + /* | |
1391 | + * take the task out of the other runqueue and | |
1392 | + * put it into this one: | |
1393 | + */ | |
1394 | + dequeue_task(next, array); | |
1395 | + busiest->nr_running--; | |
1396 | + next->cpu = this_cpu; | |
1397 | + this_rq->nr_running++; | |
1398 | + enqueue_task(next, this_rq->active); | |
1399 | + if (next->prio < current->prio) | |
1400 | + current->need_resched = 1; | |
1401 | + if (!idle && --imbalance) { | |
1402 | + if (array == busiest->expired) { | |
1403 | + array = busiest->active; | |
1404 | + goto new_array; | |
1405 | } | |
1406 | } | |
1407 | +out_unlock: | |
1408 | + spin_unlock(&busiest->lock); | |
1409 | +} | |
1410 | + | |
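A concrete illustration of the "~25% imbalance" threshold above, and of how many tasks one pass will move:

    /* busy rebalance (idle == 0), this CPU vs. the busiest runqueue found:
     *
     *   busiest = 8, local = 4:  imbalance = (8-4)/2 = 2,  threshold = (8+3)/4 = 2
     *                            -> not below threshold, pull up to 2 tasks over
     *   busiest = 5, local = 4:  imbalance = (5-4)/2 = 0,  threshold = (5+3)/4 = 2
     *                            -> below threshold, leave things alone
     */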
1411 | +/* | |
1412 | + * One of the idle_cpu_tick() or the busy_cpu_tick() functions |
1413 | + * gets called every timer tick, on every CPU. Our balancing action |
1414 | + * frequency and balancing aggressiveness depend on whether the CPU is |
1415 | + * idle or not. | |
1416 | + * | |
1417 | + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on | |
1418 | + * systems with HZ=100, every 10 msecs.) | |
1419 | + */ | |
1420 | +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) | |
1421 | +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) | |
1422 | + | |
1423 | +static inline void idle_tick(void) | |
1424 | +{ | |
1425 | + if (jiffies % IDLE_REBALANCE_TICK) | |
1426 | + return; | |
1427 | + spin_lock(&this_rq()->lock); | |
1428 | + load_balance(this_rq(), 1); | |
1429 | + spin_unlock(&this_rq()->lock); | |
1430 | +} | |
1431 | + | |
1432 | +#endif | |
1433 | ||
1434 | - /* Do we need to re-calculate counters? */ | |
1435 | - if (unlikely(!c)) { | |
1436 | - struct task_struct *p; | |
1437 | - | |
1438 | - spin_unlock_irq(&runqueue_lock); | |
1439 | - read_lock(&tasklist_lock); | |
1440 | - for_each_task(p) | |
1441 | - p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); | |
1442 | - read_unlock(&tasklist_lock); | |
1443 | - spin_lock_irq(&runqueue_lock); | |
1444 | - goto repeat_schedule; | |
1445 | +/* | |
1446 | + * We place interactive tasks back into the active array, if possible. | |
1447 | + * | |
1448 | + * To guarantee that this does not starve expired tasks we ignore the | |
1449 | + * interactivity of a task if the first expired task had to wait more | |
1450 | + * than a 'reasonable' amount of time. This deadline timeout is | |
1451 | + * load-dependent, as the frequency of array switches decreases with |
1452 | + * increasing number of running tasks: | |
1453 | + */ | |
1454 | +#define EXPIRED_STARVING(rq) \ | |
1455 | + ((rq)->expired_timestamp && \ | |
1456 | + (jiffies - (rq)->expired_timestamp >= \ | |
1457 | + STARVATION_LIMIT * ((rq)->nr_running) + 1)) | |
1458 | + | |
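For a sense of scale of the starvation cut-off (assuming HZ = 100, so STARVATION_LIMIT = 2*HZ = 200 ticks = 2 seconds):

    /* with 3 runnable tasks on this CPU:
     *   deadline = STARVATION_LIMIT * nr_running + 1 = 601 ticks, just over 6 seconds
     * once the oldest expired task has waited that long, interactive tasks stop
     * being reinserted into the active array, so the array switch cannot be put
     * off indefinitely.
     */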
1459 | +/* | |
1460 | + * This function gets called by the timer code, with HZ frequency. | |
1461 | + * We call it with interrupts disabled. | |
1462 | + */ | |
1463 | +void scheduler_tick(int user_tick, int system) | |
1464 | +{ | |
1465 | + int cpu = smp_processor_id(); | |
1466 | + runqueue_t *rq = this_rq(); | |
1467 | + task_t *p = current; | |
1468 | + | |
1469 | + if (p == rq->idle) { | |
1470 | + if (local_bh_count(cpu) || local_irq_count(cpu) > 1) | |
1471 | + kstat.per_cpu_system[cpu] += system; | |
1472 | +#if CONFIG_SMP | |
1473 | + idle_tick(); | |
1474 | +#endif | |
1475 | + return; | |
1476 | } | |
1477 | + if (TASK_NICE(p) > 0) | |
1478 | + kstat.per_cpu_nice[cpu] += user_tick; | |
1479 | + else | |
1480 | + kstat.per_cpu_user[cpu] += user_tick; | |
1481 | + kstat.per_cpu_system[cpu] += system; | |
1482 | ||
1483 | + /* Task might have expired already, but not scheduled off yet */ | |
1484 | + if (p->array != rq->active) { | |
1485 | + p->need_resched = 1; | |
1486 | + return; | |
1487 | + } | |
1488 | + spin_lock(&rq->lock); | |
1489 | + if (unlikely(rt_task(p))) { | |
1490 | + /* | |
1491 | + * RR tasks need a special form of timeslice management. | |
1492 | + * FIFO tasks have no timeslices. | |
1493 | + */ | |
1494 | + if ((p->policy == SCHED_RR) && !--p->time_slice) { | |
1495 | + p->time_slice = TASK_TIMESLICE(p); | |
1496 | + p->need_resched = 1; | |
1497 | + | |
1498 | + /* put it at the end of the queue: */ | |
1499 | + dequeue_task(p, rq->active); | |
1500 | + enqueue_task(p, rq->active); | |
1501 | + } | |
1502 | + goto out; | |
1503 | + } | |
1504 | /* | |
1505 | - * from this point on nothing can prevent us from | |
1506 | - * switching to the next task, save this fact in | |
1507 | - * sched_data. | |
1508 | - */ | |
1509 | - sched_data->curr = next; | |
1510 | - task_set_cpu(next, this_cpu); | |
1511 | - spin_unlock_irq(&runqueue_lock); | |
1512 | - | |
1513 | - if (unlikely(prev == next)) { | |
1514 | - /* We won't go through the normal tail, so do this by hand */ | |
1515 | - prev->policy &= ~SCHED_YIELD; | |
1516 | - goto same_process; | |
1517 | + * The task was running during this tick - update the | |
1518 | + * time slice counter and the sleep average. Note: we | |
1519 | + * do not update a process's priority until it either | |
1520 | + * goes to sleep or uses up its timeslice. This makes | |
1521 | + * it possible for interactive tasks to use up their | |
1522 | + * timeslices at their highest priority levels. | |
1523 | + */ | |
1524 | + if (p->sleep_avg) | |
1525 | + p->sleep_avg--; | |
1526 | + if (!--p->time_slice) { | |
1527 | + dequeue_task(p, rq->active); | |
1528 | + p->need_resched = 1; | |
1529 | + p->prio = effective_prio(p); | |
1530 | + p->time_slice = TASK_TIMESLICE(p); | |
1531 | + | |
1532 | + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | |
1533 | + if (!rq->expired_timestamp) | |
1534 | + rq->expired_timestamp = jiffies; | |
1535 | + enqueue_task(p, rq->expired); | |
1536 | + } else | |
1537 | + enqueue_task(p, rq->active); | |
1538 | } | |
1539 | +out: | |
1540 | +#if CONFIG_SMP | |
1541 | + if (!(jiffies % BUSY_REBALANCE_TICK)) | |
1542 | + load_balance(rq, 0); | |
1543 | +#endif | |
1544 | + spin_unlock(&rq->lock); | |
1545 | +} | |
1546 | ||
1547 | -#ifdef CONFIG_SMP | |
1548 | - /* | |
1549 | - * maintain the per-process 'last schedule' value. | |
1550 | - * (this has to be recalculated even if we reschedule to | |
1551 | - * the same process) Currently this is only used on SMP, | |
1552 | - * and it's approximate, so we do not have to maintain | |
1553 | - * it while holding the runqueue spinlock. | |
1554 | - */ | |
1555 | - sched_data->last_schedule = get_cycles(); | |
1556 | +void scheduling_functions_start_here(void) { } | |
1557 | ||
1558 | - /* | |
1559 | - * We drop the scheduler lock early (it's a global spinlock), | |
1560 | - * thus we have to lock the previous process from getting | |
1561 | - * rescheduled during switch_to(). | |
1562 | - */ | |
1563 | +/* | |
1564 | + * 'schedule()' is the main scheduler function. | |
1565 | + */ | |
1566 | +asmlinkage void schedule(void) | |
1567 | +{ | |
1568 | + task_t *prev = current, *next; | |
1569 | + runqueue_t *rq = this_rq(); | |
1570 | + prio_array_t *array; | |
1571 | + list_t *queue; | |
1572 | + int idx; | |
1573 | ||
1574 | -#endif /* CONFIG_SMP */ | |
1575 | + if (unlikely(in_interrupt())) | |
1576 | + BUG(); | |
1577 | + release_kernel_lock(prev, smp_processor_id()); | |
1578 | + prev->sleep_timestamp = jiffies; | |
1579 | + spin_lock_irq(&rq->lock); | |
1580 | ||
1581 | - kstat.context_swtch++; | |
1582 | - /* | |
1583 | - * there are 3 processes which are affected by a context switch: | |
1584 | - * | |
1585 | - * prev == .... ==> (last => next) | |
1586 | - * | |
1587 | - * It's the 'much more previous' 'prev' that is on next's stack, | |
1588 | - * but prev is set to (the just run) 'last' process by switch_to(). | |
1589 | - * This might sound slightly confusing but makes tons of sense. | |
1590 | - */ | |
1591 | - prepare_to_switch(); | |
1592 | - { | |
1593 | - struct mm_struct *mm = next->mm; | |
1594 | - struct mm_struct *oldmm = prev->active_mm; | |
1595 | - if (!mm) { | |
1596 | - if (next->active_mm) BUG(); | |
1597 | - next->active_mm = oldmm; | |
1598 | - atomic_inc(&oldmm->mm_count); | |
1599 | - enter_lazy_tlb(oldmm, next, this_cpu); | |
1600 | - } else { | |
1601 | - if (next->active_mm != mm) BUG(); | |
1602 | - switch_mm(oldmm, mm, next, this_cpu); | |
1603 | + switch (prev->state) { | |
1604 | + case TASK_INTERRUPTIBLE: | |
1605 | + if (unlikely(signal_pending(prev))) { | |
1606 | + prev->state = TASK_RUNNING; | |
1607 | + break; | |
1608 | } | |
1609 | + default: | |
1610 | + deactivate_task(prev, rq); | |
1611 | + case TASK_RUNNING: | |
1612 | + ; | |
1613 | + } | |
1614 | +#if CONFIG_SMP | |
1615 | +pick_next_task: | |
1616 | +#endif | |
1617 | + if (unlikely(!rq->nr_running)) { | |
1618 | +#if CONFIG_SMP | |
1619 | + load_balance(rq, 1); | |
1620 | + if (rq->nr_running) | |
1621 | + goto pick_next_task; | |
1622 | +#endif | |
1623 | + next = rq->idle; | |
1624 | + rq->expired_timestamp = 0; | |
1625 | + goto switch_tasks; | |
1626 | + } | |
1627 | ||
1628 | - if (!prev->mm) { | |
1629 | - prev->active_mm = NULL; | |
1630 | - mmdrop(oldmm); | |
1631 | - } | |
1632 | + array = rq->active; | |
1633 | + if (unlikely(!array->nr_active)) { | |
1634 | + /* | |
1635 | + * Switch the active and expired arrays. | |
1636 | + */ | |
1637 | + rq->active = rq->expired; | |
1638 | + rq->expired = array; | |
1639 | + array = rq->active; | |
1640 | + rq->expired_timestamp = 0; | |
1641 | } | |
1642 | ||
1643 | - /* | |
1644 | - * This just switches the register state and the | |
1645 | - * stack. | |
1646 | - */ | |
1647 | - switch_to(prev, next, prev); | |
1648 | - __schedule_tail(prev); | |
1649 | + idx = sched_find_first_bit(array->bitmap); | |
1650 | + queue = array->queue + idx; | |
1651 | + next = list_entry(queue->next, task_t, run_list); | |
1652 | + | |
1653 | +switch_tasks: | |
1654 | + prefetch(next); | |
1655 | + prev->need_resched = 0; | |
1656 | + | |
1657 | + if (likely(prev != next)) { | |
1658 | + rq->nr_switches++; | |
1659 | + rq->curr = next; | |
1660 | + context_switch(prev, next); | |
1661 | + /* | |
1662 | + * The runqueue pointer might be from another CPU | |
1663 | + * if the new task was last running on a different | |
1664 | + * CPU - thus re-load it. | |
1665 | + */ | |
1666 | + barrier(); | |
1667 | + rq = this_rq(); | |
1668 | + } | |
1669 | + spin_unlock_irq(&rq->lock); | |
1670 | ||
1671 | -same_process: | |
1672 | reacquire_kernel_lock(current); | |
1673 | - if (current->need_resched) | |
1674 | - goto need_resched_back; | |
1675 | return; | |
1676 | } | |
1677 | ||
1678 | /* | |
1679 | - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything | |
1680 | - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the | |
1681 | - * non-exclusive tasks and one exclusive task. | |
1682 | + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | |
1683 | + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | |
1684 | + * number) then we wake all the non-exclusive tasks and one exclusive task. | |
1685 | * | |
1686 | * There are circumstances in which we can try to wake a task which has already | |
1687 | - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero | |
1688 | - * in this (rare) case, and we handle it by contonuing to scan the queue. | |
1689 | + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | |
1690 | + * zero in this (rare) case, and we handle it by continuing to scan the queue. | |
1691 | */ | |
1692 | static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, | |
1693 | int nr_exclusive, const int sync) | |
1694 | { | |
1695 | struct list_head *tmp; | |
1696 | - struct task_struct *p; | |
1697 | + task_t *p; | |
1698 | ||
1699 | - CHECK_MAGIC_WQHEAD(q); | |
1700 | - WQ_CHECK_LIST_HEAD(&q->task_list); | |
1701 | - | |
1702 | list_for_each(tmp,&q->task_list) { | |
1703 | unsigned int state; | |
1704 | - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | |
1705 | + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | |
1706 | ||
1707 | - CHECK_MAGIC(curr->__magic); | |
1708 | p = curr->task; | |
1709 | state = p->state; | |
1710 | - if (state & mode) { | |
1711 | - WQ_NOTE_WAKER(curr); | |
1712 | - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | |
1713 | - break; | |
1714 | - } | |
1715 | + if ((state & mode) && | |
1716 | + try_to_wake_up(p, sync) && | |
1717 | + ((curr->flags & WQ_FLAG_EXCLUSIVE) && | |
1718 | + !--nr_exclusive)) | |
1719 | + break; | |
1720 | } | |
1721 | } | |
1722 | ||
1723 | @@ -850,8 +959,71 @@ | |
1724 | return timeout; | |
1725 | } | |
1726 | ||
1727 | +/* | |
1728 | + * Change the current task's CPU affinity. Migrate the process to a | |
1729 | + * proper CPU and schedule away if the current CPU is removed from | |
1730 | + * the allowed bitmask. | |
1731 | + */ | |
1732 | +void set_cpus_allowed(task_t *p, unsigned long new_mask) | |
1733 | +{ | |
1734 | + new_mask &= cpu_online_map; | |
1735 | + if (!new_mask) | |
1736 | + BUG(); | |
1737 | + if (p != current) | |
1738 | + BUG(); | |
1739 | + | |
1740 | + p->cpus_allowed = new_mask; | |
1741 | + /* | |
1742 | + * Can the task run on the current CPU? If not then | |
1743 | + * migrate the process off to a proper CPU. | |
1744 | + */ | |
1745 | + if (new_mask & (1UL << smp_processor_id())) | |
1746 | + return; | |
1747 | +#if CONFIG_SMP | |
1748 | + current->state = TASK_UNINTERRUPTIBLE; | |
1749 | + smp_migrate_task(__ffs(new_mask), current); | |
1750 | + | |
1751 | + schedule(); | |
1752 | +#endif | |
1753 | +} | |
1754 | + | |
1755 | void scheduling_functions_end_here(void) { } | |
1756 | ||
1757 | +void set_user_nice(task_t *p, long nice) | |
1758 | +{ | |
1759 | + unsigned long flags; | |
1760 | + prio_array_t *array; | |
1761 | + runqueue_t *rq; | |
1762 | + | |
1763 | + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | |
1764 | + return; | |
1765 | + /* | |
1766 | + * We have to be careful: if called from sys_setpriority(), |
1767 | + * the task might be in the middle of scheduling on another CPU. | |
1768 | + */ | |
1769 | + rq = lock_task_rq(p, &flags); | |
1770 | + if (rt_task(p)) { | |
1771 | + p->static_prio = NICE_TO_PRIO(nice); | |
1772 | + goto out_unlock; | |
1773 | + } | |
1774 | + array = p->array; | |
1775 | + if (array) | |
1776 | + dequeue_task(p, array); | |
1777 | + p->static_prio = NICE_TO_PRIO(nice); | |
1778 | + p->prio = NICE_TO_PRIO(nice); | |
1779 | + if (array) { | |
1780 | + enqueue_task(p, array); | |
1781 | + /* | |
1782 | + * If the task is running and lowered its priority, or if it |
1783 | + * increased its priority, then reschedule its CPU: |
1784 | + */ | |
1785 | + if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr)) | |
1786 | + resched_task(rq->curr); | |
1787 | + } | |
1788 | +out_unlock: | |
1789 | + unlock_task_rq(rq, &flags); | |
1790 | +} | |
1791 | + | |
1792 | #ifndef __alpha__ | |
1793 | ||
1794 | /* | |
1795 | @@ -862,7 +1034,7 @@ | |
1796 | ||
1797 | asmlinkage long sys_nice(int increment) | |
1798 | { | |
1799 | - long newprio; | |
1800 | + long nice; | |
1801 | ||
1802 | /* | |
1803 | * Setpriority might change our priority at the same moment. | |
1804 | @@ -878,32 +1050,46 @@ | |
1805 | if (increment > 40) | |
1806 | increment = 40; | |
1807 | ||
1808 | - newprio = current->nice + increment; | |
1809 | - if (newprio < -20) | |
1810 | - newprio = -20; | |
1811 | - if (newprio > 19) | |
1812 | - newprio = 19; | |
1813 | - current->nice = newprio; | |
1814 | + nice = PRIO_TO_NICE(current->static_prio) + increment; | |
1815 | + if (nice < -20) | |
1816 | + nice = -20; | |
1817 | + if (nice > 19) | |
1818 | + nice = 19; | |
1819 | + set_user_nice(current, nice); | |
1820 | return 0; | |
1821 | } | |
1822 | ||
1823 | #endif | |
1824 | ||
1825 | -static inline struct task_struct *find_process_by_pid(pid_t pid) | |
1826 | +/* | |
1827 | + * This is the priority value as seen by users in /proc | |
1828 | + * | |
1829 | + * RT tasks are offset by -200. Normal tasks are centered | |
1830 | + * around 0; values go from -16 to +15. |
1831 | + */ | |
1832 | +int task_prio(task_t *p) | |
1833 | { | |
1834 | - struct task_struct *tsk = current; | |
1835 | + return p->prio - 100; | |
1836 | +} | |
1837 | ||
1838 | - if (pid) | |
1839 | - tsk = find_task_by_pid(pid); | |
1840 | - return tsk; | |
1841 | +int task_nice(task_t *p) | |
1842 | +{ | |
1843 | + return TASK_NICE(p); | |
1844 | +} | |
1845 | + | |
1846 | +static inline task_t *find_process_by_pid(pid_t pid) | |
1847 | +{ | |
1848 | + return pid ? find_task_by_pid(pid) : current; | |
1849 | } | |
1850 | ||
1851 | -static int setscheduler(pid_t pid, int policy, | |
1852 | - struct sched_param *param) | |
1853 | +static int setscheduler(pid_t pid, int policy, struct sched_param *param) | |
1854 | { | |
1855 | struct sched_param lp; | |
1856 | - struct task_struct *p; | |
1857 | + prio_array_t *array; | |
1858 | + unsigned long flags; | |
1859 | + runqueue_t *rq; | |
1860 | int retval; | |
1861 | + task_t *p; | |
1862 | ||
1863 | retval = -EINVAL; | |
1864 | if (!param || pid < 0) | |
1865 | @@ -917,14 +1103,19 @@ | |
1866 | * We play safe to avoid deadlocks. | |
1867 | */ | |
1868 | read_lock_irq(&tasklist_lock); | |
1869 | - spin_lock(&runqueue_lock); | |
1870 | ||
1871 | p = find_process_by_pid(pid); | |
1872 | ||
1873 | retval = -ESRCH; | |
1874 | if (!p) | |
1875 | - goto out_unlock; | |
1876 | - | |
1877 | + goto out_unlock_tasklist; | |
1878 | + | |
1879 | + /* | |
1880 | + * To be able to change p->policy safely, the appropriate |
1881 | + * runqueue lock must be held. | |
1882 | + */ | |
1883 | + rq = lock_task_rq(p, &flags); | |
1884 | + | |
1885 | if (policy < 0) | |
1886 | policy = p->policy; | |
1887 | else { | |
1888 | @@ -945,30 +1136,36 @@ | |
1889 | goto out_unlock; | |
1890 | ||
1891 | retval = -EPERM; | |
1892 | - if ((policy == SCHED_FIFO || policy == SCHED_RR) && | |
1893 | + if ((policy == SCHED_FIFO || policy == SCHED_RR) && | |
1894 | !capable(CAP_SYS_NICE)) | |
1895 | goto out_unlock; | |
1896 | if ((current->euid != p->euid) && (current->euid != p->uid) && | |
1897 | !capable(CAP_SYS_NICE)) | |
1898 | goto out_unlock; | |
1899 | ||
1900 | + array = p->array; | |
1901 | + if (array) | |
1902 | + deactivate_task(p, task_rq(p)); | |
1903 | retval = 0; | |
1904 | p->policy = policy; | |
1905 | p->rt_priority = lp.sched_priority; | |
1906 | - if (task_on_runqueue(p)) | |
1907 | - move_first_runqueue(p); | |
1908 | - | |
1909 | - current->need_resched = 1; | |
1910 | + if (rt_task(p)) | |
1911 | + p->prio = 99 - p->rt_priority; | |
1912 | + else | |
1913 | + p->prio = p->static_prio; | |
1914 | + if (array) | |
1915 | + activate_task(p, task_rq(p)); | |
1916 | ||
1917 | out_unlock: | |
1918 | - spin_unlock(&runqueue_lock); | |
1919 | + unlock_task_rq(rq, &flags); | |
1920 | +out_unlock_tasklist: | |
1921 | read_unlock_irq(&tasklist_lock); | |
1922 | ||
1923 | out_nounlock: | |
1924 | return retval; | |
1925 | } | |
1926 | ||
1927 | -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | |
1928 | +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | |
1929 | struct sched_param *param) | |
1930 | { | |
1931 | return setscheduler(pid, policy, param); | |
1932 | @@ -981,7 +1178,7 @@ | |
1933 | ||
1934 | asmlinkage long sys_sched_getscheduler(pid_t pid) | |
1935 | { | |
1936 | - struct task_struct *p; | |
1937 | + task_t *p; | |
1938 | int retval; | |
1939 | ||
1940 | retval = -EINVAL; | |
1941 | @@ -992,7 +1189,7 @@ | |
1942 | read_lock(&tasklist_lock); | |
1943 | p = find_process_by_pid(pid); | |
1944 | if (p) | |
1945 | - retval = p->policy & ~SCHED_YIELD; | |
1946 | + retval = p->policy; | |
1947 | read_unlock(&tasklist_lock); | |
1948 | ||
1949 | out_nounlock: | |
1950 | @@ -1001,7 +1198,7 @@ | |
1951 | ||
1952 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) | |
1953 | { | |
1954 | - struct task_struct *p; | |
1955 | + task_t *p; | |
1956 | struct sched_param lp; | |
1957 | int retval; | |
1958 | ||
1959 | @@ -1032,42 +1229,64 @@ | |
1960 | ||
1961 | asmlinkage long sys_sched_yield(void) | |
1962 | { | |
1963 | + task_t *prev = current, *next; | |
1964 | + runqueue_t *rq = this_rq(); | |
1965 | + prio_array_t *array; | |
1966 | + list_t *queue; | |
1967 | + | |
1968 | + if (unlikely(prev->state != TASK_RUNNING)) { | |
1969 | + schedule(); | |
1970 | + return 0; | |
1971 | + } | |
1972 | + release_kernel_lock(prev, smp_processor_id()); | |
1973 | + prev->sleep_timestamp = jiffies; | |
1974 | /* | |
1975 | - * Trick. sched_yield() first counts the number of truly | |
1976 | - * 'pending' runnable processes, then returns if it's | |
1977 | - * only the current processes. (This test does not have | |
1978 | - * to be atomic.) In threaded applications this optimization | |
1979 | - * gets triggered quite often. | |
1980 | + * Decrease the yielding task's priority by one, to avoid | |
1981 | + * livelocks. This priority loss is temporary; it's recovered |
1982 | + * once the current timeslice expires. | |
1983 | + * | |
1984 | + * If priority is already MAX_PRIO-1 then we still | |
1985 | + * round-robin the task within the runlist. |
1986 | */ | |
1987 | + spin_lock_irq(&rq->lock); | |
1988 | + array = current->array; | |
1989 | + /* | |
1990 | + * If the task has reached maximum priority (or is a RT task) | |
1991 | + * then just requeue the task to the end of the runqueue: | |
1992 | + */ | |
1993 | + if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) { | |
1994 | + list_del(&current->run_list); |
1995 | + list_add_tail(&current->run_list, array->queue + current->prio); |
1996 | + } else { | |
1997 | + list_del(&current->run_list); |
1998 | + if (list_empty(array->queue + current->prio)) | |
1999 | + __clear_bit(current->prio, array->bitmap); | |
2000 | + current->prio++; | |
2001 | + list_add_tail(&current->run_list, array->queue + current->prio); |
2002 | + __set_bit(current->prio, array->bitmap); | |
2003 | + } | |
2004 | + /* | |
2005 | + * Context-switch manually. This is equivalent to | |
2006 | + * calling schedule(), but faster, because yield() | |
2007 | + * knows lots of things that can be optimized away | |
2008 | + * from the generic scheduler path: | |
2009 | + */ | |
2010 | + queue = array->queue + sched_find_first_bit(array->bitmap); | |
2011 | + next = list_entry(queue->next, task_t, run_list); | |
2012 | + prefetch(next); | |
2013 | ||
2014 | - int nr_pending = nr_running; | |
2015 | - | |
2016 | -#if CONFIG_SMP | |
2017 | - int i; | |
2018 | - | |
2019 | - // Subtract non-idle processes running on other CPUs. | |
2020 | - for (i = 0; i < smp_num_cpus; i++) { | |
2021 | - int cpu = cpu_logical_map(i); | |
2022 | - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) | |
2023 | - nr_pending--; | |
2024 | + prev->need_resched = 0; | |
2025 | + if (likely(prev != next)) { | |
2026 | + rq->nr_switches++; | |
2027 | + rq->curr = next; | |
2028 | + context_switch(prev, next); | |
2029 | + barrier(); | |
2030 | + rq = this_rq(); | |
2031 | } | |
2032 | -#else | |
2033 | - // on UP this process is on the runqueue as well | |
2034 | - nr_pending--; | |
2035 | -#endif | |
2036 | - if (nr_pending) { | |
2037 | - /* | |
2038 | - * This process can only be rescheduled by us, | |
2039 | - * so this is safe without any locking. | |
2040 | - */ | |
2041 | - if (current->policy == SCHED_OTHER) | |
2042 | - current->policy |= SCHED_YIELD; | |
2043 | - current->need_resched = 1; | |
2044 | + spin_unlock_irq(&rq->lock); | |
2045 | + | |
2046 | + reacquire_kernel_lock(current); | |
2047 | ||
2048 | - spin_lock_irq(&runqueue_lock); | |
2049 | - move_last_runqueue(current); | |
2050 | - spin_unlock_irq(&runqueue_lock); | |
2051 | - } | |
2052 | return 0; | |
2053 | } | |
2054 | ||
2055 | @@ -1105,7 +1324,7 @@ | |
2056 | asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) | |
2057 | { | |
2058 | struct timespec t; | |
2059 | - struct task_struct *p; | |
2060 | + task_t *p; | |
2061 | int retval = -EINVAL; | |
2062 | ||
2063 | if (pid < 0) | |
2064 | @@ -1115,8 +1334,8 @@ | |
2065 | read_lock(&tasklist_lock); | |
2066 | p = find_process_by_pid(pid); | |
2067 | if (p) | |
2068 | - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice), | |
2069 | - &t); | |
2070 | + jiffies_to_timespec(p->policy & SCHED_FIFO ? | |
2071 | + 0 : TASK_TIMESLICE(p), &t); | |
2072 | read_unlock(&tasklist_lock); | |
2073 | if (p) | |
2074 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | |
2075 | @@ -1124,14 +1343,14 @@ | |
2076 | return retval; | |
2077 | } | |
2078 | ||
2079 | -static void show_task(struct task_struct * p) | |
2080 | +static void show_task(task_t * p) | |
2081 | { | |
2082 | unsigned long free = 0; | |
2083 | int state; | |
2084 | static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; | |
2085 | ||
2086 | printk("%-13.13s ", p->comm); | |
2087 | - state = p->state ? ffz(~p->state) + 1 : 0; | |
2088 | + state = p->state ? __ffs(p->state) + 1 : 0; | |
2089 | if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) | |
2090 | printk(stat_nam[state]); | |
2091 | else | |
2092 | @@ -1172,7 +1391,7 @@ | |
2093 | printk(" (NOTLB)\n"); | |
2094 | ||
2095 | { | |
2096 | - extern void show_trace_task(struct task_struct *tsk); | |
2097 | + extern void show_trace_task(task_t *tsk); | |
2098 | show_trace_task(p); | |
2099 | } | |
2100 | } | |
2101 | @@ -1194,7 +1413,7 @@ | |
2102 | ||
2103 | void show_state(void) | |
2104 | { | |
2105 | - struct task_struct *p; | |
2106 | + task_t *p; | |
2107 | ||
2108 | #if (BITS_PER_LONG == 32) | |
2109 | printk("\n" | |
2110 | @@ -1217,121 +1436,88 @@ | |
2111 | read_unlock(&tasklist_lock); | |
2112 | } | |
2113 | ||
2114 | -/** | |
2115 | - * reparent_to_init() - Reparent the calling kernel thread to the init task. | |
2116 | - * | |
2117 | - * If a kernel thread is launched as a result of a system call, or if | |
2118 | - * it ever exits, it should generally reparent itself to init so that | |
2119 | - * it is correctly cleaned up on exit. | |
2120 | - * | |
2121 | - * The various task state such as scheduling policy and priority may have | |
2122 | - * been inherited fro a user process, so we reset them to sane values here. | |
2123 | - * | |
2124 | - * NOTE that reparent_to_init() gives the caller full capabilities. | |
2125 | - */ | |
2126 | -void reparent_to_init(void) | |
2127 | +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |
2128 | { | |
2129 | - struct task_struct *this_task = current; | |
2130 | - | |
2131 | - write_lock_irq(&tasklist_lock); | |
2132 | - | |
2133 | - /* Reparent to init */ | |
2134 | - REMOVE_LINKS(this_task); | |
2135 | - this_task->p_pptr = child_reaper; | |
2136 | - this_task->p_opptr = child_reaper; | |
2137 | - SET_LINKS(this_task); | |
2138 | - | |
2139 | - /* Set the exit signal to SIGCHLD so we signal init on exit */ | |
2140 | - this_task->exit_signal = SIGCHLD; | |
2141 | - | |
2142 | - /* We also take the runqueue_lock while altering task fields | |
2143 | - * which affect scheduling decisions */ | |
2144 | - spin_lock(&runqueue_lock); | |
2145 | - | |
2146 | - this_task->ptrace = 0; | |
2147 | - this_task->nice = DEF_NICE; | |
2148 | - this_task->policy = SCHED_OTHER; | |
2149 | - /* cpus_allowed? */ | |
2150 | - /* rt_priority? */ | |
2151 | - /* signals? */ | |
2152 | - this_task->cap_effective = CAP_INIT_EFF_SET; | |
2153 | - this_task->cap_inheritable = CAP_INIT_INH_SET; | |
2154 | - this_task->cap_permitted = CAP_FULL_SET; | |
2155 | - this_task->keep_capabilities = 0; | |
2156 | - memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); | |
2157 | - this_task->user = INIT_USER; | |
2158 | - | |
2159 | - spin_unlock(&runqueue_lock); | |
2160 | - write_unlock_irq(&tasklist_lock); | |
2161 | + if (rq1 == rq2) | |
2162 | + spin_lock(&rq1->lock); | |
2163 | + else { | |
2164 | + if (rq1 < rq2) { | |
2165 | + spin_lock(&rq1->lock); | |
2166 | + spin_lock(&rq2->lock); | |
2167 | + } else { | |
2168 | + spin_lock(&rq2->lock); | |
2169 | + spin_lock(&rq1->lock); | |
2170 | + } | |
2171 | + } | |
2172 | } | |
2173 | ||
2174 | -/* | |
2175 | - * Put all the gunge required to become a kernel thread without | |
2176 | - * attached user resources in one place where it belongs. | |
2177 | - */ | |
2178 | - | |
2179 | -void daemonize(void) | |
2180 | +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | |
2181 | { | |
2182 | - struct fs_struct *fs; | |
2183 | - | |
2184 | - | |
2185 | - /* | |
2186 | - * If we were started as result of loading a module, close all of the | |
2187 | - * user space pages. We don't need them, and if we didn't close them | |
2188 | - * they would be locked into memory. | |
2189 | - */ | |
2190 | - exit_mm(current); | |
2191 | - | |
2192 | - current->session = 1; | |
2193 | - current->pgrp = 1; | |
2194 | - current->tty = NULL; | |
2195 | - | |
2196 | - /* Become as one with the init task */ | |
2197 | - | |
2198 | - exit_fs(current); /* current->fs->count--; */ | |
2199 | - fs = init_task.fs; | |
2200 | - current->fs = fs; | |
2201 | - atomic_inc(&fs->count); | |
2202 | - exit_files(current); | |
2203 | - current->files = init_task.files; | |
2204 | - atomic_inc(&current->files->count); |
2205 | + spin_unlock(&rq1->lock); | |
2206 | + if (rq1 != rq2) | |
2207 | + spin_unlock(&rq2->lock); | |
2208 | } | |
2209 | ||
2210 | -extern unsigned long wait_init_idle; | |
2211 | - | |
2212 | -void __init init_idle(void) | |
2213 | +void __init init_idle(task_t *idle, int cpu) | |
2214 | { | |
2215 | - struct schedule_data * sched_data; | |
2216 | - sched_data = &aligned_data[smp_processor_id()].schedule_data; | |
2217 | + runqueue_t *idle_rq = cpu_rq(cpu), *rq = idle->array->rq; | |
2218 | + unsigned long flags; | |
2219 | ||
2220 | - if (current != &init_task && task_on_runqueue(current)) { | |
2221 | - printk("UGH! (%d:%d) was on the runqueue, removing.\n", | |
2222 | - smp_processor_id(), current->pid); | |
2223 | - del_from_runqueue(current); | |
2224 | - } | |
2225 | - sched_data->curr = current; | |
2226 | - sched_data->last_schedule = get_cycles(); | |
2227 | - clear_bit(current->processor, &wait_init_idle); | |
2228 | + __save_flags(flags); | |
2229 | + __cli(); | |
2230 | + double_rq_lock(idle_rq, rq); | |
2231 | + | |
2232 | + idle_rq->curr = idle_rq->idle = idle; | |
2233 | + deactivate_task(idle, rq); | |
2234 | + idle->array = NULL; | |
2235 | + idle->prio = MAX_PRIO; | |
2236 | + idle->state = TASK_RUNNING; | |
2237 | + idle->cpu = cpu; | |
2238 | + double_rq_unlock(idle_rq, rq); | |
2239 | + idle->need_resched = 1; | |
2240 | + __restore_flags(flags); | |
2241 | } | |
2242 | ||
2243 | -extern void init_timervecs (void); | |
2244 | +extern void init_timervecs(void); | |
2245 | +extern void timer_bh(void); | |
2246 | +extern void tqueue_bh(void); | |
2247 | +extern void immediate_bh(void); | |
2248 | ||
2249 | void __init sched_init(void) | |
2250 | { | |
2251 | + runqueue_t *rq; | |
2252 | + int i, j, k; | |
2253 | + | |
2254 | + for (i = 0; i < NR_CPUS; i++) { | |
2255 | + runqueue_t *rq = cpu_rq(i); | |
2256 | + prio_array_t *array; | |
2257 | + | |
2258 | + rq->active = rq->arrays + 0; | |
2259 | + rq->expired = rq->arrays + 1; | |
2260 | + spin_lock_init(&rq->lock); | |
2261 | + | |
2262 | + for (j = 0; j < 2; j++) { | |
2263 | + array = rq->arrays + j; | |
2264 | + array->rq = rq; | |
2265 | + array->lock = &rq->lock; | |
2266 | + for (k = 0; k < MAX_PRIO; k++) { | |
2267 | + INIT_LIST_HEAD(array->queue + k); | |
2268 | + __clear_bit(k, array->bitmap); | |
2269 | + } | |
2270 | + // delimiter for bitsearch | |
2271 | + __set_bit(MAX_PRIO, array->bitmap); | |
2272 | + } | |
2273 | + } | |
2274 | /* | |
2275 | * We have to do a little magic to get the first | |
2276 | * process right in SMP mode. | |
2277 | */ | |
2278 | - int cpu = smp_processor_id(); | |
2279 | - int nr; | |
2280 | - | |
2281 | - init_task.processor = cpu; | |
2282 | - | |
2283 | - for(nr = 0; nr < PIDHASH_SZ; nr++) | |
2284 | - pidhash[nr] = NULL; | |
2285 | + rq = this_rq(); | |
2286 | + rq->curr = current; | |
2287 | + rq->idle = current; | |
2288 | + wake_up_process(current); | |
2289 | ||
2290 | init_timervecs(); | |
2291 | - | |
2292 | init_bh(TIMER_BH, timer_bh); | |
2293 | init_bh(TQUEUE_BH, tqueue_bh); | |
2294 | init_bh(IMMEDIATE_BH, immediate_bh); | |
2295 | @@ -1340,5 +1526,5 @@ | |
2296 | * The boot idle thread does lazy MMU switching as well: | |
2297 | */ | |
2298 | atomic_inc(&init_mm.mm_count); | |
2299 | - enter_lazy_tlb(&init_mm, current, cpu); | |
2300 | + enter_lazy_tlb(&init_mm, current, smp_processor_id()); | |
2301 | } | |
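That completes the kernel/sched.c rewrite: each CPU now owns a runqueue with two prio_array structures (active and expired) plus a bitmap of non-empty priority levels, so schedule() finds the next task with a single sched_find_first_bit() and a list-head lookup, and swaps the two arrays when the active one drains. The sketch below models only that pick-next step in ordinary user-space C; the MAX_PRIO value, the linear find-first-bit helper, and the integer stand-ins for list heads are illustrative assumptions, not the kernel's definitions.

```c
/* Toy model of the O(1) pick-next path: a bitmap of non-empty priority
 * levels plus a per-priority "queue". Plain ISO C, not kernel code. */
#include <stdio.h>

#define MAX_PRIO        140                     /* assumed, for illustration */
#define BITS_PER_LONG_  (8 * sizeof(unsigned long))

struct prio_array {
    int nr_active;
    unsigned long bitmap[(MAX_PRIO + BITS_PER_LONG_) / BITS_PER_LONG_];
    int queue_head[MAX_PRIO];   /* stand-in for the per-priority list heads */
};

static void set_bit_(int nr, unsigned long *map)
{
    map[nr / BITS_PER_LONG_] |= 1UL << (nr % BITS_PER_LONG_);
}

/* naive linear scan; the real sched_find_first_bit() is a few instructions */
static int find_first_bit_(const unsigned long *map, int size)
{
    for (int nr = 0; nr < size; nr++)
        if (map[nr / BITS_PER_LONG_] & (1UL << (nr % BITS_PER_LONG_)))
            return nr;
    return size;
}

int main(void)
{
    struct prio_array active = { 0 };

    /* delimiter bit, like __set_bit(MAX_PRIO, array->bitmap) in sched_init() */
    set_bit_(MAX_PRIO, active.bitmap);

    /* pretend two tasks are queued, at priority levels 115 and 120 */
    set_bit_(120, active.bitmap); active.queue_head[120] = 42;
    set_bit_(115, active.bitmap); active.queue_head[115] = 7;
    active.nr_active = 2;

    /* O(1) selection: the lowest set bit is the best non-empty queue */
    int idx = find_first_bit_(active.bitmap, MAX_PRIO + 1);
    printf("next task: pid %d from priority queue %d\n",
           active.queue_head[idx], idx);
    return 0;
}
```

The delimiter bit at MAX_PRIO mirrors the __set_bit(MAX_PRIO, array->bitmap) done in sched_init() above and guarantees that the bit search always terminates, even when a runqueue is empty.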
2302 | --- linux/kernel/exit.c.orig Tue Feb 5 13:51:53 2002 | |
2303 | +++ linux/kernel/exit.c Tue Feb 5 13:52:12 2002 | |
2304 | @@ -27,49 +27,22 @@ | |
2305 | ||
2306 | static void release_task(struct task_struct * p) | |
2307 | { | |
2308 | - if (p != current) { | |
2309 | + if (p == current) | |
2310 | + BUG(); | |
2311 | #ifdef CONFIG_SMP | |
2312 | - /* | |
2313 | - * Wait to make sure the process isn't on the | |
2314 | - * runqueue (active on some other CPU still) | |
2315 | - */ | |
2316 | - for (;;) { | |
2317 | - task_lock(p); | |
2318 | - if (!task_has_cpu(p)) | |
2319 | - break; | |
2320 | - task_unlock(p); | |
2321 | - do { | |
2322 | - cpu_relax(); | |
2323 | - barrier(); | |
2324 | - } while (task_has_cpu(p)); | |
2325 | - } | |
2326 | - task_unlock(p); | |
2327 | + wait_task_inactive(p); | |
2328 | #endif | |
2329 | - atomic_dec(&p->user->processes); | |
2330 | - free_uid(p->user); | |
2331 | - unhash_process(p); | |
2332 | - | |
2333 | - release_thread(p); | |
2334 | - current->cmin_flt += p->min_flt + p->cmin_flt; | |
2335 | - current->cmaj_flt += p->maj_flt + p->cmaj_flt; | |
2336 | - current->cnswap += p->nswap + p->cnswap; | |
2337 | - /* | |
2338 | - * Potentially available timeslices are retrieved | |
2339 | - * here - this way the parent does not get penalized | |
2340 | - * for creating too many processes. | |
2341 | - * | |
2342 | - * (this cannot be used to artificially 'generate' | |
2343 | - * timeslices, because any timeslice recovered here | |
2344 | - * was given away by the parent in the first place.) | |
2345 | - */ | |
2346 | - current->counter += p->counter; | |
2347 | - if (current->counter >= MAX_COUNTER) | |
2348 | - current->counter = MAX_COUNTER; | |
2349 | - p->pid = 0; | |
2350 | - free_task_struct(p); | |
2351 | - } else { | |
2352 | - printk("task releasing itself\n"); | |
2353 | - } | |
2354 | + atomic_dec(&p->user->processes); | |
2355 | + free_uid(p->user); | |
2356 | + unhash_process(p); | |
2357 | + | |
2358 | + release_thread(p); | |
2359 | + current->cmin_flt += p->min_flt + p->cmin_flt; | |
2360 | + current->cmaj_flt += p->maj_flt + p->cmaj_flt; | |
2361 | + current->cnswap += p->nswap + p->cnswap; | |
2362 | + sched_exit(p); | |
2363 | + p->pid = 0; | |
2364 | + free_task_struct(p); | |
2365 | } | |
2366 | ||
2367 | /* | |
2368 | @@ -147,6 +120,79 @@ | |
2369 | } | |
2370 | read_unlock(&tasklist_lock); | |
2371 | return retval; | |
2372 | +} | |
2373 | + | |
2374 | +/** | |
2375 | + * reparent_to_init() - Reparent the calling kernel thread to the init task. | |
2376 | + * | |
2377 | + * If a kernel thread is launched as a result of a system call, or if | |
2378 | + * it ever exits, it should generally reparent itself to init so that | |
2379 | + * it is correctly cleaned up on exit. | |
2380 | + * | |
2381 | + * The various task state such as scheduling policy and priority may have | |
2382 | + * been inherited from a user process, so we reset them to sane values here. | |
2383 | + * | |
2384 | + * NOTE that reparent_to_init() gives the caller full capabilities. | |
2385 | + */ | |
2386 | +void reparent_to_init(void) | |
2387 | +{ | |
2388 | + write_lock_irq(&tasklist_lock); | |
2389 | + | |
2390 | + /* Reparent to init */ | |
2391 | + REMOVE_LINKS(current); | |
2392 | + current->p_pptr = child_reaper; | |
2393 | + current->p_opptr = child_reaper; | |
2394 | + SET_LINKS(current); | |
2395 | + | |
2396 | + /* Set the exit signal to SIGCHLD so we signal init on exit */ | |
2397 | + current->exit_signal = SIGCHLD; | |
2398 | + | |
2399 | + current->ptrace = 0; | |
2400 | + if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0)) | |
2401 | + set_user_nice(current, 0); | |
2402 | + /* cpus_allowed? */ | |
2403 | + /* rt_priority? */ | |
2404 | + /* signals? */ | |
2405 | + current->cap_effective = CAP_INIT_EFF_SET; | |
2406 | + current->cap_inheritable = CAP_INIT_INH_SET; | |
2407 | + current->cap_permitted = CAP_FULL_SET; | |
2408 | + current->keep_capabilities = 0; | |
2409 | + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); | |
2410 | + current->user = INIT_USER; | |
2411 | + | |
2412 | + write_unlock_irq(&tasklist_lock); | |
2413 | +} | |
2414 | + | |
2415 | +/* | |
2416 | + * Put all the gunge required to become a kernel thread without | |
2417 | + * attached user resources in one place where it belongs. | |
2418 | + */ | |
2419 | + | |
2420 | +void daemonize(void) | |
2421 | +{ | |
2422 | + struct fs_struct *fs; | |
2423 | + | |
2424 | + | |
2425 | + /* | |
2426 | + * If we were started as a result of loading a module, close all of the |
2427 | + * user space pages. We don't need them, and if we didn't close them | |
2428 | + * they would be locked into memory. | |
2429 | + */ | |
2430 | + exit_mm(current); | |
2431 | + | |
2432 | + current->session = 1; | |
2433 | + current->pgrp = 1; | |
2434 | + current->tty = NULL; | |
2435 | + | |
2436 | + /* Become as one with the init task */ | |
2437 | + | |
2438 | + exit_fs(current); /* current->fs->count--; */ | |
2439 | + fs = init_task.fs; | |
2440 | + current->fs = fs; | |
2441 | + atomic_inc(&fs->count); | |
2442 | + exit_files(current); | |
2443 | + current->files = init_task.files; | |
2444 | + atomic_inc(&current->files->count); |
2445 | } | |
2446 | ||
2447 | /* | |
2448 | --- linux/kernel/capability.c.orig Sat Jun 24 06:06:37 2000 | |
2449 | +++ linux/kernel/capability.c Tue Feb 5 13:52:12 2002 | |
2450 | @@ -8,6 +8,8 @@ | |
2451 | #include <linux/mm.h> | |
2452 | #include <asm/uaccess.h> | |
2453 | ||
2454 | +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ | |
2455 | + | |
2456 | kernel_cap_t cap_bset = CAP_INIT_EFF_SET; | |
2457 | ||
2458 | /* Note: never hold tasklist_lock while spinning for this one */ | |
2459 | --- linux/kernel/timer.c.orig Tue Feb 5 13:51:43 2002 | |
2460 | +++ linux/kernel/timer.c Tue Feb 5 13:52:12 2002 | |
2461 | @@ -25,6 +25,8 @@ | |
2462 | ||
2463 | #include <asm/uaccess.h> | |
2464 | ||
2465 | +struct kernel_stat kstat; | |
2466 | + | |
2467 | /* | |
2468 | * Timekeeping variables | |
2469 | */ | |
2470 | @@ -582,18 +584,7 @@ | |
2471 | int cpu = smp_processor_id(), system = user_tick ^ 1; | |
2472 | ||
2473 | update_one_process(p, user_tick, system, cpu); | |
2474 | - if (p->pid) { | |
2475 | - if (--p->counter <= 0) { | |
2476 | - p->counter = 0; | |
2477 | - p->need_resched = 1; | |
2478 | - } | |
2479 | - if (p->nice > 0) | |
2480 | - kstat.per_cpu_nice[cpu] += user_tick; | |
2481 | - else | |
2482 | - kstat.per_cpu_user[cpu] += user_tick; | |
2483 | - kstat.per_cpu_system[cpu] += system; | |
2484 | - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) | |
2485 | - kstat.per_cpu_system[cpu] += system; | |
2486 | + scheduler_tick(user_tick, system); | |
2487 | } | |
2488 | ||
2489 | /* | |
2490 | @@ -794,6 +785,89 @@ | |
2491 | ||
2492 | #endif | |
2493 | ||
2494 | +static void process_timeout(unsigned long __data) | |
2495 | +{ | |
2496 | + wake_up_process((task_t *)__data); | |
2497 | +} | |
2498 | + | |
2499 | +/** | |
2500 | + * schedule_timeout - sleep until timeout | |
2501 | + * @timeout: timeout value in jiffies | |
2502 | + * | |
2503 | + * Make the current task sleep until @timeout jiffies have | |
2504 | + * elapsed. The routine will return immediately unless | |
2505 | + * the current task state has been set (see set_current_state()). | |
2506 | + * | |
2507 | + * You can set the task state as follows - | |
2508 | + * | |
2509 | + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to | |
2510 | + * pass before the routine returns. The routine will return 0 | |
2511 | + * | |
2512 | + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | |
2513 | + * delivered to the current task. In this case the remaining time | |
2514 | + * in jiffies will be returned, or 0 if the timer expired in time | |
2515 | + * | |
2516 | + * The current task state is guaranteed to be TASK_RUNNING when this | |
2517 | + * routine returns. | |
2518 | + * | |
2519 | + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule | |
2520 | + * the CPU away without a bound on the timeout. In this case the return | |
2521 | + * value will be %MAX_SCHEDULE_TIMEOUT. | |
2522 | + * | |
2523 | + * In all cases the return value is guaranteed to be non-negative. | |
2524 | + */ | |
2525 | +signed long schedule_timeout(signed long timeout) | |
2526 | +{ | |
2527 | + struct timer_list timer; | |
2528 | + unsigned long expire; | |
2529 | + | |
2530 | + switch (timeout) | |
2531 | + { | |
2532 | + case MAX_SCHEDULE_TIMEOUT: | |
2533 | + /* | |
2534 | + * These two special cases are useful to keep the caller |
2535 | + * comfortable. Nothing more. We could take |
2536 | + * MAX_SCHEDULE_TIMEOUT from one of the negative values, |
2537 | + * but I'd like to return a valid offset (>= 0) to allow |
2538 | + * the caller to do everything it wants with the retval. |
2539 | + */ | |
2540 | + schedule(); | |
2541 | + goto out; | |
2542 | + default: | |
2543 | + /* | |
2544 | + * Another bit of paranoia. Note that the retval will be |
2545 | + * 0, since no piece of the kernel is supposed to check |
2546 | + * for a negative retval of schedule_timeout() (since it |
2547 | + * should never happen anyway). You just have the printk() |
2548 | + * that will tell you if something has gone wrong, and where. |
2549 | + */ | |
2550 | + if (timeout < 0) | |
2551 | + { | |
2552 | + printk(KERN_ERR "schedule_timeout: wrong timeout " | |
2553 | + "value %lx from %p\n", timeout, | |
2554 | + __builtin_return_address(0)); | |
2555 | + current->state = TASK_RUNNING; | |
2556 | + goto out; | |
2557 | + } | |
2558 | + } | |
2559 | + | |
2560 | + expire = timeout + jiffies; | |
2561 | + | |
2562 | + init_timer(&timer); | |
2563 | + timer.expires = expire; | |
2564 | + timer.data = (unsigned long) current; | |
2565 | + timer.function = process_timeout; | |
2566 | + | |
2567 | + add_timer(&timer); | |
2568 | + schedule(); | |
2569 | + del_timer_sync(&timer); | |
2570 | + | |
2571 | + timeout = expire - jiffies; | |
2572 | + | |
2573 | + out: | |
2574 | + return timeout < 0 ? 0 : timeout; | |
2575 | +} | |
2576 | + | |
2577 | /* Thread ID - the internal kernel "pid" */ | |
2578 | asmlinkage long sys_gettid(void) | |
2579 | { | |
2580 | @@ -840,4 +914,3 @@ | |
2581 | } | |
2582 | return 0; | |
2583 | } | |
2584 | - | |
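The timer.c hunk above adds the kernel_stat definition and a schedule_timeout() implementation whose kerneldoc spells out the calling convention: set the task state first, call the function, and treat the return value as the jiffies left over. The fragment below is a usage sketch only; it assumes kernel context rather than being standalone, runnable code, and handle_early_wakeup() and handle_timeout() are hypothetical placeholders, not functions from this patch.

```c
signed long remaining;

/* sleep interruptibly for up to one second (HZ jiffies) */
set_current_state(TASK_INTERRUPTIBLE);
remaining = schedule_timeout(HZ);

if (remaining)
    handle_early_wakeup(remaining); /* hypothetical: a signal or an explicit
                                       wake-up ended the sleep early */
else
    handle_timeout();               /* hypothetical: the full timeout elapsed;
                                       we are back in TASK_RUNNING either way */
```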
2585 | --- linux/kernel/fork.c.orig Tue Feb 5 13:51:53 2002 | |
2586 | +++ linux/kernel/fork.c Tue Feb 5 13:52:12 2002 | |
2587 | @@ -28,7 +28,6 @@ | |
2588 | ||
2589 | /* The idle threads do not count.. */ | |
2590 | int nr_threads; | |
2591 | -int nr_running; | |
2592 | ||
2593 | int max_threads; | |
2594 | unsigned long total_forks; /* Handle normal Linux uptimes. */ | |
2595 | @@ -36,6 +35,8 @@ | |
2596 | ||
2597 | struct task_struct *pidhash[PIDHASH_SZ]; | |
2598 | ||
2599 | +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ | |
2600 | + | |
2601 | void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) | |
2602 | { | |
2603 | unsigned long flags; | |
2604 | @@ -564,6 +565,7 @@ | |
2605 | struct pt_regs *regs, unsigned long stack_size) | |
2606 | { | |
2607 | int retval; | |
2608 | + unsigned long flags; | |
2609 | struct task_struct *p; | |
2610 | struct completion vfork; | |
2611 | ||
2612 | @@ -619,8 +621,7 @@ | |
2613 | copy_flags(clone_flags, p); | |
2614 | p->pid = get_pid(clone_flags); | |
2615 | ||
2616 | - p->run_list.next = NULL; | |
2617 | - p->run_list.prev = NULL; | |
2618 | + INIT_LIST_HEAD(&p->run_list); | |
2619 | ||
2620 | p->p_cptr = NULL; | |
2621 | init_waitqueue_head(&p->wait_chldexit); | |
2622 | @@ -646,14 +647,15 @@ | |
2623 | #ifdef CONFIG_SMP | |
2624 | { | |
2625 | int i; | |
2626 | - p->cpus_runnable = ~0UL; | |
2627 | - p->processor = current->processor; | |
2628 | + | |
2629 | /* ?? should we just memset this ?? */ | |
2630 | for(i = 0; i < smp_num_cpus; i++) | |
2631 | - p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; | |
2632 | + p->per_cpu_utime[cpu_logical_map(i)] = | |
2633 | + p->per_cpu_stime[cpu_logical_map(i)] = 0; | |
2634 | spin_lock_init(&p->sigmask_lock); | |
2635 | } | |
2636 | #endif | |
2637 | + p->array = NULL; | |
2638 | p->lock_depth = -1; /* -1 = no lock */ | |
2639 | p->start_time = jiffies; | |
2640 | ||
2641 | @@ -685,15 +687,27 @@ | |
2642 | p->pdeath_signal = 0; | |
2643 | ||
2644 | /* | |
2645 | - * "share" dynamic priority between parent and child, thus the | |
2646 | - * total amount of dynamic priorities in the system doesnt change, | |
2647 | - * more scheduling fairness. This is only important in the first | |
2648 | - * timeslice, on the long run the scheduling behaviour is unchanged. | |
2649 | - */ | |
2650 | - p->counter = (current->counter + 1) >> 1; | |
2651 | - current->counter >>= 1; | |
2652 | - if (!current->counter) | |
2653 | - current->need_resched = 1; | |
2654 | + * Share the timeslice between parent and child, thus the | |
2655 | + * total amount of pending timeslices in the system doesn't change, |
2656 | + * resulting in more scheduling fairness. | |
2657 | + */ | |
2658 | + __save_flags(flags); | |
2659 | + __cli(); | |
2660 | + if (!current->time_slice) | |
2661 | + BUG(); | |
2662 | + p->time_slice = (current->time_slice + 1) >> 1; | |
2663 | + current->time_slice >>= 1; | |
2664 | + if (!current->time_slice) { | |
2665 | + /* | |
2666 | + * This case is rare; it happens when the parent has only |
2667 | + * a single jiffy left from its timeslice. Taking the | |
2668 | + * runqueue lock is not a problem. | |
2669 | + */ | |
2670 | + current->time_slice = 1; | |
2671 | + scheduler_tick(0,0); | |
2672 | + } | |
2673 | + p->sleep_timestamp = jiffies; | |
2674 | + __restore_flags(flags); | |
2675 | ||
2676 | /* | |
2677 | * Ok, add it to the run-queues and make it | |
2678 | @@ -730,10 +744,23 @@ | |
2679 | if (p->ptrace & PT_PTRACED) | |
2680 | send_sig(SIGSTOP, p, 1); | |
2681 | ||
2682 | +#define RUN_CHILD_FIRST 1 | |
2683 | +#if RUN_CHILD_FIRST | |
2684 | + wake_up_forked_process(p); /* do this last */ | |
2685 | +#else | |
2686 | wake_up_process(p); /* do this last */ | |
2687 | +#endif | |
2688 | ++total_forks; | |
2689 | if (clone_flags & CLONE_VFORK) | |
2690 | wait_for_completion(&vfork); | |
2691 | +#if RUN_CHILD_FIRST | |
2692 | + else | |
2693 | + /* | |
2694 | + * Let the child process run first, to avoid most of the | |
2695 | + * COW overhead when the child exec()s afterwards. | |
2696 | + */ | |
2697 | + current->need_resched = 1; | |
2698 | +#endif | |
2699 | ||
2700 | fork_out: | |
2701 | return retval; | |
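The fork.c hunk above replaces the old counter sharing with a timeslice split: the child gets the rounded-up half of the parent's remaining slice and the parent keeps the rounded-down half, so forking never creates timeslice out of thin air. A quick arithmetic check of that split in plain C, with made-up starting values:

```c
/* Worked example of the do_fork() timeslice split shown above. */
#include <stdio.h>

int main(void)
{
    unsigned int before[] = { 10, 7, 1 };       /* parent's remaining jiffies */

    for (int i = 0; i < 3; i++) {
        unsigned int parent = before[i];
        unsigned int child  = (parent + 1) >> 1;    /* rounded-up half */
        parent >>= 1;                               /* rounded-down half */
        printf("before=%2u  child=%2u  parent=%2u  sum=%2u\n",
               before[i], child, parent, child + parent);
    }
    return 0;
}
```

The last row is the rare case the comment calls out: with a single jiffy left the parent would drop to zero, so the patch sets its slice back to 1 and immediately accounts for it with scheduler_tick(0,0).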
2702 | --- linux/kernel/softirq.c.orig Tue Feb 5 13:51:47 2002 | |
2703 | +++ linux/kernel/softirq.c Tue Feb 5 13:52:12 2002 | |
2704 | @@ -259,10 +259,9 @@ | |
2705 | ||
2706 | while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { | |
2707 | current->state = TASK_RUNNING; | |
2708 | - do { | |
2709 | - current->policy |= SCHED_YIELD; | |
2710 | - schedule(); | |
2711 | - } while (test_bit(TASKLET_STATE_SCHED, &t->state)); | |
2712 | + do | |
2713 | + sys_sched_yield(); | |
2714 | + while (test_bit(TASKLET_STATE_SCHED, &t->state)); | |
2715 | } | |
2716 | tasklet_unlock_wait(t); | |
2717 | clear_bit(TASKLET_STATE_SCHED, &t->state); | |
2718 | @@ -365,13 +364,13 @@ | |
2719 | int cpu = cpu_logical_map(bind_cpu); | |
2720 | ||
2721 | daemonize(); | |
2722 | - current->nice = 19; | |
2723 | + set_user_nice(current, 19); | |
2724 | sigfillset(¤t->blocked); | |
2725 | ||
2726 | /* Migrate to the right CPU */ | |
2727 | - current->cpus_allowed = 1UL << cpu; | |
2728 | - while (smp_processor_id() != cpu) | |
2729 | - schedule(); | |
2730 | + set_cpus_allowed(current, 1UL << cpu); | |
2731 | + if (cpu() != cpu) | |
2732 | + BUG(); | |
2733 | ||
2734 | sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu); | |
2735 | ||
2736 | @@ -396,7 +395,7 @@ | |
2737 | } | |
2738 | } | |
2739 | ||
2740 | -static __init int spawn_ksoftirqd(void) | |
2741 | +__init int spawn_ksoftirqd(void) | |
2742 | { | |
2743 | int cpu; | |
2744 | ||
2745 | @@ -405,14 +404,12 @@ | |
2746 | CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0) | |
2747 | printk("spawn_ksoftirqd() failed for cpu %d\n", cpu); | |
2748 | else { | |
2749 | - while (!ksoftirqd_task(cpu_logical_map(cpu))) { | |
2750 | - current->policy |= SCHED_YIELD; | |
2751 | - schedule(); | |
2752 | - } | |
2753 | + while (!ksoftirqd_task(cpu_logical_map(cpu))) | |
2754 | + sys_sched_yield(); | |
2755 | } | |
2756 | } | |
2757 | ||
2758 | return 0; | |
2759 | } | |
2760 | ||
2761 | -__initcall(spawn_ksoftirqd); | |
2762 | +__initcall(spawn_ksoftirqd); | |
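With this softirq.c change, ksoftirqd binds itself to its CPU through set_cpus_allowed() instead of looping on schedule() until it happens to land there. The sketch below models only the mask handling that set_cpus_allowed() performs in the sched.c hunk earlier (intersect with the online map, refuse an empty result, migrate to the lowest allowed CPU); it is plain user-space C, and the online map, CPU numbers and error handling are illustrative assumptions.

```c
/* Stand-alone model of the cpus_allowed mask handling. Not kernel code. */
#include <stdio.h>

static int lowest_set_bit(unsigned long mask)   /* stand-in for __ffs() */
{
    int bit = 0;

    while (!(mask & 1UL)) {
        mask >>= 1;
        bit++;
    }
    return bit;
}

int main(void)
{
    unsigned long cpu_online_map = 0x0fUL;  /* pretend CPUs 0-3 are online */
    unsigned long new_mask = 1UL << 0;      /* bind to CPU 0, like ksoftirqd_CPU0 */
    int this_cpu = 2;                       /* pretend we currently run on CPU 2 */

    new_mask &= cpu_online_map;
    if (!new_mask) {                        /* the patch BUG()s on an empty mask */
        fprintf(stderr, "empty affinity mask\n");
        return 1;
    }

    if (new_mask & (1UL << this_cpu))
        printf("CPU %d is still allowed, nothing to do\n", this_cpu);
    else
        printf("migrate from CPU %d to CPU %d\n",
               this_cpu, lowest_set_bit(new_mask));
    return 0;
}
```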
2763 | --- linux/kernel/ptrace.c.orig Tue Feb 5 13:51:53 2002 | |
2764 | +++ linux/kernel/ptrace.c Tue Feb 5 13:52:12 2002 | |
2765 | @@ -31,20 +31,7 @@ | |
2766 | if (child->state != TASK_STOPPED) | |
2767 | return -ESRCH; | |
2768 | #ifdef CONFIG_SMP | |
2769 | - /* Make sure the child gets off its CPU.. */ | |
2770 | - for (;;) { | |
2771 | - task_lock(child); | |
2772 | - if (!task_has_cpu(child)) | |
2773 | - break; | |
2774 | - task_unlock(child); | |
2775 | - do { | |
2776 | - if (child->state != TASK_STOPPED) | |
2777 | - return -ESRCH; | |
2778 | - barrier(); | |
2779 | - cpu_relax(); | |
2780 | - } while (task_has_cpu(child)); | |
2781 | - } | |
2782 | - task_unlock(child); | |
2783 | + wait_task_inactive(child); | |
2784 | #endif | |
2785 | } | |
2786 | ||
2787 | --- linux/kernel/sys.c.orig Tue Feb 5 13:51:53 2002 | |
2788 | +++ linux/kernel/sys.c Tue Feb 5 13:52:12 2002 | |
2789 | @@ -220,10 +220,10 @@ | |
2790 | } | |
2791 | if (error == -ESRCH) | |
2792 | error = 0; | |
2793 | - if (niceval < p->nice && !capable(CAP_SYS_NICE)) | |
2794 | + if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) | |
2795 | error = -EACCES; | |
2796 | else | |
2797 | - p->nice = niceval; | |
2798 | + set_user_nice(p, niceval); | |
2799 | } | |
2800 | read_unlock(&tasklist_lock); | |
2801 | ||
2802 | @@ -249,7 +249,7 @@ | |
2803 | long niceval; | |
2804 | if (!proc_sel(p, which, who)) | |
2805 | continue; | |
2806 | - niceval = 20 - p->nice; | |
2807 | + niceval = 20 - task_nice(p); | |
2808 | if (niceval > retval) | |
2809 | retval = niceval; | |
2810 | } | |
2811 | --- linux/kernel/signal.c.orig Tue Feb 5 13:51:49 2002 | |
2812 | +++ linux/kernel/signal.c Tue Feb 5 13:52:12 2002 | |
2813 | @@ -478,12 +478,9 @@ | |
2814 | * process of changing - but no harm is done by that | |
2815 | * other than doing an extra (lightweight) IPI interrupt. | |
2816 | */ | |
2817 | - spin_lock(&runqueue_lock); | |
2818 | - if (task_has_cpu(t) && t->processor != smp_processor_id()) | |
2819 | - smp_send_reschedule(t->processor); | |
2820 | - spin_unlock(&runqueue_lock); | |
2821 | -#endif /* CONFIG_SMP */ | |
2822 | - | |
2823 | + if ((t->state == TASK_RUNNING) && (t->cpu != cpu())) | |
2824 | + kick_if_running(t); | |
2825 | +#endif | |
2826 | if (t->state & TASK_INTERRUPTIBLE) { | |
2827 | wake_up_process(t); | |
2828 | return; | |
2829 | --- linux/kernel/printk.c.orig Tue Feb 5 13:51:53 2002 | |
2830 | +++ linux/kernel/printk.c Tue Feb 5 13:52:12 2002 | |
2831 | @@ -26,6 +26,7 @@ | |
2832 | #include <linux/module.h> | |
2833 | #include <linux/interrupt.h> /* For in_interrupt() */ | |
2834 | #include <linux/config.h> | |
2835 | +#include <linux/delay.h> | |
2836 | ||
2837 | #include <asm/uaccess.h> | |
2838 | ||
2839 | --- linux/kernel/ksyms.c.orig Tue Feb 5 13:51:53 2002 | |
2840 | +++ linux/kernel/ksyms.c Tue Feb 5 13:52:12 2002 | |
2841 | @@ -437,6 +437,9 @@ | |
2842 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | |
2843 | EXPORT_SYMBOL(schedule); | |
2844 | EXPORT_SYMBOL(schedule_timeout); | |
2845 | +EXPORT_SYMBOL(sys_sched_yield); | |
2846 | +EXPORT_SYMBOL(set_user_nice); | |
2847 | +EXPORT_SYMBOL(set_cpus_allowed); | |
2848 | EXPORT_SYMBOL(jiffies); | |
2849 | EXPORT_SYMBOL(xtime); | |
2850 | EXPORT_SYMBOL(do_gettimeofday); | |
2851 | @@ -448,6 +451,7 @@ | |
2852 | ||
2853 | EXPORT_SYMBOL(kstat); | |
2854 | EXPORT_SYMBOL(nr_running); | |
2855 | +EXPORT_SYMBOL(nr_context_switches); | |
2856 | ||
2857 | /* misc */ | |
2858 | EXPORT_SYMBOL(panic); | |
2859 | --- linux/mm/oom_kill.c.orig Tue Feb 5 13:51:47 2002 | |
2860 | +++ linux/mm/oom_kill.c Tue Feb 5 13:52:12 2002 | |
2861 | @@ -82,7 +82,7 @@ | |
2862 | * Niced processes are most likely less important, so double | |
2863 | * their badness points. | |
2864 | */ | |
2865 | - if (p->nice > 0) | |
2866 | + if (task_nice(p) > 0) | |
2867 | points *= 2; | |
2868 | ||
2869 | /* | |
2870 | @@ -149,7 +149,7 @@ | |
2871 | * all the memory it needs. That way it should be able to | |
2872 | * exit() and clear out its resources quickly... | |
2873 | */ | |
2874 | - p->counter = 5 * HZ; | |
2875 | + p->time_slice = HZ; | |
2876 | p->flags |= PF_MEMALLOC | PF_MEMDIE; | |
2877 | ||
2878 | /* This process has hardware access, be more careful. */ | |
2879 | @@ -188,8 +188,7 @@ | |
2880 | * killing itself before someone else gets the chance to ask | |
2881 | * for more memory. | |
2882 | */ | |
2883 | - current->policy |= SCHED_YIELD; | |
2884 | - schedule(); | |
2885 | + yield(); | |
2886 | return; | |
2887 | } | |
2888 | ||
2889 | --- linux/mm/page_alloc.c.orig Tue Feb 5 13:51:53 2002 | |
2890 | +++ linux/mm/page_alloc.c Tue Feb 5 13:52:12 2002 | |
2891 | @@ -400,9 +400,8 @@ | |
2892 | return NULL; | |
2893 | ||
2894 | /* Yield for kswapd, and try again */ | |
2895 | - current->policy |= SCHED_YIELD; | |
2896 | __set_current_state(TASK_RUNNING); | |
2897 | - schedule(); | |
2898 | + yield(); | |
2899 | goto rebalance; | |
2900 | } | |
2901 | ||
2902 | --- linux/mm/highmem.c.orig Tue Feb 5 13:51:51 2002 | |
2903 | +++ linux/mm/highmem.c Tue Feb 5 13:52:12 2002 | |
2904 | @@ -354,9 +354,8 @@ | |
2905 | /* we need to wait I/O completion */ | |
2906 | run_task_queue(&tq_disk); | |
2907 | ||
2908 | - current->policy |= SCHED_YIELD; | |
2909 | __set_current_state(TASK_RUNNING); | |
2910 | - schedule(); | |
2911 | + yield(); | |
2912 | goto repeat_alloc; | |
2913 | } | |
2914 | ||
2915 | @@ -392,9 +391,8 @@ | |
2916 | /* we need to wait I/O completion */ | |
2917 | run_task_queue(&tq_disk); | |
2918 | ||
2919 | - current->policy |= SCHED_YIELD; | |
2920 | __set_current_state(TASK_RUNNING); | |
2921 | - schedule(); | |
2922 | + yield(); | |
2923 | goto repeat_alloc; | |
2924 | } | |
2925 | ||
2926 | --- linux/include/linux/sched.h.orig Tue Feb 5 13:51:51 2002 | |
2927 | +++ linux/include/linux/sched.h Tue Feb 5 13:52:12 2002 | |
2928 | @@ -6,6 +6,7 @@ | |
2929 | extern unsigned long event; | |
2930 | ||
2931 | #include <linux/config.h> | |
2932 | +#include <linux/compiler.h> | |
2933 | #include <linux/binfmts.h> | |
2934 | #include <linux/threads.h> | |
2935 | #include <linux/kernel.h> | |
2936 | @@ -42,6 +43,7 @@ | |
2937 | #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ | |
2938 | #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ | |
2939 | #define CLONE_THREAD 0x00010000 /* Same thread group? */ | |
2940 | +#define CLONE_NEWNS 0x00020000 /* New namespace group? */ | |
2941 | ||
2942 | #define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) | |
2943 | ||
2944 | @@ -72,8 +74,9 @@ | |
2945 | #define CT_TO_SECS(x) ((x) / HZ) | |
2946 | #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) | |
2947 | ||
2948 | -extern int nr_running, nr_threads; | |
2949 | +extern int nr_threads; | |
2950 | extern int last_pid; | |
2951 | +extern unsigned long nr_running(void); | |
2952 | ||
2953 | #include <linux/fs.h> | |
2954 | #include <linux/time.h> | |
2955 | @@ -116,12 +119,6 @@ | |
2956 | #define SCHED_FIFO 1 | |
2957 | #define SCHED_RR 2 | |
2958 | ||
2959 | -/* | |
2960 | - * This is an additional bit set when we want to | |
2961 | - * yield the CPU for one re-schedule.. | |
2962 | - */ | |
2963 | -#define SCHED_YIELD 0x10 | |
2964 | - | |
2965 | struct sched_param { | |
2966 | int sched_priority; | |
2967 | }; | |
2968 | @@ -139,17 +136,22 @@ | |
2969 | * a separate lock). | |
2970 | */ | |
2971 | extern rwlock_t tasklist_lock; | |
2972 | -extern spinlock_t runqueue_lock; | |
2973 | extern spinlock_t mmlist_lock; | |
2974 | ||
2975 | +typedef struct task_struct task_t; | |
2976 | + | |
2977 | extern void sched_init(void); | |
2978 | -extern void init_idle(void); | |
2979 | +extern void init_idle(task_t *idle, int cpu); | |
2980 | extern void show_state(void); | |
2981 | extern void cpu_init (void); | |
2982 | extern void trap_init(void); | |
2983 | extern void update_process_times(int user); | |
2984 | -extern void update_one_process(struct task_struct *p, unsigned long user, | |
2985 | +extern void update_one_process(task_t *p, unsigned long user, | |
2986 | unsigned long system, int cpu); | |
2987 | +extern void scheduler_tick(int user_tick, int system); | |
2988 | +extern void sched_task_migrated(task_t *p); | |
2989 | +extern void smp_migrate_task(int cpu, task_t *task); | |
2990 | +extern unsigned long cache_decay_ticks; | |
2991 | ||
2992 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX | |
2993 | extern signed long FASTCALL(schedule_timeout(signed long timeout)); | |
2994 | @@ -166,6 +168,7 @@ | |
2995 | */ | |
2996 | #define NR_OPEN_DEFAULT BITS_PER_LONG | |
2997 | ||
2998 | +struct namespace; | |
2999 | /* | |
3000 | * Open file table structure | |
3001 | */ | |
3002 | @@ -278,6 +281,8 @@ | |
3003 | extern struct user_struct root_user; | |
3004 | #define INIT_USER (&root_user) | |
3005 | ||
3006 | +typedef struct prio_array prio_array_t; | |
3007 | + | |
3008 | struct task_struct { | |
3009 | /* | |
3010 | * offsets of these are hardcoded elsewhere - touch with care | |
3011 | @@ -295,35 +300,26 @@ | |
3012 | ||
3013 | int lock_depth; /* Lock depth */ | |
3014 | ||
3015 | -/* | |
3016 | - * offset 32 begins here on 32-bit platforms. We keep | |
3017 | - * all fields in a single cacheline that are needed for | |
3018 | - * the goodness() loop in schedule(). | |
3019 | - */ | |
3020 | - long counter; | |
3021 | - long nice; | |
3022 | - unsigned long policy; | |
3023 | - struct mm_struct *mm; | |
3024 | - int processor; | |
3025 | /* | |
3026 | - * cpus_runnable is ~0 if the process is not running on any | |
3027 | - * CPU. It's (1 << cpu) if it's running on a CPU. This mask | |
3028 | - * is updated under the runqueue lock. | |
3029 | - * | |
3030 | - * To determine whether a process might run on a CPU, this | |
3031 | - * mask is AND-ed with cpus_allowed. | |
3032 | + * offset 32 begins here on 32-bit platforms. | |
3033 | */ | |
3034 | - unsigned long cpus_runnable, cpus_allowed; | |
3035 | - /* | |
3036 | - * (only the 'next' pointer fits into the cacheline, but | |
3037 | - * that's just fine.) | |
3038 | - */ | |
3039 | - struct list_head run_list; | |
3040 | - unsigned long sleep_time; | |
3041 | + unsigned int cpu; | |
3042 | + int prio, static_prio; | |
3043 | + list_t run_list; | |
3044 | + prio_array_t *array; | |
3045 | + | |
3046 | + unsigned long sleep_avg; | |
3047 | + unsigned long sleep_timestamp; | |
3048 | + | |
3049 | + unsigned long policy; | |
3050 | + unsigned long cpus_allowed; | |
3051 | + unsigned int time_slice; | |
3052 | + | |
3053 | + task_t *next_task, *prev_task; | |
3054 | ||
3055 | - struct task_struct *next_task, *prev_task; | |
3056 | - struct mm_struct *active_mm; | |
3057 | + struct mm_struct *mm, *active_mm; | |
3058 | struct list_head local_pages; | |
3059 | + | |
3060 | unsigned int allocation_order, nr_local_pages; | |
3061 | ||
3062 | /* task state */ | |
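The rearranged scheduling fields replace the old goodness()-loop cacheline: each task now carries a dynamic priority (prio), a static priority derived from nice (static_prio), its remaining time_slice, and a pointer to the priority array it is queued in. The prio_array itself lives in kernel/sched.c, which this excerpt does not show, so the sketch below is an assumption about its layout (field names are not quoted from the patch), included only to make the new fields easier to read:

    /*
     * Sketch only: the per-runqueue priority array that the 'array'
     * pointer above refers to.  Names are assumed, not quoted.
     */
    #define MAX_PRIO        140             /* 0..99 RT, 100..139 nice levels   */

    struct prio_array {
            int             nr_active;      /* tasks currently queued here      */
            unsigned long   bitmap[5];      /* 160 bits; bit N => queue[N] used */
            list_t          queue[MAX_PRIO];/* one FIFO list per priority       */
    };

Picking the next task is then O(1): find the first set bit in the bitmap (sched_find_first_bit(), added further down in this patch) and dequeue the head of that list.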
3063 | @@ -345,12 +341,12 @@ | |
3064 | * older sibling, respectively. (p->father can be replaced with | |
3065 | * p->p_pptr->pid) | |
3066 | */ | |
3067 | - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; | |
3068 | + task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; | |
3069 | struct list_head thread_group; | |
3070 | ||
3071 | /* PID hash table linkage. */ | |
3072 | - struct task_struct *pidhash_next; | |
3073 | - struct task_struct **pidhash_pprev; | |
3074 | + task_t *pidhash_next; | |
3075 | + task_t **pidhash_pprev; | |
3076 | ||
3077 | wait_queue_head_t wait_chldexit; /* for wait4() */ | |
3078 | struct completion *vfork_done; /* for vfork() */ | |
3079 | @@ -389,6 +385,8 @@ | |
3080 | struct fs_struct *fs; | |
3081 | /* open file information */ | |
3082 | struct files_struct *files; | |
3083 | +/* namespace */ | |
3084 | + struct namespace *namespace; | |
3085 | /* signal handlers */ | |
3086 | spinlock_t sigmask_lock; /* Protects signal and blocked */ | |
3087 | struct signal_struct *sig; | |
3088 | @@ -446,10 +444,13 @@ | |
3089 | */ | |
3090 | #define _STK_LIM (8*1024*1024) | |
3091 | ||
3092 | -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ | |
3093 | -#define MAX_COUNTER (20*HZ/100) | |
3094 | -#define DEF_NICE (0) | |
3095 | +extern void set_cpus_allowed(task_t *p, unsigned long new_mask); | |
3096 | +extern void set_user_nice(task_t *p, long nice); | |
3097 | +extern int task_prio(task_t *p); | |
3098 | +extern int task_nice(task_t *p); | |
3099 | ||
3100 | +asmlinkage long sys_sched_yield(void); | |
3101 | +#define yield() sys_sched_yield() | |
3102 | ||
3103 | /* | |
3104 | * The default (Linux) execution domain. | |
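With SCHED_YIELD gone, the open-coded "raise SCHED_YIELD, then schedule()" sequences seen throughout the rest of this patch collapse into the new yield() helper, which simply invokes sys_sched_yield(). A minimal sketch of the converted idiom, mirroring the ataraid.c hunks further down:

    /* Sketch: give up the timeslice while waiting for memory instead of
     * spinning or poking at current->policy directly. */
    struct buffer_head *bh;

    while (!(bh = kmalloc(sizeof(*bh), GFP_NOIO)))
            yield();                /* expands to sys_sched_yield() */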
3105 | @@ -468,14 +469,14 @@ | |
3106 | addr_limit: KERNEL_DS, \ | |
3107 | exec_domain: &default_exec_domain, \ | |
3108 | lock_depth: -1, \ | |
3109 | - counter: DEF_COUNTER, \ | |
3110 | - nice: DEF_NICE, \ | |
3111 | + prio: 120, \ | |
3112 | + static_prio: 120, \ | |
3113 | policy: SCHED_OTHER, \ | |
3114 | + cpus_allowed: -1, \ | |
3115 | mm: NULL, \ | |
3116 | active_mm: &init_mm, \ | |
3117 | - cpus_runnable: -1, \ | |
3118 | - cpus_allowed: -1, \ | |
3119 | run_list: LIST_HEAD_INIT(tsk.run_list), \ | |
3120 | + time_slice: HZ, \ | |
3121 | next_task: &tsk, \ | |
3122 | prev_task: &tsk, \ | |
3123 | p_opptr: &tsk, \ | |
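The initial task now starts at prio and static_prio 120 with a full HZ worth of time_slice, replacing counter/nice. On the 140-level scale sketched earlier, 120 is the middle of the nice band, i.e. nice 0. The actual conversion macros live in kernel/sched.c and are not shown here; a hedged sketch consistent with these values:

    /* Assumed mapping (consistent with static_prio == 120 meaning nice 0): */
    #define MAX_RT_PRIO             100                          /* prios 0..99 are RT      */
    #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)  /* -20..19  -> 100..139    */
    #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)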
3124 | @@ -509,24 +510,24 @@ | |
3125 | #endif | |
3126 | ||
3127 | union task_union { | |
3128 | - struct task_struct task; | |
3129 | + task_t task; | |
3130 | unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; | |
3131 | }; | |
3132 | ||
3133 | extern union task_union init_task_union; | |
3134 | ||
3135 | extern struct mm_struct init_mm; | |
3136 | -extern struct task_struct *init_tasks[NR_CPUS]; | |
3137 | +extern task_t *init_tasks[NR_CPUS]; | |
3138 | ||
3139 | /* PID hashing. (shouldnt this be dynamic?) */ | |
3140 | #define PIDHASH_SZ (4096 >> 2) | |
3141 | -extern struct task_struct *pidhash[PIDHASH_SZ]; | |
3142 | +extern task_t *pidhash[PIDHASH_SZ]; | |
3143 | ||
3144 | #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) | |
3145 | ||
3146 | -static inline void hash_pid(struct task_struct *p) | |
3147 | +static inline void hash_pid(task_t *p) | |
3148 | { | |
3149 | - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; | |
3150 | + task_t **htable = &pidhash[pid_hashfn(p->pid)]; | |
3151 | ||
3152 | if((p->pidhash_next = *htable) != NULL) | |
3153 | (*htable)->pidhash_pprev = &p->pidhash_next; | |
3154 | @@ -534,16 +535,16 @@ | |
3155 | p->pidhash_pprev = htable; | |
3156 | } | |
3157 | ||
3158 | -static inline void unhash_pid(struct task_struct *p) | |
3159 | +static inline void unhash_pid(task_t *p) | |
3160 | { | |
3161 | if(p->pidhash_next) | |
3162 | p->pidhash_next->pidhash_pprev = p->pidhash_pprev; | |
3163 | *p->pidhash_pprev = p->pidhash_next; | |
3164 | } | |
3165 | ||
3166 | -static inline struct task_struct *find_task_by_pid(int pid) | |
3167 | +static inline task_t *find_task_by_pid(int pid) | |
3168 | { | |
3169 | - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; | |
3170 | + task_t *p, **htable = &pidhash[pid_hashfn(pid)]; | |
3171 | ||
3172 | for(p = *htable; p && p->pid != pid; p = p->pidhash_next) | |
3173 | ; | |
3174 | @@ -551,19 +552,6 @@ | |
3175 | return p; | |
3176 | } | |
3177 | ||
3178 | -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) | |
3179 | - | |
3180 | -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) | |
3181 | -{ | |
3182 | - tsk->processor = cpu; | |
3183 | - tsk->cpus_runnable = 1UL << cpu; | |
3184 | -} | |
3185 | - | |
3186 | -static inline void task_release_cpu(struct task_struct *tsk) | |
3187 | -{ | |
3188 | - tsk->cpus_runnable = ~0UL; | |
3189 | -} | |
3190 | - | |
3191 | /* per-UID process charging. */ | |
3192 | extern struct user_struct * alloc_uid(uid_t); | |
3193 | extern void free_uid(struct user_struct *); | |
3194 | @@ -590,7 +578,9 @@ | |
3195 | extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); | |
3196 | extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, | |
3197 | signed long timeout)); | |
3198 | -extern int FASTCALL(wake_up_process(struct task_struct * tsk)); | |
3199 | +extern int FASTCALL(wake_up_process(task_t * tsk)); | |
3200 | +extern void FASTCALL(wake_up_forked_process(task_t * tsk)); | |
3201 | +extern void FASTCALL(sched_exit(task_t * p)); | |
3202 | ||
3203 | #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) | |
3204 | #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) | |
3205 | @@ -608,28 +598,28 @@ | |
3206 | extern int in_egroup_p(gid_t); | |
3207 | ||
3208 | extern void proc_caches_init(void); | |
3209 | -extern void flush_signals(struct task_struct *); | |
3210 | -extern void flush_signal_handlers(struct task_struct *); | |
3211 | +extern void flush_signals(task_t *); | |
3212 | +extern void flush_signal_handlers(task_t *); | |
3213 | extern int dequeue_signal(sigset_t *, siginfo_t *); | |
3214 | extern void block_all_signals(int (*notifier)(void *priv), void *priv, | |
3215 | sigset_t *mask); | |
3216 | extern void unblock_all_signals(void); | |
3217 | -extern int send_sig_info(int, struct siginfo *, struct task_struct *); | |
3218 | -extern int force_sig_info(int, struct siginfo *, struct task_struct *); | |
3219 | +extern int send_sig_info(int, struct siginfo *, task_t *); | |
3220 | +extern int force_sig_info(int, struct siginfo *, task_t *); | |
3221 | extern int kill_pg_info(int, struct siginfo *, pid_t); | |
3222 | extern int kill_sl_info(int, struct siginfo *, pid_t); | |
3223 | extern int kill_proc_info(int, struct siginfo *, pid_t); | |
3224 | -extern void notify_parent(struct task_struct *, int); | |
3225 | -extern void do_notify_parent(struct task_struct *, int); | |
3226 | -extern void force_sig(int, struct task_struct *); | |
3227 | -extern int send_sig(int, struct task_struct *, int); | |
3228 | +extern void notify_parent(task_t *, int); | |
3229 | +extern void do_notify_parent(task_t *, int); | |
3230 | +extern void force_sig(int, task_t *); | |
3231 | +extern int send_sig(int, task_t *, int); | |
3232 | extern int kill_pg(pid_t, int, int); | |
3233 | extern int kill_sl(pid_t, int, int); | |
3234 | extern int kill_proc(pid_t, int, int); | |
3235 | extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); | |
3236 | extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); | |
3237 | ||
3238 | -static inline int signal_pending(struct task_struct *p) | |
3239 | +static inline int signal_pending(task_t *p) | |
3240 | { | |
3241 | return (p->sigpending != 0); | |
3242 | } | |
3243 | @@ -668,7 +658,7 @@ | |
3244 | This is required every time the blocked sigset_t changes. | |
3245 | All callers should have t->sigmask_lock. */ | |
3246 | ||
3247 | -static inline void recalc_sigpending(struct task_struct *t) | |
3248 | +static inline void recalc_sigpending(task_t *t) | |
3249 | { | |
3250 | t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); | |
3251 | } | |
3252 | @@ -775,16 +765,17 @@ | |
3253 | extern int expand_fdset(struct files_struct *, int nr); | |
3254 | extern void free_fdset(fd_set *, int); | |
3255 | ||
3256 | -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); | |
3257 | +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *); | |
3258 | extern void flush_thread(void); | |
3259 | extern void exit_thread(void); | |
3260 | ||
3261 | -extern void exit_mm(struct task_struct *); | |
3262 | -extern void exit_files(struct task_struct *); | |
3263 | -extern void exit_sighand(struct task_struct *); | |
3264 | +extern void exit_mm(task_t *); | |
3265 | +extern void exit_files(task_t *); | |
3266 | +extern void exit_sighand(task_t *); | |
3267 | ||
3268 | extern void reparent_to_init(void); | |
3269 | extern void daemonize(void); | |
3270 | +extern task_t *child_reaper; | |
3271 | ||
3272 | extern int do_execve(char *, char **, char **, struct pt_regs *); | |
3273 | extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); | |
3274 | @@ -793,6 +784,9 @@ | |
3275 | extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); | |
3276 | extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); | |
3277 | ||
3278 | +extern void wait_task_inactive(task_t * p); | |
3279 | +extern void kick_if_running(task_t * p); | |
3280 | + | |
3281 | #define __wait_event(wq, condition) \ | |
3282 | do { \ | |
3283 | wait_queue_t __wait; \ | |
3284 | @@ -871,24 +865,10 @@ | |
3285 | for (p = &init_task ; (p = p->next_task) != &init_task ; ) | |
3286 | ||
3287 | #define next_thread(p) \ | |
3288 | - list_entry((p)->thread_group.next, struct task_struct, thread_group) | |
3289 | - | |
3290 | -static inline void del_from_runqueue(struct task_struct * p) | |
3291 | -{ | |
3292 | - nr_running--; | |
3293 | - p->sleep_time = jiffies; | |
3294 | - list_del(&p->run_list); | |
3295 | - p->run_list.next = NULL; | |
3296 | -} | |
3297 | - | |
3298 | -static inline int task_on_runqueue(struct task_struct *p) | |
3299 | -{ | |
3300 | - return (p->run_list.next != NULL); | |
3301 | -} | |
3302 | + list_entry((p)->thread_group.next, task_t, thread_group) | |
3303 | ||
3304 | -static inline void unhash_process(struct task_struct *p) | |
3305 | +static inline void unhash_process(task_t *p) | |
3306 | { | |
3307 | - if (task_on_runqueue(p)) BUG(); | |
3308 | write_lock_irq(&tasklist_lock); | |
3309 | nr_threads--; | |
3310 | unhash_pid(p); | |
3311 | @@ -898,12 +878,12 @@ | |
3312 | } | |
3313 | ||
3314 | /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ | |
3315 | -static inline void task_lock(struct task_struct *p) | |
3316 | +static inline void task_lock(task_t *p) | |
3317 | { | |
3318 | spin_lock(&p->alloc_lock); | |
3319 | } | |
3320 | ||
3321 | -static inline void task_unlock(struct task_struct *p) | |
3322 | +static inline void task_unlock(task_t *p) | |
3323 | { | |
3324 | spin_unlock(&p->alloc_lock); | |
3325 | } | |
3326 | --- linux/include/linux/list.h.orig Tue Feb 5 13:51:51 2002 | |
3327 | +++ linux/include/linux/list.h Tue Feb 5 13:52:12 2002 | |
3328 | @@ -19,6 +19,8 @@ | |
3329 | struct list_head *next, *prev; | |
3330 | }; | |
3331 | ||
3332 | +typedef struct list_head list_t; | |
3333 | + | |
3334 | #define LIST_HEAD_INIT(name) { &(name), &(name) } | |
3335 | ||
3336 | #define LIST_HEAD(name) \ | |
3337 | --- linux/include/linux/kernel_stat.h.orig Tue Aug 21 14:26:23 2001 | |
3338 | +++ linux/include/linux/kernel_stat.h Tue Feb 5 13:52:12 2002 | |
3339 | @@ -32,10 +32,11 @@ | |
3340 | unsigned int ipackets, opackets; | |
3341 | unsigned int ierrors, oerrors; | |
3342 | unsigned int collisions; | |
3343 | - unsigned int context_swtch; | |
3344 | }; | |
3345 | ||
3346 | extern struct kernel_stat kstat; | |
3347 | + | |
3348 | +extern unsigned long nr_context_switches(void); | |
3349 | ||
3350 | #if !defined(CONFIG_ARCH_S390) | |
3351 | /* | |
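The global kstat.context_swtch counter disappears because context switches are now counted per runqueue; nr_context_switches() sums those per-CPU counters for /proc/stat. Its implementation is in kernel/sched.c, outside this excerpt, so the following is only a sketch with assumed names (per_cpu_runqueue, nr_switches):

    /* Sketch only: per-CPU counting replaces the single kstat field. */
    unsigned long nr_context_switches(void)
    {
            unsigned long i, sum = 0;

            for (i = 0; i < smp_num_cpus; i++)
                    sum += per_cpu_runqueue[i].nr_switches;   /* assumed names */
            return sum;
    }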
3352 | --- linux/include/linux/smp.h.orig Sun Dec 31 20:10:17 2000 | |
3353 | +++ linux/include/linux/smp.h Tue Feb 5 13:52:12 2002 | |
3354 | @@ -86,6 +86,14 @@ | |
3355 | #define cpu_number_map(cpu) 0 | |
3356 | #define smp_call_function(func,info,retry,wait) ({ 0; }) | |
3357 | #define cpu_online_map 1 | |
3358 | +static inline void smp_send_reschedule(int cpu) { } | |
3359 | +static inline void smp_send_reschedule_all(void) { } | |
3360 | ||
3361 | #endif | |
3362 | + | |
3363 | +/* | |
3364 | + * Common definitions: | |
3365 | + */ | |
3366 | +#define cpu() smp_processor_id() | |
3367 | + | |
3368 | #endif | |
3369 | --- linux/include/asm-i386/smp.h.orig Tue Feb 5 13:51:51 2002 | |
3370 | +++ linux/include/asm-i386/smp.h Tue Feb 5 13:52:12 2002 | |
3371 | @@ -63,6 +63,7 @@ | |
3372 | extern void smp_flush_tlb(void); | |
3373 | extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); | |
3374 | extern void smp_send_reschedule(int cpu); | |
3375 | +extern void smp_send_reschedule_all(void); | |
3376 | extern void smp_invalidate_rcv(void); /* Process an NMI */ | |
3377 | extern void (*mtrr_hook) (void); | |
3378 | extern void zap_low_mappings (void); | |
3379 | @@ -104,7 +105,7 @@ | |
3380 | * so this is correct in the x86 case. | |
3381 | */ | |
3382 | ||
3383 | -#define smp_processor_id() (current->processor) | |
3384 | +#define smp_processor_id() (current->cpu) | |
3385 | ||
3386 | static __inline int hard_smp_processor_id(void) | |
3387 | { | |
3388 | @@ -121,18 +122,6 @@ | |
3389 | #endif /* !__ASSEMBLY__ */ | |
3390 | ||
3391 | #define NO_PROC_ID 0xFF /* No processor magic marker */ | |
3392 | - | |
3393 | -/* | |
3394 | - * This magic constant controls our willingness to transfer | |
3395 | - * a process across CPUs. Such a transfer incurs misses on the L1 | |
3396 | - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My | |
3397 | - * gut feeling is this will vary by board in value. For a board | |
3398 | - * with separate L2 cache it probably depends also on the RSS, and | |
3399 | - * for a board with shared L2 cache it ought to decay fast as other | |
3400 | - * processes are run. | |
3401 | - */ | |
3402 | - | |
3403 | -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */ | |
3404 | ||
3405 | #endif | |
3406 | #endif | |
3407 | --- linux/include/asm-i386/bitops.h.orig Tue Aug 21 14:26:16 2001 | |
3408 | +++ linux/include/asm-i386/bitops.h Tue Feb 5 13:52:12 2002 | |
3409 | @@ -75,6 +75,14 @@ | |
3410 | :"=m" (ADDR) | |
3411 | :"Ir" (nr)); | |
3412 | } | |
3413 | + | |
3414 | +static __inline__ void __clear_bit(int nr, volatile void * addr) | |
3415 | +{ | |
3416 | + __asm__ __volatile__( | |
3417 | + "btrl %1,%0" | |
3418 | + :"=m" (ADDR) | |
3419 | + :"Ir" (nr)); | |
3420 | +} | |
3421 | #define smp_mb__before_clear_bit() barrier() | |
3422 | #define smp_mb__after_clear_bit() barrier() | |
3423 | ||
3424 | @@ -284,6 +292,34 @@ | |
3425 | } | |
3426 | ||
3427 | /** | |
3428 | + * find_first_bit - find the first set bit in a memory region | |
3429 | + * @addr: The address to start the search at | |
3430 | + * @size: The maximum size to search | |
3431 | + * | |
3432 | + * Returns the bit-number of the first set bit, not the number of the byte | |
3433 | + * containing a bit. | |
3434 | + */ | |
3435 | +static __inline__ int find_first_bit(void * addr, unsigned size) | |
3436 | +{ | |
3437 | + int d0, d1; | |
3438 | + int res; | |
3439 | + | |
3440 | + /* This looks at memory. Mark it volatile to tell gcc not to move it around */ | |
3441 | + __asm__ __volatile__( | |
3442 | + "xorl %%eax,%%eax\n\t" | |
3443 | + "repe; scasl\n\t" | |
3444 | + "jz 1f\n\t" | |
3445 | + "leal -4(%%edi),%%edi\n\t" | |
3446 | + "bsfl (%%edi),%%eax\n" | |
3447 | + "1:\tsubl %%ebx,%%edi\n\t" | |
3448 | + "shll $3,%%edi\n\t" | |
3449 | + "addl %%edi,%%eax" | |
3450 | + :"=a" (res), "=&c" (d0), "=&D" (d1) | |
3451 | + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr)); | |
3452 | + return res; | |
3453 | +} | |
3454 | + | |
3455 | +/** | |
3456 | * find_next_zero_bit - find the first zero bit in a memory region | |
3457 | * @addr: The address to base the search on | |
3458 | * @offset: The bitnumber to start searching at | |
3459 | @@ -296,7 +332,7 @@ | |
3460 | ||
3461 | if (bit) { | |
3462 | /* | |
3463 | - * Look for zero in first byte | |
3464 | + * Look for zero in the first 32 bits. | |
3465 | */ | |
3466 | __asm__("bsfl %1,%0\n\t" | |
3467 | "jne 1f\n\t" | |
3468 | @@ -317,6 +353,39 @@ | |
3469 | } | |
3470 | ||
3471 | /** | |
3472 | + * find_next_bit - find the first set bit in a memory region | |
3473 | + * @addr: The address to base the search on | |
3474 | + * @offset: The bitnumber to start searching at | |
3475 | + * @size: The maximum size to search | |
3476 | + */ | |
3477 | +static __inline__ int find_next_bit (void * addr, int size, int offset) | |
3478 | +{ | |
3479 | + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); | |
3480 | + int set = 0, bit = offset & 31, res; | |
3481 | + | |
3482 | + if (bit) { | |
3483 | + /* | |
3484 | + * Look for nonzero in the first 32 bits: | |
3485 | + */ | |
3486 | + __asm__("bsfl %1,%0\n\t" | |
3487 | + "jne 1f\n\t" | |
3488 | + "movl $32, %0\n" | |
3489 | + "1:" | |
3490 | + : "=r" (set) | |
3491 | + : "r" (*p >> bit)); | |
3492 | + if (set < (32 - bit)) | |
3493 | + return set + offset; | |
3494 | + set = 32 - bit; | |
3495 | + p++; | |
3496 | + } | |
3497 | + /* | |
3498 | + * No set bit yet, search remaining full words for a bit | |
3499 | + */ | |
3500 | + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr)); | |
3501 | + return (offset + set + res); | |
3502 | +} | |
3503 | + | |
3504 | +/** | |
3505 | * ffz - find first zero in word. | |
3506 | * @word: The word to search | |
3507 | * | |
3508 | @@ -327,6 +396,20 @@ | |
3509 | __asm__("bsfl %1,%0" | |
3510 | :"=r" (word) | |
3511 | :"r" (~word)); | |
3512 | + return word; | |
3513 | +} | |
3514 | + | |
3515 | +/** | |
3516 | + * __ffs - find first bit in word. | |
3517 | + * @word: The word to search | |
3518 | + * | |
3519 | + * Undefined if no bit exists, so code should check against 0 first. | |
3520 | + */ | |
3521 | +static __inline__ unsigned long __ffs(unsigned long word) | |
3522 | +{ | |
3523 | + __asm__("bsfl %1,%0" | |
3524 | + :"=r" (word) | |
3525 | + :"rm" (word)); | |
3526 | return word; | |
3527 | } | |
3528 | ||
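find_first_bit() and find_next_bit() are the set-bit counterparts of the existing zero-bit searchers, and __ffs() is the single-word primitive that sched_find_first_bit() (added below in mmu_context.h) builds on. A short usage sketch over a 140-bit priority bitmap, using only the helpers added above (do_something_with() is hypothetical):

    /* Sketch: visit every set bit in a 140-bit map. */
    unsigned long bitmap[5];    /* assume it was filled in elsewhere */
    int bit;

    for (bit = find_first_bit(bitmap, 140);
         bit < 140;
         bit = find_next_bit(bitmap, 140, bit + 1))
            do_something_with(bit);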
3529 | --- linux/include/asm-i386/pgalloc.h.orig Tue Feb 5 13:51:51 2002 | |
3530 | +++ linux/include/asm-i386/pgalloc.h Tue Feb 5 13:52:12 2002 | |
3531 | @@ -224,6 +224,7 @@ | |
3532 | { | |
3533 | struct mm_struct *active_mm; | |
3534 | int state; | |
3535 | + char __cacheline_padding[24]; | |
3536 | }; | |
3537 | extern struct tlb_state cpu_tlbstate[NR_CPUS]; | |
3538 | ||
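On i386 the two existing members take 8 bytes (a pointer plus an int); the 24 bytes of padding round each per-CPU entry up to a full 32-byte L1 cache line, so CPUs updating their own TLB state stop sharing a line (the array definition in arch/i386/kernel/smp.c further down also gains __cacheline_aligned). A hypothetical compile-time check of that size assumption, not part of the patch:

    /* Assumes 32-bit pointers and 32-byte cache lines: */
    typedef char tlb_state_fills_one_line[(sizeof(struct tlb_state) == 32) ? 1 : -1];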
3539 | --- linux/include/asm-i386/mmu_context.h.orig Tue Aug 21 14:26:23 2001 | |
3540 | +++ linux/include/asm-i386/mmu_context.h Tue Feb 5 13:52:12 2002 | |
3541 | @@ -7,6 +7,25 @@ | |
3542 | #include <asm/pgalloc.h> | |
3543 | ||
3544 | /* | |
3545 | + * Every architecture must define this function. It's the fastest | |
3546 | + * way of searching a 140-bit bitmap where the first 100 bits are | |
3547 | + * unlikely to be set. It's guaranteed that at least one of the 140 | |
3548 | + * bits is cleared. | |
3549 | + */ | |
3550 | +static inline int sched_find_first_bit(unsigned long *b) | |
3551 | +{ | |
3552 | + if (unlikely(b[0])) | |
3553 | + return __ffs(b[0]); | |
3554 | + if (unlikely(b[1])) | |
3555 | + return __ffs(b[1]) + 32; | |
3556 | + if (unlikely(b[2])) | |
3557 | + return __ffs(b[2]) + 64; | |
3558 | + if (b[3]) | |
3559 | + return __ffs(b[3]) + 96; | |
3560 | + return __ffs(b[4]) + 128; | |
3561 | +} | |
3562 | + | |
3563 | +/* | |
3564 | * possibly do the LDT unload here? | |
3565 | */ | |
3566 | #define destroy_context(mm) do { } while(0) | |
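sched_find_first_bit() is the scheduler's "highest-priority non-empty queue" lookup: five unrolled __ffs() calls over the 140-bit priority bitmap, with branch hints telling gcc that the first 100 bits (the realtime priorities) are usually clear. A portable sketch of the same search, assuming an __ffs() helper like the one added to bitops.h above:

    /* Sketch: generic fallback, not the i386 version above. */
    static inline int generic_sched_find_first_bit(unsigned long *b)
    {
            int w;

            for (w = 0; w < 4; w++)         /* words 0..3 cover bits 0..127 */
                    if (b[w])
                            return __ffs(b[w]) + 32 * w;
            return __ffs(b[4]) + 128;       /* caller guarantees a set bit  */
    }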
3567 | @@ -27,13 +46,13 @@ | |
3568 | ||
3569 | static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) | |
3570 | { | |
3571 | - if (prev != next) { | |
3572 | + if (likely(prev != next)) { | |
3573 | /* stop flush ipis for the previous mm */ | |
3574 | clear_bit(cpu, &prev->cpu_vm_mask); | |
3575 | /* | |
3576 | * Re-load LDT if necessary | |
3577 | */ | |
3578 | - if (prev->context.segments != next->context.segments) | |
3579 | + if (unlikely(prev->context.segments != next->context.segments)) | |
3580 | load_LDT(next); | |
3581 | #ifdef CONFIG_SMP | |
3582 | cpu_tlbstate[cpu].state = TLBSTATE_OK; | |
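The likely()/unlikely() annotations tell gcc which way each branch usually goes so the fast path is laid out fall-through: in switch_mm() an actual mm change is the expected case, while an LDT reload is rare. They are conventionally built on __builtin_expect(); approximately (see linux/compiler.h):

    #define likely(x)       __builtin_expect((x), 1)
    #define unlikely(x)     __builtin_expect((x), 0)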
3583 | --- linux/include/asm-i386/hw_irq.h.orig Tue Feb 5 13:51:40 2002 | |
3584 | +++ linux/include/asm-i386/hw_irq.h Tue Feb 5 13:52:12 2002 | |
3585 | @@ -41,7 +41,8 @@ | |
3586 | #define ERROR_APIC_VECTOR 0xfe | |
3587 | #define INVALIDATE_TLB_VECTOR 0xfd | |
3588 | #define RESCHEDULE_VECTOR 0xfc | |
3589 | -#define CALL_FUNCTION_VECTOR 0xfb | |
3590 | +#define TASK_MIGRATION_VECTOR 0xfb | |
3591 | +#define CALL_FUNCTION_VECTOR 0xfa | |
3592 | ||
3593 | /* | |
3594 | * Local APIC timer IRQ vector is on a different priority level, | |
3595 | --- linux/include/asm-i386/apic.h.orig Tue Feb 5 13:51:43 2002 | |
3596 | +++ linux/include/asm-i386/apic.h Tue Feb 5 13:52:12 2002 | |
3597 | @@ -79,6 +79,8 @@ | |
3598 | extern void setup_apic_nmi_watchdog (void); | |
3599 | extern inline void nmi_watchdog_tick (struct pt_regs * regs); | |
3600 | extern int APIC_init_uniprocessor (void); | |
3601 | +extern void disable_APIC_timer(void); | |
3602 | +extern void enable_APIC_timer(void); | |
3603 | ||
3604 | extern struct pm_dev *apic_pm_register(pm_dev_t, unsigned long, pm_callback); | |
3605 | extern void apic_pm_unregister(struct pm_dev*); | |
3606 | --- linux/net/unix/af_unix.c.orig Tue Feb 5 13:51:53 2002 | |
3607 | +++ linux/net/unix/af_unix.c Tue Feb 5 13:52:12 2002 | |
3608 | @@ -565,10 +565,8 @@ | |
3609 | addr->hash)) { | |
3610 | write_unlock(&unix_table_lock); | |
3611 | /* Sanity yield. It is unusual case, but yet... */ | |
3612 | - if (!(ordernum&0xFF)) { | |
3613 | - current->policy |= SCHED_YIELD; | |
3614 | - schedule(); | |
3615 | - } | |
3616 | + if (!(ordernum&0xFF)) | |
3617 | + yield(); | |
3618 | goto retry; | |
3619 | } | |
3620 | addr->hash ^= sk->type; | |
3621 | --- linux/net/ipv4/tcp_output.c.orig Tue Feb 5 13:51:51 2002 | |
3622 | +++ linux/net/ipv4/tcp_output.c Tue Feb 5 13:52:12 2002 | |
3623 | @@ -1009,8 +1009,7 @@ | |
3624 | skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); | |
3625 | if (skb) | |
3626 | break; | |
3627 | - current->policy |= SCHED_YIELD; | |
3628 | - schedule(); | |
3629 | + yield(); | |
3630 | } | |
3631 | ||
3632 | /* Reserve space for headers and prepare control bits. */ | |
3633 | --- linux/net/sunrpc/sched.c.orig Tue Feb 5 13:51:53 2002 | |
3634 | +++ linux/net/sunrpc/sched.c Tue Feb 5 13:52:12 2002 | |
3635 | @@ -773,8 +773,7 @@ | |
3636 | } | |
3637 | if (flags & RPC_TASK_ASYNC) | |
3638 | return NULL; | |
3639 | - current->policy |= SCHED_YIELD; | |
3640 | - schedule(); | |
3641 | + yield(); | |
3642 | } while (!signalled()); | |
3643 | ||
3644 | return NULL; | |
3645 | @@ -1115,8 +1114,7 @@ | |
3646 | __rpc_schedule(); | |
3647 | if (all_tasks) { | |
3648 | dprintk("rpciod_killall: waiting for tasks to exit\n"); | |
3649 | - current->policy |= SCHED_YIELD; | |
3650 | - schedule(); | |
3651 | + yield(); | |
3652 | } | |
3653 | } | |
3654 | ||
3655 | @@ -1186,8 +1184,7 @@ | |
3656 | * wait briefly before checking the process id. | |
3657 | */ | |
3658 | current->sigpending = 0; | |
3659 | - current->policy |= SCHED_YIELD; | |
3660 | - schedule(); | |
3661 | + yield(); | |
3662 | /* | |
3663 | * Display a message if we're going to wait longer. | |
3664 | */ | |
3665 | --- linux/net/sched/sch_generic.c.orig Fri Aug 18 19:26:25 2000 | |
3666 | +++ linux/net/sched/sch_generic.c Tue Feb 5 13:52:12 2002 | |
3667 | @@ -475,10 +475,8 @@ | |
3668 | ||
3669 | dev_watchdog_down(dev); | |
3670 | ||
3671 | - while (test_bit(__LINK_STATE_SCHED, &dev->state)) { | |
3672 | - current->policy |= SCHED_YIELD; | |
3673 | - schedule(); | |
3674 | - } | |
3675 | + while (test_bit(__LINK_STATE_SCHED, &dev->state)) | |
3676 | + yield(); | |
3677 | ||
3678 | spin_unlock_wait(&dev->xmit_lock); | |
3679 | } | |
3680 | --- linux/net/socket.c.orig Tue Feb 5 13:51:51 2002 | |
3681 | +++ linux/net/socket.c Tue Feb 5 13:52:12 2002 | |
3682 | @@ -148,8 +148,7 @@ | |
3683 | while (atomic_read(&net_family_lockct) != 0) { | |
3684 | spin_unlock(&net_family_lock); | |
3685 | ||
3686 | - current->policy |= SCHED_YIELD; | |
3687 | - schedule(); | |
3688 | + yield(); | |
3689 | ||
3690 | spin_lock(&net_family_lock); | |
3691 | } | |
3692 | --- linux/drivers/net/slip.c.orig Tue Feb 5 13:51:52 2002 | |
3693 | +++ linux/drivers/net/slip.c Tue Feb 5 13:52:12 2002 | |
3694 | @@ -1393,10 +1393,8 @@ | |
3695 | /* First of all: check for active disciplines and hangup them. | |
3696 | */ | |
3697 | do { | |
3698 | - if (busy) { | |
3699 | - current->counter = 0; | |
3700 | - schedule(); | |
3701 | - } | |
3702 | + if (busy) | |
3703 | + sys_sched_yield(); | |
3704 | ||
3705 | busy = 0; | |
3706 | local_bh_disable(); | |
3707 | --- linux/drivers/block/loop.c.orig Tue Feb 5 13:51:50 2002 | |
3708 | +++ linux/drivers/block/loop.c Tue Feb 5 13:52:12 2002 | |
3709 | @@ -570,9 +570,6 @@ | |
3710 | flush_signals(current); | |
3711 | spin_unlock_irq(&current->sigmask_lock); | |
3712 | ||
3713 | - current->policy = SCHED_OTHER; | |
3714 | - current->nice = -20; | |
3715 | - | |
3716 | spin_lock_irq(&lo->lo_lock); | |
3717 | lo->lo_state = Lo_bound; | |
3718 | atomic_inc(&lo->lo_pending); | |
3719 | --- linux/drivers/char/mwave/mwavedd.c.orig Tue Feb 5 13:51:44 2002 | |
3720 | +++ linux/drivers/char/mwave/mwavedd.c Tue Feb 5 13:52:12 2002 | |
3721 | @@ -279,7 +279,6 @@ | |
3722 | pDrvData->IPCs[ipcnum].bIsHere = FALSE; | |
3723 | pDrvData->IPCs[ipcnum].bIsEnabled = TRUE; | |
3724 | #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) | |
3725 | - current->nice = -20; /* boost to provide priority timing */ | |
3726 | #else | |
3727 | current->priority = 0x28; /* boost to provide priority timing */ | |
3728 | #endif | |
3729 | --- linux/drivers/char/drm-4.0/ffb_drv.c.orig Tue Feb 5 13:51:51 2002 | |
3730 | +++ linux/drivers/char/drm-4.0/ffb_drv.c Tue Feb 5 13:52:12 2002 | |
3731 | @@ -710,8 +710,7 @@ | |
3732 | /* Contention */ | |
3733 | atomic_inc(&dev->total_sleeps); | |
3734 | current->state = TASK_INTERRUPTIBLE; | |
3735 | - current->policy |= SCHED_YIELD; | |
3736 | - schedule(); | |
3737 | + yield(); | |
3738 | if (signal_pending(current)) { | |
3739 | ret = -ERESTARTSYS; | |
3740 | break; | |
3741 | --- linux/drivers/char/drm-4.0/tdfx_drv.c.orig Tue Feb 5 13:51:52 2002 | |
3742 | +++ linux/drivers/char/drm-4.0/tdfx_drv.c Tue Feb 5 13:52:12 2002 | |
3743 | @@ -554,7 +554,6 @@ | |
3744 | lock.context, current->pid, j, | |
3745 | dev->lock.lock_time, jiffies); | |
3746 | current->state = TASK_INTERRUPTIBLE; | |
3747 | - current->policy |= SCHED_YIELD; | |
3748 | schedule_timeout(DRM_LOCK_SLICE-j); | |
3749 | DRM_DEBUG("jiffies=%d\n", jiffies); | |
3750 | } | |
3751 | @@ -578,10 +577,7 @@ | |
3752 | ||
3753 | /* Contention */ | |
3754 | atomic_inc(&dev->total_sleeps); | |
3755 | -#if 1 | |
3756 | - current->policy |= SCHED_YIELD; | |
3757 | -#endif | |
3758 | - schedule(); | |
3759 | + yield(); | |
3760 | if (signal_pending(current)) { | |
3761 | ret = -ERESTARTSYS; | |
3762 | break; | |
3763 | @@ -604,8 +600,7 @@ | |
3764 | when dev->last_context == lock.context | |
3765 | NOTE WE HOLD THE LOCK THROUGHOUT THIS | |
3766 | TIME! */ | |
3767 | - current->policy |= SCHED_YIELD; | |
3768 | - schedule(); | |
3769 | + yield(); | |
3770 | current->state = TASK_RUNNING; | |
3771 | remove_wait_queue(&dev->context_wait, &entry); | |
3772 | if (signal_pending(current)) { | |
3773 | --- linux/drivers/ide/ataraid.c.orig Tue Feb 5 13:51:46 2002 | |
3774 | +++ linux/drivers/ide/ataraid.c Tue Feb 5 13:52:12 2002 | |
3775 | @@ -123,8 +123,7 @@ | |
3776 | ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO); | |
3777 | if (!ptr) { | |
3778 | __set_current_state(TASK_RUNNING); | |
3779 | - current->policy |= SCHED_YIELD; | |
3780 | - schedule(); | |
3781 | + yield(); | |
3782 | } | |
3783 | } | |
3784 | return ptr; | |
3785 | @@ -139,8 +138,7 @@ | |
3786 | ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO); | |
3787 | if (!ptr) { | |
3788 | __set_current_state(TASK_RUNNING); | |
3789 | - current->policy |= SCHED_YIELD; | |
3790 | - schedule(); | |
3791 | + yield(); | |
3792 | } | |
3793 | } | |
3794 | return ptr; | |
3795 | --- linux/drivers/md/md.c.orig Tue Feb 5 13:51:52 2002 | |
3796 | +++ linux/drivers/md/md.c Tue Feb 5 13:52:12 2002 | |
3797 | @@ -2936,8 +2936,6 @@ | |
3798 | * bdflush, otherwise bdflush will deadlock if there are too | |
3799 | * many dirty RAID5 blocks. | |
3800 | */ | |
3801 | - current->policy = SCHED_OTHER; | |
3802 | - current->nice = -20; | |
3803 | md_unlock_kernel(); | |
3804 | ||
3805 | complete(thread->event); | |
3806 | @@ -3387,11 +3385,6 @@ | |
3807 | "(but not more than %d KB/sec) for reconstruction.\n", | |
3808 | sysctl_speed_limit_max); | |
3809 | ||
3810 | - /* | |
3811 | - * Resync has low priority. | |
3812 | - */ | |
3813 | - current->nice = 19; | |
3814 | - | |
3815 | is_mddev_idle(mddev); /* this also initializes IO event counters */ | |
3816 | for (m = 0; m < SYNC_MARKS; m++) { | |
3817 | mark[m] = jiffies; | |
3818 | @@ -3469,16 +3462,13 @@ | |
3819 | currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; | |
3820 | ||
3821 | if (currspeed > sysctl_speed_limit_min) { | |
3822 | - current->nice = 19; | |
3823 | - | |
3824 | if ((currspeed > sysctl_speed_limit_max) || | |
3825 | !is_mddev_idle(mddev)) { | |
3826 | current->state = TASK_INTERRUPTIBLE; | |
3827 | md_schedule_timeout(HZ/4); | |
3828 | goto repeat; | |
3829 | } | |
3830 | - } else | |
3831 | - current->nice = -20; | |
3832 | + } | |
3833 | } | |
3834 | printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); | |
3835 | err = 0; | |
3836 | --- linux/arch/i386/mm/fault.c.orig Tue Feb 5 13:51:51 2002 | |
3837 | +++ linux/arch/i386/mm/fault.c Tue Feb 5 13:52:12 2002 | |
3838 | @@ -86,8 +86,7 @@ | |
3839 | ||
3840 | out_of_memory: | |
3841 | if (current->pid == 1) { | |
3842 | - current->policy |= SCHED_YIELD; | |
3843 | - schedule(); | |
3844 | + yield(); | |
3845 | goto survive; | |
3846 | } | |
3847 | goto bad_area; | |
3848 | @@ -342,8 +341,7 @@ | |
3849 | out_of_memory: | |
3850 | up_read(&mm->mmap_sem); | |
3851 | if (tsk->pid == 1) { | |
3852 | - tsk->policy |= SCHED_YIELD; | |
3853 | - schedule(); | |
3854 | + yield(); | |
3855 | down_read(&mm->mmap_sem); | |
3856 | goto survive; | |
3857 | } | |
3858 | --- linux/arch/i386/kernel/smpboot.c.orig Tue Feb 5 13:51:49 2002 | |
3859 | +++ linux/arch/i386/kernel/smpboot.c Tue Feb 5 13:52:12 2002 | |
3860 | @@ -308,14 +308,14 @@ | |
3861 | if (tsc_values[i] < avg) | |
3862 | realdelta = -realdelta; | |
3863 | ||
3864 | - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", | |
3865 | - i, realdelta); | |
3866 | + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta); | |
3867 | } | |
3868 | ||
3869 | sum += delta; | |
3870 | } | |
3871 | if (!buggy) | |
3872 | printk("passed.\n"); | |
3873 | + ; | |
3874 | } | |
3875 | ||
3876 | static void __init synchronize_tsc_ap (void) | |
3877 | @@ -365,7 +365,7 @@ | |
3878 | * (This works even if the APIC is not enabled.) | |
3879 | */ | |
3880 | phys_id = GET_APIC_ID(apic_read(APIC_ID)); | |
3881 | - cpuid = current->processor; | |
3882 | + cpuid = cpu(); | |
3883 | if (test_and_set_bit(cpuid, &cpu_online_map)) { | |
3884 | printk("huh, phys CPU#%d, CPU#%d already present??\n", | |
3885 | phys_id, cpuid); | |
3886 | @@ -435,6 +435,7 @@ | |
3887 | */ | |
3888 | smp_store_cpu_info(cpuid); | |
3889 | ||
3890 | + disable_APIC_timer(); | |
3891 | /* | |
3892 | * Allow the master to continue. | |
3893 | */ | |
3894 | @@ -465,6 +466,7 @@ | |
3895 | smp_callin(); | |
3896 | while (!atomic_read(&smp_commenced)) | |
3897 | rep_nop(); | |
3898 | + enable_APIC_timer(); | |
3899 | /* | |
3900 | * low-memory mappings have been cleared, flush them from | |
3901 | * the local TLBs too. | |
3902 | @@ -803,16 +805,13 @@ | |
3903 | if (!idle) | |
3904 | panic("No idle process for CPU %d", cpu); | |
3905 | ||
3906 | - idle->processor = cpu; | |
3907 | - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ | |
3908 | + init_idle(idle, cpu); | |
3909 | ||
3910 | map_cpu_to_boot_apicid(cpu, apicid); | |
3911 | ||
3912 | idle->thread.eip = (unsigned long) start_secondary; | |
3913 | ||
3914 | - del_from_runqueue(idle); | |
3915 | unhash_process(idle); | |
3916 | - init_tasks[cpu] = idle; | |
3917 | ||
3918 | /* start_eip had better be page-aligned! */ | |
3919 | start_eip = setup_trampoline(); | |
3920 | @@ -925,6 +924,7 @@ | |
3921 | } | |
3922 | ||
3923 | cycles_t cacheflush_time; | |
3924 | +unsigned long cache_decay_ticks; | |
3925 | ||
3926 | static void smp_tune_scheduling (void) | |
3927 | { | |
3928 | @@ -958,9 +958,13 @@ | |
3929 | cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; | |
3930 | } | |
3931 | ||
3932 | + cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; | |
3933 | + | |
3934 | printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", | |
3935 | (long)cacheflush_time/(cpu_khz/1000), | |
3936 | ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); | |
3937 | + printk("task migration cache decay timeout: %ld msecs.\n", | |
3938 | + (cache_decay_ticks + 1) * 1000 / HZ); | |
3939 | } | |
3940 | ||
3941 | /* | |
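cache_decay_ticks converts the measured cache-flush cost from CPU cycles into timer ticks; judging by the new extern in sched.h and the printk above, the scheduler uses it as the window during which a recently-run task is still considered cache-hot on its last CPU. A worked example with assumed numbers, for illustration only:

    /*
     * Assume cacheflush_time = 1000000 cycles, cpu_khz = 500000 (500 MHz), HZ = 100.
     *
     *   cacheflush_time / cpu_khz  =  1000000 / 500000  =  2   milliseconds
     *   2 * HZ / 1000              =  2 * 100  / 1000   =  0   ticks (integer division)
     *
     * The printk then reports (0 + 1) * 1000 / HZ = 10 msecs.
     */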
3942 | @@ -1020,8 +1024,7 @@ | |
3943 | map_cpu_to_boot_apicid(0, boot_cpu_apicid); | |
3944 | ||
3945 | global_irq_holder = 0; | |
3946 | - current->processor = 0; | |
3947 | - init_idle(); | |
3948 | + current->cpu = 0; | |
3949 | smp_tune_scheduling(); | |
3950 | ||
3951 | /* | |
3952 | --- linux/arch/i386/kernel/process.c.orig Tue Feb 5 13:51:51 2002 | |
3953 | +++ linux/arch/i386/kernel/process.c Tue Feb 5 13:52:12 2002 | |
3954 | @@ -123,15 +123,12 @@ | |
3955 | void cpu_idle (void) | |
3956 | { | |
3957 | /* endless idle loop with no priority at all */ | |
3958 | - init_idle(); | |
3959 | - current->nice = 20; | |
3960 | - current->counter = -100; | |
3961 | ||
3962 | while (1) { | |
3963 | void (*idle)(void) = pm_idle; | |
3964 | if (!idle) | |
3965 | idle = default_idle; | |
3966 | - while (!current->need_resched) | |
3967 | + if (!current->need_resched) | |
3968 | idle(); | |
3969 | schedule(); | |
3970 | check_pgt_cache(); | |
3971 | @@ -694,15 +691,17 @@ | |
3972 | asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); | |
3973 | ||
3974 | /* | |
3975 | - * Restore %fs and %gs. | |
3976 | + * Restore %fs and %gs if needed. | |
3977 | */ | |
3978 | - loadsegment(fs, next->fs); | |
3979 | - loadsegment(gs, next->gs); | |
3980 | + if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) { | |
3981 | + loadsegment(fs, next->fs); | |
3982 | + loadsegment(gs, next->gs); | |
3983 | + } | |
3984 | ||
3985 | /* | |
3986 | * Now maybe reload the debug registers | |
3987 | */ | |
3988 | - if (next->debugreg[7]){ | |
3989 | + if (unlikely(next->debugreg[7])) { | |
3990 | loaddebug(next, 0); | |
3991 | loaddebug(next, 1); | |
3992 | loaddebug(next, 2); | |
3993 | @@ -712,7 +711,7 @@ | |
3994 | loaddebug(next, 7); | |
3995 | } | |
3996 | ||
3997 | - if (prev->ioperm || next->ioperm) { | |
3998 | + if (unlikely(prev->ioperm || next->ioperm)) { | |
3999 | if (next->ioperm) { | |
4000 | /* | |
4001 | * 4 cachelines copy ... not good, but not that | |
4002 | --- linux/arch/i386/kernel/apic.c.orig Tue Feb 5 13:51:51 2002 | |
4003 | +++ linux/arch/i386/kernel/apic.c Tue Feb 5 13:52:12 2002 | |
4004 | @@ -796,8 +796,7 @@ | |
4005 | */ | |
4006 | ||
4007 | slice = clocks / (smp_num_cpus+1); | |
4008 | - printk("cpu: %d, clocks: %d, slice: %d\n", | |
4009 | - smp_processor_id(), clocks, slice); | |
4010 | + printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice); | |
4011 | ||
4012 | /* | |
4013 | * Wait for IRQ0's slice: | |
4014 | @@ -820,8 +819,7 @@ | |
4015 | ||
4016 | __setup_APIC_LVTT(clocks); | |
4017 | ||
4018 | - printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", | |
4019 | - smp_processor_id(), t0, t1, delta, slice, clocks); | |
4020 | + printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", smp_processor_id(), t0, t1, delta, slice, clocks); | |
4021 | ||
4022 | __restore_flags(flags); | |
4023 | } | |
4024 | @@ -922,6 +920,26 @@ | |
4025 | ||
4026 | /* and update all other cpus */ | |
4027 | smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1); | |
4028 | +} | |
4029 | + | |
4030 | +void __init disable_APIC_timer(void) | |
4031 | +{ | |
4032 | + if (using_apic_timer) { | |
4033 | + unsigned long v; | |
4034 | + | |
4035 | + v = apic_read(APIC_LVTT); | |
4036 | + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); | |
4037 | + } | |
4038 | +} | |
4039 | + | |
4040 | +void enable_APIC_timer(void) | |
4041 | +{ | |
4042 | + if (using_apic_timer) { | |
4043 | + unsigned long v; | |
4044 | + | |
4045 | + v = apic_read(APIC_LVTT); | |
4046 | + apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); | |
4047 | + } | |
4048 | } | |
4049 | ||
4050 | /* | |
4051 | --- linux/arch/i386/kernel/nmi.c.orig Tue Feb 5 13:51:36 2002 | |
4052 | +++ linux/arch/i386/kernel/nmi.c Tue Feb 5 13:52:12 2002 | |
4053 | @@ -283,7 +283,7 @@ | |
4054 | * to get a message out. | |
4055 | */ | |
4056 | bust_spinlocks(1); | |
4057 | - printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu); | |
4058 | + printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); | |
4059 | show_registers(regs); | |
4060 | printk("console shuts up ...\n"); | |
4061 | console_silent(); | |
4062 | --- linux/arch/i386/kernel/smp.c.orig Tue Feb 5 13:51:49 2002 | |
4063 | +++ linux/arch/i386/kernel/smp.c Tue Feb 5 13:52:12 2002 | |
4064 | @@ -105,7 +105,7 @@ | |
4065 | /* The 'big kernel lock' */ | |
4066 | spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; | |
4067 | ||
4068 | -struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }}; | |
4069 | +struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }}; | |
4070 | ||
4071 | /* | |
4072 | * the following functions deal with sending IPIs between CPUs. | |
4073 | @@ -485,15 +485,54 @@ | |
4074 | do_flush_tlb_all_local(); | |
4075 | } | |
4076 | ||
4077 | +static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED; | |
4078 | +static task_t *new_task; | |
4079 | + | |
4080 | +/* | |
4081 | + * This function sends a 'task migration' IPI to another CPU. | |
4082 | + * Must be called from syscall contexts, with interrupts *enabled*. | |
4083 | + */ | |
4084 | +void smp_migrate_task(int cpu, task_t *p) | |
4085 | +{ | |
4086 | + /* | |
4087 | + * The target CPU will unlock the migration spinlock: | |
4088 | + */ | |
4089 | + spin_lock(&migration_lock); | |
4090 | + new_task = p; | |
4091 | + send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR); | |
4092 | +} | |
4093 | + | |
4094 | +/* | |
4095 | + * Task migration callback. | |
4096 | + */ | |
4097 | +asmlinkage void smp_task_migration_interrupt(void) | |
4098 | +{ | |
4099 | + task_t *p; | |
4100 | + | |
4101 | + ack_APIC_irq(); | |
4102 | + p = new_task; | |
4103 | + spin_unlock(&migration_lock); | |
4104 | + sched_task_migrated(p); | |
4105 | +} | |
4106 | /* | |
4107 | * this function sends a 'reschedule' IPI to another CPU. | |
4108 | * it goes straight through and wastes no time serializing | |
4109 | * anything. Worst case is that we lose a reschedule ... | |
4110 | */ | |
4111 | - | |
4112 | void smp_send_reschedule(int cpu) | |
4113 | { | |
4114 | send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR); | |
4115 | +} | |
4116 | + | |
4117 | +/* | |
4118 | + * this function sends a reschedule IPI to all (other) CPUs. | |
4119 | + * This should only be used if some 'global' task became runnable, | |
4120 | + * such as a RT task, that must be handled now. The first CPU | |
4121 | + * that manages to grab the task will run it. | |
4122 | + */ | |
4123 | +void smp_send_reschedule_all(void) | |
4124 | +{ | |
4125 | + send_IPI_allbutself(RESCHEDULE_VECTOR); | |
4126 | } | |
4127 | ||
4128 | /* | |
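The migration hand-off is deliberately asymmetric: the sending CPU acquires migration_lock and leaves it held across the IPI, and the target CPU releases it only after copying new_task, so the single shared slot cannot be overwritten by a second sender. In outline (comments only, restating the code above):

    /*
     *  sending CPU (syscall context)          target CPU (interrupt context)
     *  -----------------------------          ------------------------------
     *  spin_lock(&migration_lock);
     *  new_task = p;
     *  send_IPI_mask(1 << cpu,
     *                TASK_MIGRATION_VECTOR);
     *                                         ack_APIC_irq();
     *                                         p = new_task;
     *                                         spin_unlock(&migration_lock);
     *                                         sched_task_migrated(p);
     */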
4129 | --- linux/arch/i386/kernel/i8259.c.orig Tue Feb 5 13:51:36 2002 | |
4130 | +++ linux/arch/i386/kernel/i8259.c Tue Feb 5 13:52:12 2002 | |
4131 | @@ -79,6 +79,7 @@ | |
4132 | * through the ICC by us (IPIs) | |
4133 | */ | |
4134 | #ifdef CONFIG_SMP | |
4135 | +BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR) | |
4136 | BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) | |
4137 | BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) | |
4138 | BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) | |
4139 | @@ -472,6 +473,9 @@ | |
4140 | * IPI, driven by wakeup. | |
4141 | */ | |
4142 | set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | |
4143 | + | |
4144 | + /* IPI for task migration */ | |
4145 | + set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt); | |
4146 | ||
4147 | /* IPI for invalidation */ | |
4148 | set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); | |
4149 | --- linux/arch/i386/kernel/entry.S.orig Tue Feb 5 13:51:51 2002 | |
4150 | +++ linux/arch/i386/kernel/entry.S Tue Feb 5 13:52:12 2002 | |
4151 | @@ -77,7 +77,7 @@ | |
4152 | exec_domain = 16 | |
4153 | need_resched = 20 | |
4154 | tsk_ptrace = 24 | |
4155 | -processor = 52 | |
4156 | +cpu = 32 | |
4157 | ||
4158 | ENOSYS = 38 | |
4159 | ||
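These hand-maintained offsets have to track the task_struct reshuffle: with lock_depth ending at byte 31, the new cpu field sits at offset 32 (matching the "offset 32 begins here" comment in sched.h), replacing the old processor field at offset 52. A hypothetical compile-time check, not part of the patch:

    /* Hypothetical: tie the asm constant to the C layout. */
    typedef char asm_cpu_offset_is_32[(offsetof(struct task_struct, cpu) == 32) ? 1 : -1];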
4160 | @@ -176,9 +176,11 @@ | |
4161 | ||
4162 | ||
4163 | ENTRY(ret_from_fork) | |
4164 | +#if CONFIG_SMP | |
4165 | pushl %ebx | |
4166 | call SYMBOL_NAME(schedule_tail) | |
4167 | addl $4, %esp | |
4168 | +#endif | |
4169 | GET_CURRENT(%ebx) | |
4170 | testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS | |
4171 | jne tracesys_exit | |
4172 | --- linux/arch/i386/kernel/setup.c.orig Tue Feb 5 13:51:51 2002 | |
4173 | +++ linux/arch/i386/kernel/setup.c Tue Feb 5 13:52:12 2002 | |
4174 | @@ -2924,9 +2924,10 @@ | |
4175 | load_TR(nr); | |
4176 | load_LDT(&init_mm); | |
4177 | ||
4178 | - /* | |
4179 | - * Clear all 6 debug registers: | |
4180 | - */ | |
4181 | + /* Clear %fs and %gs. */ | |
4182 | + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); | |
4183 | + | |
4184 | + /* Clear all 6 debug registers: */ | |
4185 | ||
4186 | #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); | |
4187 |