1 diff -urN linux-2.4.24.org/arch/alpha/kernel/entry.S linux-2.4.24/arch/alpha/kernel/entry.S
2 --- linux-2.4.24.org/arch/alpha/kernel/entry.S 2004-02-04 20:50:50.273627588 +0100
3 +++ linux-2.4.24/arch/alpha/kernel/entry.S 2004-02-04 20:52:52.801142450 +0100
6 lda $26,ret_from_sys_call
14 diff -urN linux-2.4.24.org/arch/alpha/kernel/process.c linux-2.4.24/arch/alpha/kernel/process.c
15 --- linux-2.4.24.org/arch/alpha/kernel/process.c 2004-02-04 20:50:48.800933904 +0100
16 +++ linux-2.4.24/arch/alpha/kernel/process.c 2004-02-04 20:52:52.805141619 +0100
20 /* An endless idle loop with no priority at all. */
22 - current->counter = -100;
25 /* FIXME -- EV6 and LCA45 know how to power down
27 diff -urN linux-2.4.24.org/arch/alpha/kernel/smp.c linux-2.4.24/arch/alpha/kernel/smp.c
28 --- linux-2.4.24.org/arch/alpha/kernel/smp.c 2004-02-04 20:50:49.083875053 +0100
29 +++ linux-2.4.24/arch/alpha/kernel/smp.c 2004-02-04 20:52:52.820138499 +0100
31 int smp_num_probed; /* Internal processor count */
32 int smp_num_cpus = 1; /* Number that came online. */
33 int smp_threads_ready; /* True once the per process idle is forked. */
34 +cycles_t cacheflush_time;
35 +unsigned long cache_decay_ticks;
37 int __cpu_number_map[NR_CPUS];
38 int __cpu_logical_map[NR_CPUS];
41 int cpuid = hard_smp_processor_id();
43 - if (current != init_tasks[cpu_number_map(cpuid)]) {
44 - printk("BUG: smp_calling: cpu %d current %p init_tasks[cpu_number_map(cpuid)] %p\n",
45 - cpuid, current, init_tasks[cpu_number_map(cpuid)]);
48 DBGS(("CALLIN %d state 0x%lx\n", cpuid, current->state));
50 /* Turn on machine checks. */
52 DBGS(("smp_callin: commencing CPU %d current %p\n",
55 - /* Setup the scheduler for this processor. */
58 /* ??? This should be in init_idle. */
59 atomic_inc(&init_mm.mm_count);
60 current->active_mm = &init_mm;
67 + * Rough estimation for SMP scheduling, this is the number of cycles it
68 + * takes for a fully memory-limited process to flush the SMP-local cache.
70 + * We are not told how much cache there is, so we have to guess.
73 +smp_tune_scheduling (int cpuid)
75 + struct percpu_struct *cpu;
76 + unsigned long on_chip_cache; /* kB */
77 + unsigned long freq; /* Hz */
78 + unsigned long bandwidth = 350; /* MB/s */
80 + cpu = (struct percpu_struct*)((char*)hwrpb + hwrpb->processor_offset
81 + + cpuid * hwrpb->processor_size);
85 + on_chip_cache = 16 + 16;
90 + on_chip_cache = 8 + 8 + 96;
94 + on_chip_cache = 16 + 8;
100 + on_chip_cache = 64 + 64;
104 + freq = hwrpb->cycle_freq ? : est_cycle_freq;
106 + cacheflush_time = (freq / 1000000) * (on_chip_cache << 10) / bandwidth;
107 + cache_decay_ticks = cacheflush_time / (freq / 1000) * HZ / 1000;
109 + printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
110 + cacheflush_time/(freq/1000000),
111 + (cacheflush_time*100/(freq/1000000)) % 100);
112 + printk("task migration cache decay timeout: %ld msecs.\n",
113 + (cache_decay_ticks + 1) * 1000 / HZ);
117 * Send a message to a secondary's console. "START" is one such
118 * interesting message. ;-)
119 @@ -449,14 +494,11 @@
120 if (idle == &init_task)
121 panic("idle process is init_task for CPU %d", cpuid);
123 - idle->processor = cpuid;
124 - idle->cpus_runnable = 1 << cpuid; /* we schedule the first task manually */
125 + init_idle(idle, cpuid);
126 + unhash_process(idle);
128 __cpu_logical_map[cpunum] = cpuid;
129 __cpu_number_map[cpuid] = cpunum;
131 - del_from_runqueue(idle);
132 - unhash_process(idle);
133 - init_tasks[cpunum] = idle;
135 DBGS(("smp_boot_one_cpu: CPU %d state 0x%lx flags 0x%lx\n",
136 cpuid, idle->state, idle->flags));
137 @@ -563,13 +605,11 @@
139 __cpu_number_map[boot_cpuid] = 0;
140 __cpu_logical_map[0] = boot_cpuid;
141 - current->processor = boot_cpuid;
143 smp_store_cpu_info(boot_cpuid);
144 + smp_tune_scheduling(boot_cpuid);
145 smp_setup_percpu_timer(boot_cpuid);
149 /* ??? This should be in init_idle. */
150 atomic_inc(&init_mm.mm_count);
151 current->active_mm = &init_mm;
152 diff -urN linux-2.4.24.org/arch/arm/kernel/process.c linux-2.4.24/arch/arm/kernel/process.c
153 --- linux-2.4.24.org/arch/arm/kernel/process.c 2004-02-04 20:51:34.213488266 +0100
154 +++ linux-2.4.24/arch/arm/kernel/process.c 2004-02-04 20:52:52.824137668 +0100
157 /* endless idle loop with no priority at all */
159 - current->nice = 20;
160 - current->counter = -100;
163 void (*idle)(void) = pm_idle;
164 diff -urN linux-2.4.24.org/arch/i386/kernel/entry.S linux-2.4.24/arch/i386/kernel/entry.S
165 --- linux-2.4.24.org/arch/i386/kernel/entry.S 2004-02-04 20:50:47.376230238 +0100
166 +++ linux-2.4.24/arch/i386/kernel/entry.S 2004-02-04 20:52:52.828136836 +0100
182 call SYMBOL_NAME(schedule_tail)
186 testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
188 diff -urN linux-2.4.24.org/arch/i386/kernel/process.c linux-2.4.24/arch/i386/kernel/process.c
189 --- linux-2.4.24.org/arch/i386/kernel/process.c 2004-02-04 20:50:46.799350227 +0100
190 +++ linux-2.4.24/arch/i386/kernel/process.c 2004-02-04 20:52:52.833135796 +0100
193 if (current_cpu_data.hlt_works_ok && !hlt_counter) {
195 - if (!current->need_resched)
196 + if (!need_resched())
203 /* endless idle loop with no priority at all */
205 - current->nice = 20;
206 - current->counter = -100;
209 void (*idle)(void) = pm_idle;
210 @@ -665,15 +662,17 @@
211 asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
214 - * Restore %fs and %gs.
215 + * Restore %fs and %gs if needed.
217 - loadsegment(fs, next->fs);
218 - loadsegment(gs, next->gs);
219 + if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
220 + loadsegment(fs, next->fs);
221 + loadsegment(gs, next->gs);
225 * Now maybe reload the debug registers
227 - if (next->debugreg[7]){
228 + if (unlikely(next->debugreg[7])) {
236 - if (prev->ioperm || next->ioperm) {
237 + if (unlikely(prev->ioperm || next->ioperm)) {
240 * 4 cachelines copy ... not good, but not that
241 diff -urN linux-2.4.24.org/arch/i386/kernel/setup.c linux-2.4.24/arch/i386/kernel/setup.c
242 --- linux-2.4.24.org/arch/i386/kernel/setup.c 2004-02-04 20:50:46.790352099 +0100
243 +++ linux-2.4.24/arch/i386/kernel/setup.c 2004-02-04 20:52:52.840134340 +0100
244 @@ -3193,9 +3193,10 @@
246 load_LDT(&init_mm.context);
249 - * Clear all 6 debug registers:
251 + /* Clear %fs and %gs. */
252 + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
254 + /* Clear all 6 debug registers: */
256 #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
258 diff -urN linux-2.4.24.org/arch/i386/kernel/smpboot.c linux-2.4.24/arch/i386/kernel/smpboot.c
259 --- linux-2.4.24.org/arch/i386/kernel/smpboot.c 2004-02-04 20:50:46.762357921 +0100
260 +++ linux-2.4.24/arch/i386/kernel/smpboot.c 2004-02-04 20:52:52.864129350 +0100
261 @@ -308,14 +308,14 @@
262 if (tsc_values[i] < avg)
263 realdelta = -realdelta;
265 - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
267 + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
277 static void __init synchronize_tsc_ap (void)
279 * (This works even if the APIC is not enabled.)
281 phys_id = GET_APIC_ID(apic_read(APIC_ID));
282 - cpuid = current->processor;
284 if (test_and_set_bit(cpuid, &cpu_online_map)) {
285 printk("huh, phys CPU#%d, CPU#%d already present??\n",
289 smp_store_cpu_info(cpuid);
291 + disable_APIC_timer();
293 * Allow the master to continue.
297 while (!atomic_read(&smp_commenced))
299 + enable_APIC_timer();
301 * low-memory mappings have been cleared, flush them from
302 * the local TLBs too.
303 @@ -803,16 +805,13 @@
305 panic("No idle process for CPU %d", cpu);
307 - idle->processor = cpu;
308 - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
309 + init_idle(idle, cpu);
311 map_cpu_to_boot_apicid(cpu, apicid);
313 idle->thread.eip = (unsigned long) start_secondary;
315 - del_from_runqueue(idle);
316 unhash_process(idle);
317 - init_tasks[cpu] = idle;
319 /* start_eip had better be page-aligned! */
320 start_eip = setup_trampoline();
324 cycles_t cacheflush_time;
325 +unsigned long cache_decay_ticks;
327 static void smp_tune_scheduling (void)
330 cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
333 + cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
335 printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
336 (long)cacheflush_time/(cpu_khz/1000),
337 ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
338 + printk("task migration cache decay timeout: %ld msecs.\n",
339 + (cache_decay_ticks + 1) * 1000 / HZ);
343 @@ -1026,8 +1030,7 @@
344 map_cpu_to_boot_apicid(0, boot_cpu_apicid);
346 global_irq_holder = 0;
347 - current->processor = 0;
350 smp_tune_scheduling();
353 diff -urN linux-2.4.24.org/arch/i386/kernel/smp.c linux-2.4.24/arch/i386/kernel/smp.c
354 --- linux-2.4.24.org/arch/i386/kernel/smp.c 2004-02-04 20:50:47.312243547 +0100
355 +++ linux-2.4.24/arch/i386/kernel/smp.c 2004-02-04 20:52:52.868128518 +0100
360 + * this function sends a reschedule IPI to all (other) CPUs.
361 + * This should only be used if some 'global' task became runnable,
362 + * such as a RT task, that must be handled now. The first CPU
363 + * that manages to grab the task will run it.
365 +void smp_send_reschedule_all(void)
367 + send_IPI_allbutself(RESCHEDULE_VECTOR);
371 * Structure and data for smp_call_function(). This is designed to minimise
372 * static memory requirements. It also looks cleaner.
374 diff -urN linux-2.4.24.org/arch/mips64/kernel/process.c linux-2.4.24/arch/mips64/kernel/process.c
375 --- linux-2.4.24.org/arch/mips64/kernel/process.c 2004-02-04 20:51:53.268524907 +0100
376 +++ linux-2.4.24/arch/mips64/kernel/process.c 2004-02-04 20:52:52.872127686 +0100
379 /* endless idle loop with no priority at all */
381 - current->nice = 20;
382 - current->counter = -100;
385 while (!current->need_resched)
387 diff -urN linux-2.4.24.org/arch/parisc/kernel/process.c linux-2.4.24/arch/parisc/kernel/process.c
388 --- linux-2.4.24.org/arch/parisc/kernel/process.c 2004-02-04 20:51:58.602415484 +0100
389 +++ linux-2.4.24/arch/parisc/kernel/process.c 2004-02-04 20:52:52.876126854 +0100
392 /* endless idle loop with no priority at all */
394 - current->nice = 20;
395 - current->counter = -100;
398 while (!current->need_resched) {
399 diff -urN linux-2.4.24.org/arch/ppc/kernel/entry.S linux-2.4.24/arch/ppc/kernel/entry.S
400 --- linux-2.4.24.org/arch/ppc/kernel/entry.S 2004-02-04 20:51:15.913294629 +0100
401 +++ linux-2.4.24/arch/ppc/kernel/entry.S 2004-02-04 20:52:52.903121239 +0100
409 lwz r0,TASK_PTRACE(r2)
410 andi. r0,r0,PT_TRACESYS
412 diff -urN linux-2.4.24.org/arch/ppc/kernel/idle.c linux-2.4.24/arch/ppc/kernel/idle.c
413 --- linux-2.4.24.org/arch/ppc/kernel/idle.c 2004-02-04 20:51:16.300214151 +0100
414 +++ linux-2.4.24/arch/ppc/kernel/idle.c 2004-02-04 20:52:52.908120200 +0100
418 /* endless loop with no priority at all */
419 - current->nice = 20;
420 - current->counter = -100;
425 if (!do_power_save) {
426 diff -urN linux-2.4.24.org/arch/ppc/kernel/mk_defs.c linux-2.4.24/arch/ppc/kernel/mk_defs.c
427 --- linux-2.4.24.org/arch/ppc/kernel/mk_defs.c 2004-02-04 20:51:14.150661249 +0100
428 +++ linux-2.4.24/arch/ppc/kernel/mk_defs.c 2004-02-04 20:52:52.913119160 +0100
430 /*DEFINE(KERNELBASE, KERNELBASE);*/
431 DEFINE(STATE, offsetof(struct task_struct, state));
432 DEFINE(NEXT_TASK, offsetof(struct task_struct, next_task));
433 - DEFINE(COUNTER, offsetof(struct task_struct, counter));
434 - DEFINE(PROCESSOR, offsetof(struct task_struct, processor));
435 + DEFINE(COUNTER, offsetof(struct task_struct, time_slice));
436 + DEFINE(PROCESSOR, offsetof(struct task_struct, cpu));
437 DEFINE(SIGPENDING, offsetof(struct task_struct, sigpending));
438 DEFINE(THREAD, offsetof(struct task_struct, thread));
439 DEFINE(MM, offsetof(struct task_struct, mm));
440 diff -urN linux-2.4.24.org/arch/ppc/kernel/process.c linux-2.4.24/arch/ppc/kernel/process.c
441 --- linux-2.4.24.org/arch/ppc/kernel/process.c 2004-02-04 20:51:14.062679549 +0100
442 +++ linux-2.4.24/arch/ppc/kernel/process.c 2004-02-04 20:52:52.917118328 +0100
447 - printk(" CPU: %d", current->processor);
448 + printk(" CPU: %d", current->cpu);
449 #endif /* CONFIG_SMP */
452 diff -urN linux-2.4.24.org/arch/ppc/kernel/smp.c linux-2.4.24/arch/ppc/kernel/smp.c
453 --- linux-2.4.24.org/arch/ppc/kernel/smp.c 2004-02-04 20:51:15.993277992 +0100
454 +++ linux-2.4.24/arch/ppc/kernel/smp.c 2004-02-04 20:52:52.923117080 +0100
456 unsigned long cpu_online_map;
457 int smp_hw_index[NR_CPUS];
458 static struct smp_ops_t *smp_ops;
459 +unsigned long cache_decay_ticks = HZ/100;
461 /* all cpu mappings are 1-1 -- Cort */
462 volatile unsigned long cpu_callin_map[NR_CPUS];
464 * cpu 0, the master -- Cort
466 cpu_callin_map[0] = 1;
467 - current->processor = 0;
472 for (i = 0; i < NR_CPUS; i++) {
475 p = init_task.prev_task;
477 panic("No idle task for CPU %d", i);
478 - del_from_runqueue(p);
484 - p->cpus_runnable = 1 << i; /* we schedule the first task manually */
490 void __init smp_callin(void)
492 - int cpu = current->processor;
493 + int cpu = current->cpu;
495 smp_store_cpu_info(cpu);
496 smp_ops->setup_cpu(cpu);
497 diff -urN linux-2.4.24.org/arch/ppc/lib/dec_and_lock.c linux-2.4.24/arch/ppc/lib/dec_and_lock.c
498 --- linux-2.4.24.org/arch/ppc/lib/dec_and_lock.c 2004-02-04 20:51:18.406775995 +0100
499 +++ linux-2.4.24/arch/ppc/lib/dec_and_lock.c 2004-02-04 20:52:52.927116249 +0100
501 #include <linux/module.h>
502 +#include <linux/sched.h>
503 #include <linux/spinlock.h>
504 #include <asm/atomic.h>
505 #include <asm/system.h>
506 diff -urN linux-2.4.24.org/arch/ppc/mm/init.c linux-2.4.24/arch/ppc/mm/init.c
507 --- linux-2.4.24.org/arch/ppc/mm/init.c 2004-02-04 20:51:13.814731121 +0100
508 +++ linux-2.4.24/arch/ppc/mm/init.c 2004-02-04 20:52:52.931115417 +0100
513 - printk("%3d ", p->processor);
514 - if ( (p->processor != NO_PROC_ID) &&
515 - (p == current_set[p->processor]) )
516 + printk("%3d ", p->cpu);
517 + if ( (p->cpu != NO_PROC_ID) &&
518 + (p == current_set[p->cpu]) )
522 diff -urN linux-2.4.24.org/arch/ppc64/kernel/entry.S linux-2.4.24/arch/ppc64/kernel/entry.S
523 --- linux-2.4.24.org/arch/ppc64/kernel/entry.S 2004-02-04 20:50:43.056128805 +0100
524 +++ linux-2.4.24/arch/ppc64/kernel/entry.S 2004-02-04 20:53:40.136297052 +0100
528 _GLOBAL(ret_from_fork)
532 ld r4,PACACURRENT(r13)
533 ld r0,TASK_PTRACE(r4)
534 andi. r0,r0,PT_TRACESYS
535 diff -urN linux-2.4.24.org/arch/ppc64/kernel/idle.c linux-2.4.24/arch/ppc64/kernel/idle.c
536 --- linux-2.4.24.org/arch/ppc64/kernel/idle.c 2004-02-04 20:50:43.329072034 +0100
537 +++ linux-2.4.24/arch/ppc64/kernel/idle.c 2004-02-04 20:55:09.907625341 +0100
541 /* endless loop with no priority at all */
542 - current->nice = 20;
543 - current->counter = -100;
546 /* ensure iSeries run light will be out when idle */
547 current->thread.flags &= ~PPC_FLAG_RUN_LIGHT;
555 diff -urN linux-2.4.24.org/arch/ppc64/kernel/process.c linux-2.4.24/arch/ppc64/kernel/process.c
556 --- linux-2.4.24.org/arch/ppc64/kernel/process.c 2004-02-04 20:50:42.774187448 +0100
557 +++ linux-2.4.24/arch/ppc64/kernel/process.c 2004-02-04 20:52:52.986103980 +0100
559 #ifdef SHOW_TASK_SWITCHES
560 printk("%s/%d -> %s/%d NIP %08lx cpu %d root %x/%x\n",
561 prev->comm,prev->pid,
562 - new->comm,new->pid,new->thread.regs->nip,new->processor,
563 + new->comm,new->pid,new->thread.regs->nip,new->cpu,
564 new->fs->root,prev->fs->root);
567 diff -urN linux-2.4.24.org/arch/ppc64/kernel/smp.c linux-2.4.24/arch/ppc64/kernel/smp.c
568 --- linux-2.4.24.org/arch/ppc64/kernel/smp.c 2004-02-04 20:50:43.176103851 +0100
569 +++ linux-2.4.24/arch/ppc64/kernel/smp.c 2004-02-04 20:52:52.990103148 +0100
571 extern atomic_t ipi_sent;
572 spinlock_t kernel_flag __cacheline_aligned = SPIN_LOCK_UNLOCKED;
573 cycles_t cacheflush_time;
574 +unsigned long cache_decay_ticks = HZ/100;
575 static int max_cpus __initdata = NR_CPUS;
577 unsigned long cpu_online_map;
579 * cpu 0, the master -- Cort
581 cpu_callin_map[0] = 1;
582 - current->processor = 0;
587 for (i = 0; i < NR_CPUS; i++) {
588 paca[i].prof_counter = 1;
591 PPCDBG(PPCDBG_SMP,"\tProcessor %d, task = 0x%lx\n", i, p);
593 - del_from_runqueue(p);
599 - p->cpus_runnable = 1 << i; /* we schedule the first task manually */
600 current_set[i].task = p;
601 sp = ((unsigned long)p) + sizeof(union task_union)
602 - STACK_FRAME_OVERHEAD;
605 void __init smp_callin(void)
607 - int cpu = current->processor;
608 + int cpu = current->cpu;
610 smp_store_cpu_info(cpu);
611 set_dec(paca[cpu].default_decr);
614 ppc_md.smp_setup_cpu(cpu);
618 set_bit(smp_processor_id(), &cpu_online_map);
620 while(!smp_commenced) {
625 - cpu = current->processor;
626 + cpu = current->cpu;
627 atomic_inc(&init_mm.mm_count);
628 current->active_mm = &init_mm;
630 diff -urN linux-2.4.24.org/arch/s390/kernel/process.c linux-2.4.24/arch/s390/kernel/process.c
631 --- linux-2.4.24.org/arch/s390/kernel/process.c 2004-02-04 20:51:56.088938275 +0100
632 +++ linux-2.4.24/arch/s390/kernel/process.c 2004-02-04 20:52:52.994102316 +0100
635 /* endless idle loop with no priority at all */
637 - current->nice = 20;
638 - current->counter = -100;
642 if (current->need_resched) {
643 diff -urN linux-2.4.24.org/arch/s390x/kernel/process.c linux-2.4.24/arch/s390x/kernel/process.c
644 --- linux-2.4.24.org/arch/s390x/kernel/process.c 2004-02-04 20:52:03.781338295 +0100
645 +++ linux-2.4.24/arch/s390x/kernel/process.c 2004-02-04 20:52:52.997101692 +0100
648 /* endless idle loop with no priority at all */
650 - current->nice = 20;
651 - current->counter = -100;
655 if (current->need_resched) {
656 diff -urN linux-2.4.24.org/arch/sh/kernel/process.c linux-2.4.24/arch/sh/kernel/process.c
657 --- linux-2.4.24.org/arch/sh/kernel/process.c 2004-02-04 20:51:43.820490054 +0100
658 +++ linux-2.4.24/arch/sh/kernel/process.c 2004-02-04 20:52:53.000101068 +0100
661 /* endless idle loop with no priority at all */
663 - current->nice = 20;
664 - current->counter = -100;
668 diff -urN linux-2.4.24.org/arch/sparc/kernel/entry.S linux-2.4.24/arch/sparc/kernel/entry.S
669 --- linux-2.4.24.org/arch/sparc/kernel/entry.S 2004-02-04 20:50:51.877294031 +0100
670 +++ linux-2.4.24/arch/sparc/kernel/entry.S 2004-02-04 20:52:53.005100028 +0100
671 @@ -1471,7 +1471,9 @@
673 .globl C_LABEL(ret_from_fork)
674 C_LABEL(ret_from_fork):
679 b C_LABEL(ret_sys_call)
680 ld [%sp + STACKFRAME_SZ + PT_I0], %o0
681 diff -urN linux-2.4.24.org/arch/sparc/kernel/process.c linux-2.4.24/arch/sparc/kernel/process.c
682 --- linux-2.4.24.org/arch/sparc/kernel/process.c 2004-02-04 20:50:51.550362032 +0100
683 +++ linux-2.4.24/arch/sparc/kernel/process.c 2004-02-04 20:52:53.009099197 +0100
687 /* endless idle loop with no priority at all */
688 - current->nice = 20;
689 - current->counter = -100;
693 if (ARCH_SUN4C_SUN4) {
697 /* endless idle loop with no priority at all */
698 - current->nice = 20;
699 - current->counter = -100;
703 if(current->need_resched) {
704 diff -urN linux-2.4.24.org/arch/sparc/kernel/smp.c linux-2.4.24/arch/sparc/kernel/smp.c
705 --- linux-2.4.24.org/arch/sparc/kernel/smp.c 2004-02-04 20:50:51.522367854 +0100
706 +++ linux-2.4.24/arch/sparc/kernel/smp.c 2004-02-04 20:52:53.013098365 +0100
708 volatile int __cpu_number_map[NR_CPUS];
709 volatile int __cpu_logical_map[NR_CPUS];
710 cycles_t cacheflush_time = 0; /* XXX */
711 +unsigned long cache_decay_ticks = HZ/100; /* XXX */
713 /* The only guaranteed locking primitive available on all Sparc
714 * processors is 'ldstub [%reg + immediate], %dest_reg' which atomically
715 diff -urN linux-2.4.24.org/arch/sparc/kernel/sun4d_smp.c linux-2.4.24/arch/sparc/kernel/sun4d_smp.c
716 --- linux-2.4.24.org/arch/sparc/kernel/sun4d_smp.c 2004-02-04 20:50:51.254423586 +0100
717 +++ linux-2.4.24/arch/sparc/kernel/sun4d_smp.c 2004-02-04 20:52:53.027095454 +0100
719 * the SMP initialization the master will be just allowed
720 * to call the scheduler code.
724 /* Get our local ticker going. */
725 smp_setup_percpu_timer();
727 while((unsigned long)current_set[cpuid] < PAGE_OFFSET)
730 - while(current_set[cpuid]->processor != cpuid)
731 + while(current_set[cpuid]->cpu != cpuid)
734 /* Fix idle thread fields. */
737 __cpu_number_map[boot_cpu_id] = 0;
738 __cpu_logical_map[0] = boot_cpu_id;
739 - current->processor = boot_cpu_id;
740 smp_store_cpu_info(boot_cpu_id);
741 smp_setup_percpu_timer();
743 local_flush_cache_all();
744 if(linux_num_cpus == 1)
745 return; /* Not an MP box. */
746 @@ -222,14 +219,10 @@
749 p = init_task.prev_task;
753 - p->cpus_runnable = 1 << i; /* we schedule the first task manually */
757 - del_from_runqueue(p);
761 for (no = 0; no < linux_num_cpus; no++)
762 diff -urN linux-2.4.24.org/arch/sparc/kernel/sun4m_smp.c linux-2.4.24/arch/sparc/kernel/sun4m_smp.c
763 --- linux-2.4.24.org/arch/sparc/kernel/sun4m_smp.c 2004-02-04 20:50:52.194228110 +0100
764 +++ linux-2.4.24/arch/sparc/kernel/sun4m_smp.c 2004-02-04 20:52:53.030094830 +0100
766 * the SMP initialization the master will be just allowed
767 * to call the scheduler code.
771 /* Allow master to continue. */
772 swap((unsigned long *)&cpu_callin_map[cpuid], 1);
773 @@ -170,12 +169,10 @@
774 mid_xlate[boot_cpu_id] = (linux_cpus[boot_cpu_id].mid & ~8);
775 __cpu_number_map[boot_cpu_id] = 0;
776 __cpu_logical_map[0] = boot_cpu_id;
777 - current->processor = boot_cpu_id;
779 smp_store_cpu_info(boot_cpu_id);
780 set_irq_udt(mid_xlate[boot_cpu_id]);
781 smp_setup_percpu_timer();
783 local_flush_cache_all();
784 if(linux_num_cpus == 1)
785 return; /* Not an MP box. */
786 @@ -195,14 +192,10 @@
789 p = init_task.prev_task;
793 - p->cpus_runnable = 1 << i; /* we schedule the first task manually */
797 - del_from_runqueue(p);
801 /* See trampoline.S for details... */
802 diff -urN linux-2.4.24.org/arch/sparc64/kernel/entry.S linux-2.4.24/arch/sparc64/kernel/entry.S
803 --- linux-2.4.24.org/arch/sparc64/kernel/entry.S 2004-02-04 20:51:29.076556726 +0100
804 +++ linux-2.4.24/arch/sparc64/kernel/entry.S 2004-02-04 20:52:53.039092958 +0100
805 @@ -1627,7 +1627,9 @@
807 andn %o7, SPARC_FLAG_NEWCHILD, %l0
808 mov %g5, %o0 /* 'prev' */
812 stb %l0, [%g6 + AOFF_task_thread + AOFF_thread_flags]
813 andcc %l0, SPARC_FLAG_PERFCTR, %g0
815 diff -urN linux-2.4.24.org/arch/sparc64/kernel/irq.c linux-2.4.24/arch/sparc64/kernel/irq.c
816 --- linux-2.4.24.org/arch/sparc64/kernel/irq.c 2004-02-04 20:51:28.993573986 +0100
817 +++ linux-2.4.24/arch/sparc64/kernel/irq.c 2004-02-04 20:52:53.044091918 +0100
819 tid = ((tid & UPA_CONFIG_MID) << 9);
822 - tid = (starfire_translate(imap, current->processor) << 26);
823 + tid = (starfire_translate(imap, current->cpu) << 26);
827 diff -urN linux-2.4.24.org/arch/sparc64/kernel/process.c linux-2.4.24/arch/sparc64/kernel/process.c
828 --- linux-2.4.24.org/arch/sparc64/kernel/process.c 2004-02-04 20:51:29.998364993 +0100
829 +++ linux-2.4.24/arch/sparc64/kernel/process.c 2004-02-04 20:52:53.049090879 +0100
833 /* endless idle loop with no priority at all */
834 - current->nice = 20;
835 - current->counter = -100;
839 /* If current->need_resched is zero we should really
842 * the idle loop on a UltraMultiPenguin...
844 -#define idle_me_harder() (cpu_data[current->processor].idle_volume += 1)
845 -#define unidle_me() (cpu_data[current->processor].idle_volume = 0)
846 +#define idle_me_harder() (cpu_data[current->cpu].idle_volume += 1)
847 +#define unidle_me() (cpu_data[current->cpu].idle_volume = 0)
850 - current->nice = 20;
851 - current->counter = -100;
855 if (current->need_resched != 0) {
857 diff -urN linux-2.4.24.org/arch/sparc64/kernel/rtrap.S linux-2.4.24/arch/sparc64/kernel/rtrap.S
858 --- linux-2.4.24.org/arch/sparc64/kernel/rtrap.S 2004-02-04 20:51:29.910383293 +0100
859 +++ linux-2.4.24/arch/sparc64/kernel/rtrap.S 2004-02-04 20:52:53.053090047 +0100
862 .globl rtrap_clr_l6, rtrap, irqsz_patchme, rtrap_xcall
863 rtrap_clr_l6: clr %l6
864 -rtrap: lduw [%g6 + AOFF_task_processor], %l0
865 +rtrap: lduw [%g6 + AOFF_task_cpu], %l0
866 sethi %hi(irq_stat), %l2 ! &softirq_active
867 or %l2, %lo(irq_stat), %l2 ! &softirq_active
868 irqsz_patchme: sllx %l0, 0, %l0
869 diff -urN linux-2.4.24.org/arch/sparc64/kernel/smp.c linux-2.4.24/arch/sparc64/kernel/smp.c
870 --- linux-2.4.24.org/arch/sparc64/kernel/smp.c 2004-02-04 20:51:28.749624726 +0100
871 +++ linux-2.4.24/arch/sparc64/kernel/smp.c 2004-02-04 20:52:53.068086928 +0100
874 extern unsigned long sparc64_cpu_startup;
876 +static void __init smp_tune_scheduling(void);
878 /* The OBP cpu startup callback truncates the 3rd arg cookie to
879 * 32-bits (I think) so to be safe we have it read the pointer
880 * contained here so we work on >4GB machines. -DaveM
882 printk("Entering UltraSMPenguin Mode...\n");
884 smp_store_cpu_info(boot_cpu_id);
886 + smp_tune_scheduling();
888 if (linux_num_cpus == 1)
893 p = init_task.prev_task;
894 - init_tasks[cpucount] = p;
897 - p->cpus_runnable = 1UL << i; /* we schedule the first task manually */
899 - del_from_runqueue(p);
904 @@ -1214,10 +1210,96 @@
905 __cpu_number_map[boot_cpu_id] = 0;
906 prom_cpu_nodes[boot_cpu_id] = linux_cpus[0].prom_node;
907 __cpu_logical_map[0] = boot_cpu_id;
908 - current->processor = boot_cpu_id;
909 prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
912 +cycles_t cacheflush_time;
913 +unsigned long cache_decay_ticks;
915 +extern unsigned long cheetah_tune_scheduling(void);
917 +static void __init smp_tune_scheduling(void)
919 + unsigned long orig_flush_base, flush_base, flags, *p;
920 + unsigned int ecache_size, order;
921 + cycles_t tick1, tick2, raw;
923 + /* Approximate heuristic for SMP scheduling. It is an
924 + * estimation of the time it takes to flush the L2 cache
925 + * on the local processor.
927 + * The ia32 chooses to use the L1 cache flush time instead,
928 + * and I consider this complete nonsense. The Ultra can service
929 + * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
930 + * L2 misses are what create extra bus traffic (ie. the "cost"
931 + * of moving a process from one cpu to another).
933 + printk("SMP: Calibrating ecache flush... ");
934 + if (tlb_type == cheetah || tlb_type == cheetah_plus) {
935 + cacheflush_time = cheetah_tune_scheduling();
939 + ecache_size = prom_getintdefault(linux_cpus[0].prom_node,
940 + "ecache-size", (512 * 1024));
941 + if (ecache_size > (4 * 1024 * 1024))
942 + ecache_size = (4 * 1024 * 1024);
943 + orig_flush_base = flush_base =
944 + __get_free_pages(GFP_KERNEL, order = get_order(ecache_size));
946 + if (flush_base != 0UL) {
947 + local_irq_save(flags);
949 + /* Scan twice the size once just to get the TLB entries
950 + * loaded and make sure the second scan measures pure misses.
952 + for (p = (unsigned long *)flush_base;
953 + ((unsigned long)p) < (flush_base + (ecache_size<<1));
954 + p += (64 / sizeof(unsigned long)))
955 + *((volatile unsigned long *)p);
957 + tick1 = tick_ops->get_tick();
959 + __asm__ __volatile__("1:\n\t"
960 + "ldx [%0 + 0x000], %%g1\n\t"
961 + "ldx [%0 + 0x040], %%g2\n\t"
962 + "ldx [%0 + 0x080], %%g3\n\t"
963 + "ldx [%0 + 0x0c0], %%g5\n\t"
964 + "add %0, 0x100, %0\n\t"
966 + "bne,pt %%xcc, 1b\n\t"
968 + : "=&r" (flush_base)
969 + : "0" (flush_base),
970 + "r" (flush_base + ecache_size)
971 + : "g1", "g2", "g3", "g5");
973 + tick2 = tick_ops->get_tick();
975 + local_irq_restore(flags);
977 + raw = (tick2 - tick1);
979 + /* Dampen it a little, considering two processes
980 + * sharing the cache and fitting.
982 + cacheflush_time = (raw - (raw >> 2));
984 + free_pages(orig_flush_base, order);
986 + cacheflush_time = ((ecache_size << 2) +
987 + (ecache_size << 1));
990 + /* Convert ticks/sticks to jiffies. */
991 + cache_decay_ticks = cacheflush_time / timer_tick_offset;
992 + if (cache_decay_ticks < 1)
993 + cache_decay_ticks = 1;
995 + printk("Using heuristic of %ld cycles, %ld ticks.\n",
996 + cacheflush_time, cache_decay_ticks);
999 static inline unsigned long find_flush_base(unsigned long size)
1001 struct page *p = mem_map;
1002 diff -urN linux-2.4.24.org/arch/sparc64/kernel/trampoline.S linux-2.4.24/arch/sparc64/kernel/trampoline.S
1003 --- linux-2.4.24.org/arch/sparc64/kernel/trampoline.S 2004-02-04 20:51:29.425484150 +0100
1004 +++ linux-2.4.24/arch/sparc64/kernel/trampoline.S 2004-02-04 20:52:53.073085888 +0100
1006 wrpr %o1, PSTATE_IG, %pstate
1008 /* Get our UPA MID. */
1009 - lduw [%o2 + AOFF_task_processor], %g1
1010 + lduw [%o2 + AOFF_task_cpu], %g1
1011 sethi %hi(cpu_data), %g5
1012 or %g5, %lo(cpu_data), %g5
1014 diff -urN linux-2.4.24.org/arch/sparc64/kernel/traps.c linux-2.4.24/arch/sparc64/kernel/traps.c
1015 --- linux-2.4.24.org/arch/sparc64/kernel/traps.c 2004-02-04 20:51:28.672640738 +0100
1016 +++ linux-2.4.24/arch/sparc64/kernel/traps.c 2004-02-04 20:52:53.078084848 +0100
1018 #include <linux/smp.h>
1019 #include <linux/smp_lock.h>
1020 #include <linux/mm.h>
1021 +#include <linux/init.h>
1023 #include <asm/delay.h>
1024 #include <asm/system.h>
1025 @@ -755,6 +756,48 @@
1026 "i" (ASI_PHYS_USE_EC));
1030 +unsigned long __init cheetah_tune_scheduling(void)
1032 + unsigned long tick1, tick2, raw;
1033 + unsigned long flush_base = ecache_flush_physbase;
1034 + unsigned long flush_linesize = ecache_flush_linesize;
1035 + unsigned long flush_size = ecache_flush_size;
1037 + /* Run through the whole cache to guarentee the timed loop
1038 + * is really displacing cache lines.
1040 + __asm__ __volatile__("1: subcc %0, %4, %0\n\t"
1041 + " bne,pt %%xcc, 1b\n\t"
1042 + " ldxa [%2 + %0] %3, %%g0\n\t"
1043 + : "=&r" (flush_size)
1044 + : "0" (flush_size), "r" (flush_base),
1045 + "i" (ASI_PHYS_USE_EC), "r" (flush_linesize));
1047 + /* The flush area is 2 X Ecache-size, so cut this in half for
1050 + flush_base = ecache_flush_physbase;
1051 + flush_linesize = ecache_flush_linesize;
1052 + flush_size = ecache_flush_size >> 1;
1054 + __asm__ __volatile__("rd %%tick, %0" : "=r" (tick1));
1056 + __asm__ __volatile__("1: subcc %0, %4, %0\n\t"
1057 + " bne,pt %%xcc, 1b\n\t"
1058 + " ldxa [%2 + %0] %3, %%g0\n\t"
1059 + : "=&r" (flush_size)
1060 + : "0" (flush_size), "r" (flush_base),
1061 + "i" (ASI_PHYS_USE_EC), "r" (flush_linesize));
1063 + __asm__ __volatile__("rd %%tick, %0" : "=r" (tick2));
1065 + raw = (tick2 - tick1);
1067 + return (raw - (raw >> 2));
1071 /* Unfortunately, the diagnostic access to the I-cache tags we need to
1072 * use to clear the thing interferes with I-cache coherency transactions.
1074 diff -urN linux-2.4.24.org/Documentation/sched-coding.txt linux-2.4.24/Documentation/sched-coding.txt
1075 --- linux-2.4.24.org/Documentation/sched-coding.txt 1970-01-01 01:00:00.000000000 +0100
1076 +++ linux-2.4.24/Documentation/sched-coding.txt 2004-02-04 20:52:53.082084016 +0100
1078 + Reference for various scheduler-related methods in the O(1) scheduler
1079 + Robert Love <rml@tech9.net>, MontaVista Software
1082 +Note most of these methods are local to kernel/sched.c - this is by design.
1083 +The scheduler is meant to be self-contained and abstracted away. This document
1084 +is primarily for understanding the scheduler, not interfacing to it. Some of
1085 +the discussed interfaces, however, are general process/scheduling methods.
1086 +They are typically defined in include/linux/sched.h.
1089 +Main Scheduling Methods
1090 +-----------------------
1092 +void load_balance(runqueue_t *this_rq, int idle)
1093 + Attempts to pull tasks from one cpu to another to balance cpu usage,
1094 + if needed. This method is called explicitly if the runqueues are
1095 + inbalanced or periodically by the timer tick. Prior to calling,
1096 + the current runqueue must be locked and interrupts disabled.
1099 + The main scheduling function. Upon return, the highest priority
1100 + process will be active.
1106 +Each runqueue has its own lock, rq->lock. When multiple runqueues need
1107 +to be locked, lock acquires must be ordered by ascending &runqueue value.
1109 +A specific runqueue is locked via
1111 + task_rq_lock(task_t pid, unsigned long *flags)
1113 +which disables preemption, disables interrupts, and locks the runqueue pid is
1114 +running on. Likewise,
1116 + task_rq_unlock(task_t pid, unsigned long *flags)
1118 +unlocks the runqueue pid is running on, restores interrupts to their previous
1119 +state, and reenables preemption.
1123 + double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1127 + double_rq_unlock(runqueue_t *rq1, runqueue_t rq2)
1129 +safely lock and unlock, respectively, the two specified runqueues. They do
1130 +not, however, disable and restore interrupts. Users are required to do so
1131 +manually before and after calls.
1138 + The maximum priority of the system, stored in the task as task->prio.
1139 + Lower priorities are higher. Normal (non-RT) priorities range from
1140 + MAX_RT_PRIO to (MAX_PRIO - 1).
1142 + The maximum real-time priority of the system. Valid RT priorities
1143 + range from 0 to (MAX_RT_PRIO - 1).
1145 + The maximum real-time priority that is exported to user-space. Should
1146 + always be equal to or less than MAX_RT_PRIO. Setting it less allows
1147 + kernel threads to have higher priorities than any user-space task.
1150 + Respectively, the minimum and maximum timeslices (quanta) of a process.
1156 + The main per-CPU runqueue data structure.
1158 + The main per-process data structure.
1165 + Returns the runqueue of the specified cpu.
1167 + Returns the runqueue of the current cpu.
1169 + Returns the runqueue which holds the specified pid.
1171 + Returns the task currently running on the given cpu.
1173 + Returns true if pid is real-time, false if not.
1176 +Process Control Methods
1177 +-----------------------
1179 +void set_user_nice(task_t *p, long nice)
1180 + Sets the "nice" value of task p to the given value.
1181 +int setscheduler(pid_t pid, int policy, struct sched_param *param)
1182 + Sets the scheduling policy and parameters for the given pid.
1183 +void set_cpus_allowed(task_t *p, unsigned long new_mask)
1184 + Sets a given task's CPU affinity and migrates it to a proper cpu.
1185 + Callers must have a valid reference to the task and ensure the
1186 + task does not exit prematurely. No locks can be held during the call.
1187 +set_task_state(tsk, state_value)
1188 + Sets the given task's state to the given value.
1189 +set_current_state(state_value)
1190 + Sets the current task's state to the given value.
1191 +void set_tsk_need_resched(struct task_struct *tsk)
1192 + Sets need_resched in the given task.
1193 +void clear_tsk_need_resched(struct task_struct *tsk)
1194 + Clears need_resched in the given task.
1195 +void set_need_resched()
1196 + Sets need_resched in the current task.
1197 +void clear_need_resched()
1198 + Clears need_resched in the current task.
1200 + Returns true if need_resched is set in the current task, false
1203 + Place the current process at the end of the runqueue and call schedule.
1204 diff -urN linux-2.4.24.org/Documentation/sched-design.txt linux-2.4.24/Documentation/sched-design.txt
1205 --- linux-2.4.24.org/Documentation/sched-design.txt 1970-01-01 01:00:00.000000000 +0100
1206 +++ linux-2.4.24/Documentation/sched-design.txt 2004-02-04 20:52:53.088082769 +0100
1208 + Goals, Design and Implementation of the
1209 + new ultra-scalable O(1) scheduler
1212 + This is an edited version of an email Ingo Molnar sent to
1213 + lkml on 4 Jan 2002. It describes the goals, design, and
1214 + implementation of Ingo's new ultra-scalable O(1) scheduler.
1215 + Last Updated: 18 April 2002.
1221 +The main goal of the new scheduler is to keep all the good things we know
1222 +and love about the current Linux scheduler:
1224 + - good interactive performance even during high load: if the user
1225 + types or clicks then the system must react instantly and must execute
1226 + the user tasks smoothly, even during considerable background load.
1228 + - good scheduling/wakeup performance with 1-2 runnable processes.
1230 + - fairness: no process should stay without any timeslice for any
1231 + unreasonable amount of time. No process should get an unjustly high
1232 + amount of CPU time.
1234 + - priorities: less important tasks can be started with lower priority,
1235 + more important tasks with higher priority.
1237 + - SMP efficiency: no CPU should stay idle if there is work to do.
1239 + - SMP affinity: processes which run on one CPU should stay affine to
1240 + that CPU. Processes should not bounce between CPUs too frequently.
1242 + - plus additional scheduler features: RT scheduling, CPU binding.
1244 +and the goal is also to add a few new things:
1246 + - fully O(1) scheduling. Are you tired of the recalculation loop
1247 + blowing the L1 cache away every now and then? Do you think the goodness
1248 + loop is taking a bit too long to finish if there are lots of runnable
1249 + processes? This new scheduler takes no prisoners: wakeup(), schedule(),
1250 + the timer interrupt are all O(1) algorithms. There is no recalculation
1251 + loop. There is no goodness loop either.
1253 + - 'perfect' SMP scalability. With the new scheduler there is no 'big'
1254 + runqueue_lock anymore - it's all per-CPU runqueues and locks - two
1255 + tasks on two separate CPUs can wake up, schedule and context-switch
1256 + completely in parallel, without any interlocking. All
1257 + scheduling-relevant data is structured for maximum scalability.
1259 + - better SMP affinity. The old scheduler has a particular weakness that
1260 + causes the random bouncing of tasks between CPUs if/when higher
1261 + priority/interactive tasks are present; this was observed and reported by many
1262 + people. The reason is that the timeslice recalculation loop first needs
1263 + every currently running task to consume its timeslice. But when this
1264 + happens on eg. an 8-way system, then this property starves an
1265 + increasing number of CPUs from executing any process. Once the last
1266 + task that has a timeslice left has finished using up that timeslice,
1267 + the recalculation loop is triggered and other CPUs can start executing
1268 + tasks again - after having idled around for a number of timer ticks.
1269 + The more CPUs, the worse this effect.
1271 + Furthermore, this same effect causes the bouncing effect as well:
1272 + whenever there is such a 'timeslice squeeze' of the global runqueue,
1273 + idle processors start executing tasks which are not affine to that CPU.
1274 + (because the affine tasks have finished off their timeslices already.)
1276 + The new scheduler solves this problem by distributing timeslices on a
1277 + per-CPU basis, without having any global synchronization or
1280 + - batch scheduling. A significant proportion of computing-intensive tasks
1281 + benefit from batch-scheduling, where timeslices are long and processes
1282 + are roundrobin scheduled. The new scheduler does such batch-scheduling
1283 + of the lowest priority tasks - so nice +19 jobs will get
1284 + 'batch-scheduled' automatically. With this scheduler, nice +19 jobs are
1285 + in essence SCHED_IDLE, from an interactiveness point of view.
1287 + - handle extreme loads more smoothly, without breakdown and scheduling
1290 + - O(1) RT scheduling. For those RT folks who are paranoid about the
1291 + O(nr_running) property of the goodness loop and the recalculation loop.
1293 + - run fork()ed children before the parent. Andrea has pointed out the
1294 + advantages of this a few months ago, but patches for this feature
1295 + do not work with the old scheduler as well as they should,
1296 + because idle processes often steal the new child before the fork()ing
1297 + CPU gets to execute it.
1303 +the core of the new scheduler consists of the following mechanisms:
1305 + - *two*, priority-ordered 'priority arrays' per CPU. There is an 'active'
1306 + array and an 'expired' array. The active array contains all tasks that
1307 + are affine to this CPU and have timeslices left. The expired array
1308 + contains all tasks which have used up their timeslices - but this array
1309 + is kept sorted as well. The active and expired arrays are not accessed
1310 + directly, it's accessed through two pointers in the per-CPU runqueue
1311 + structure. If all active tasks are used up then we 'switch' the two
1312 + pointers and from now on the ready-to-go (former-) expired array is the
1313 + active array - and the empty active array serves as the new collector
1314 + for expired tasks.
1316 + - there is a 64-bit bitmap cache for array indices. Finding the highest
1317 + priority task is thus a matter of two x86 BSFL bit-search instructions.
1319 +the split-array solution enables us to have an arbitrary number of active
1320 +and expired tasks, and the recalculation of timeslices can be done
1321 +immediately when the timeslice expires. Because the arrays are always
1322 +accessed through the pointers in the runqueue, switching the two arrays can
1323 +be done very quickly.
1325 +this is a hybrid priority-list approach coupled with roundrobin
1326 +scheduling and the array-switch method of distributing timeslices.
1328 + - there is a per-task 'load estimator'.
1330 +one of the toughest things to get right is good interactive feel during
1331 +heavy system load. While playing with various scheduler variants i found
1332 +that the best interactive feel is achieved not by 'boosting' interactive
1333 +tasks, but by 'punishing' tasks that want to use more CPU time than there
1334 +is available. This method is also much easier to do in an O(1) fashion.
1336 +to establish the actual 'load' the task contributes to the system, a
1337 +complex-looking but pretty accurate method is used: there is a 4-entry
1338 +'history' ringbuffer of the task's activities during the last 4 seconds.
1339 +This ringbuffer is operated without much overhead. The entries tell the
1340 +scheduler a pretty accurate load-history of the task: has it used up more
1341 +CPU time or less during the past N seconds. [the size '4' and the interval
1342 +of 4x 1 seconds was found by lots of experimentation - this part is
1343 +flexible and can be changed in both directions.]
1345 +the penalty a task gets for generating more load than the CPU can handle
1346 +is a priority decrease - there is a maximum amount to this penalty
1347 +relative to their static priority, so even fully CPU-bound tasks will
1348 +observe each other's priorities, and will share the CPU accordingly.
1350 +the SMP load-balancer can be extended/switched with additional parallel
1351 +computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs
1352 +can be supported easily by changing the load-balancer. Right now it's
1353 +tuned for my SMP systems.
1355 +i skipped the prev->mm == next->mm advantage - no workload i know of shows
1356 +any sensitivity to this. It can be added back by sacrificing O(1)
1357 +schedule() [the current and one-lower priority list can be searched for a
1358 +that->mm == current->mm condition], but costs a fair number of cycles
1359 +during a number of important workloads, so i wanted to avoid this as much
1362 +- the SMP idle-task startup code was still racy and the new scheduler
1363 +triggered this. So i streamlined the idle-setup code a bit. We do not call
1364 +into schedule() before all processors have started up fully and all idle
1365 +threads are in place.
1367 +- the patch also cleans up a number of aspects of sched.c - moves code
1368 +into other areas of the kernel where it's appropriate, and simplifies
1369 +certain code paths and data constructs. As a result, the new scheduler's
1370 +code is smaller than the old one.
1373 diff -urN linux-2.4.24.org/drivers/char/drm-4.0/tdfx_drv.c linux-2.4.24/drivers/char/drm-4.0/tdfx_drv.c
1374 --- linux-2.4.24.org/drivers/char/drm-4.0/tdfx_drv.c 2004-02-04 20:49:21.677055474 +0100
1375 +++ linux-2.4.24/drivers/char/drm-4.0/tdfx_drv.c 2004-02-04 20:52:53.236051992 +0100
1377 lock.context, current->pid, j,
1378 dev->lock.lock_time, jiffies);
1379 current->state = TASK_INTERRUPTIBLE;
1380 - current->policy |= SCHED_YIELD;
1381 schedule_timeout(DRM_LOCK_SLICE-j);
1382 DRM_DEBUG("jiffies=%d\n", jiffies);
1384 diff -urN linux-2.4.24.org/drivers/char/mwave/mwavedd.c linux-2.4.24/drivers/char/mwave/mwavedd.c
1385 --- linux-2.4.24.org/drivers/char/mwave/mwavedd.c 2004-02-04 20:49:18.334750669 +0100
1386 +++ linux-2.4.24/drivers/char/mwave/mwavedd.c 2004-02-04 20:52:53.321034316 +0100
1388 pDrvData->IPCs[ipcnum].bIsHere = FALSE;
1389 pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
1390 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
1391 - current->nice = -20; /* boost to provide priority timing */
1393 current->priority = 0x28; /* boost to provide priority timing */
1395 diff -urN linux-2.4.24.org/drivers/char/serial_txx927.c linux-2.4.24/drivers/char/serial_txx927.c
1396 --- linux-2.4.24.org/drivers/char/serial_txx927.c 2004-02-04 20:49:11.902088655 +0100
1397 +++ linux-2.4.24/drivers/char/serial_txx927.c 2004-02-04 20:52:53.361025998 +0100
1398 @@ -1533,7 +1533,6 @@
1399 printk("cisr = %d (jiff=%lu)...", cisr, jiffies);
1401 current->state = TASK_INTERRUPTIBLE;
1402 - current->counter = 0; /* make us low-priority */
1403 schedule_timeout(char_time);
1404 if (signal_pending(current))
1406 diff -urN linux-2.4.24.org/drivers/md/md.c linux-2.4.24/drivers/md/md.c
1407 --- linux-2.4.24.org/drivers/md/md.c 2004-02-04 20:50:32.930234961 +0100
1408 +++ linux-2.4.24/drivers/md/md.c 2004-02-04 20:52:53.369024334 +0100
1409 @@ -2939,8 +2939,6 @@
1410 * bdflush, otherwise bdflush will deadlock if there are too
1411 * many dirty RAID5 blocks.
1413 - current->policy = SCHED_OTHER;
1414 - current->nice = -20;
1417 complete(thread->event);
1418 @@ -3464,11 +3462,6 @@
1419 "(but not more than %d KB/sec) for reconstruction.\n",
1420 sysctl_speed_limit_max);
1423 - * Resync has low priority.
1425 - current->nice = 19;
1427 is_mddev_idle(mddev); /* this also initializes IO event counters */
1428 for (m = 0; m < SYNC_MARKS; m++) {
1430 @@ -3546,16 +3539,13 @@
1431 currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
1433 if (currspeed > sysctl_speed_limit_min) {
1434 - current->nice = 19;
1436 if ((currspeed > sysctl_speed_limit_max) ||
1437 !is_mddev_idle(mddev)) {
1438 current->state = TASK_INTERRUPTIBLE;
1439 md_schedule_timeout(HZ/4);
1443 - current->nice = -20;
1446 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
1448 diff -urN linux-2.4.24.org/fs/binfmt_elf.c linux-2.4.24/fs/binfmt_elf.c
1449 --- linux-2.4.24.org/fs/binfmt_elf.c 2004-02-04 20:47:14.464515701 +0100
1450 +++ linux-2.4.24/fs/binfmt_elf.c 2004-02-04 20:52:53.390019967 +0100
1451 @@ -1173,7 +1173,7 @@
1452 psinfo.pr_state = i;
1453 psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
1454 psinfo.pr_zomb = psinfo.pr_sname == 'Z';
1455 - psinfo.pr_nice = current->nice;
1456 + psinfo.pr_nice = task_nice(current);
1457 psinfo.pr_flag = current->flags;
1458 psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
1459 psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
1460 diff -urN linux-2.4.24.org/fs/jffs2/background.c linux-2.4.24/fs/jffs2/background.c
1461 --- linux-2.4.24.org/fs/jffs2/background.c 2004-02-04 20:47:24.029526165 +0100
1462 +++ linux-2.4.24/fs/jffs2/background.c 2004-02-04 20:52:53.418014145 +0100
1465 sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
1467 - /* FIXME in the 2.2 backport */
1468 - current->nice = 10;
1471 spin_lock_irq(¤t->sigmask_lock);
1472 siginitsetinv (¤t->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
1473 diff -urN linux-2.4.24.org/fs/proc/array.c linux-2.4.24/fs/proc/array.c
1474 --- linux-2.4.24.org/fs/proc/array.c 2004-02-04 20:47:14.980408395 +0100
1475 +++ linux-2.4.24/fs/proc/array.c 2004-02-04 20:52:53.447008114 +0100
1478 /* scale priority and nice values from timeslices to -20..20 */
1479 /* to make it look like a "normal" Unix priority/nice value */
1480 - priority = task->counter;
1481 - priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
1482 - nice = task->nice;
1483 + priority = task_prio(task);
1484 + nice = task_nice(task);
1486 read_lock(&tasklist_lock);
1487 ppid = task->pid ? task->p_opptr->pid : 0;
1497 diff -urN linux-2.4.24.org/fs/proc/proc_misc.c linux-2.4.24/fs/proc/proc_misc.c
1498 --- linux-2.4.24.org/fs/proc/proc_misc.c 2004-02-04 20:47:14.897425655 +0100
1499 +++ linux-2.4.24/fs/proc/proc_misc.c 2004-02-04 20:52:53.485000212 +0100
1500 @@ -109,11 +109,11 @@
1501 a = avenrun[0] + (FIXED_1/200);
1502 b = avenrun[1] + (FIXED_1/200);
1503 c = avenrun[2] + (FIXED_1/200);
1504 - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
1505 + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
1506 LOAD_INT(a), LOAD_FRAC(a),
1507 LOAD_INT(b), LOAD_FRAC(b),
1508 LOAD_INT(c), LOAD_FRAC(c),
1509 - nr_running, nr_threads, last_pid);
1510 + nr_running(), nr_threads, last_pid);
1511 return proc_calc_metrics(page, start, off, count, eof, len);
1518 - idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
1519 + idle = init_task.times.tms_utime + init_task.times.tms_stime;
1521 /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
1522 that would overflow about every five days at HZ == 100.
1523 @@ -374,10 +374,10 @@
1526 proc_sprintf(page, &off, &len,
1531 - kstat.context_swtch,
1532 + nr_context_switches(),
1533 xtime.tv_sec - jif / HZ,
1536 diff -urN linux-2.4.24.org/fs/reiserfs/buffer2.c linux-2.4.24/fs/reiserfs/buffer2.c
1537 --- linux-2.4.24.org/fs/reiserfs/buffer2.c 2004-02-04 20:47:23.322673191 +0100
1538 +++ linux-2.4.24/fs/reiserfs/buffer2.c 2004-02-04 20:52:53.511994597 +0100
1540 struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size)
1542 struct buffer_head *result;
1543 - PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
1544 + PROC_EXP( unsigned int ctx_switches = nr_context_switches(); );
1546 result = bread (super -> s_dev, n_block, n_size);
1547 PROC_INFO_INC( super, breads );
1548 - PROC_EXP( if( kstat.context_swtch != ctx_switches )
1549 + PROC_EXP( if( nr_context_switches() != ctx_switches )
1550 PROC_INFO_INC( super, bread_miss ) );
1553 diff -urN linux-2.4.24.org/include/asm-alpha/bitops.h linux-2.4.24/include/asm-alpha/bitops.h
1554 --- linux-2.4.24.org/include/asm-alpha/bitops.h 2004-02-04 20:47:46.527846489 +0100
1555 +++ linux-2.4.24/include/asm-alpha/bitops.h 2004-02-04 20:52:53.537989191 +0100
1558 #include <linux/config.h>
1559 #include <linux/kernel.h>
1560 +#include <asm/compiler.h>
1563 * Copyright 1994, Linus Torvalds.
1566 __asm__ __volatile__(
1575 :"=&r" (temp), "=m" (*m)
1576 - :"Ir" (~(1UL << (nr & 31))), "m" (*m));
1577 + :"Ir" (1UL << (nr & 31)), "m" (*m));
1581 * WARNING: non atomic version.
1583 static __inline__ void
1584 -__change_bit(unsigned long nr, volatile void * addr)
1585 +__clear_bit(unsigned long nr, volatile void * addr)
1587 int *m = ((int *) addr) + (nr >> 5);
1589 - *m ^= 1 << (nr & 31);
1590 + *m &= ~(1 << (nr & 31));
1595 :"Ir" (1UL << (nr & 31)), "m" (*m));
1599 + * WARNING: non atomic version.
1601 +static __inline__ void
1602 +__change_bit(unsigned long nr, volatile void * addr)
1604 + int *m = ((int *) addr) + (nr >> 5);
1606 + *m ^= 1 << (nr & 31);
1610 test_and_set_bit(unsigned long nr, volatile void *addr)
1612 @@ -181,20 +193,6 @@
1613 return (old & mask) != 0;
1617 - * WARNING: non atomic version.
1619 -static __inline__ int
1620 -__test_and_change_bit(unsigned long nr, volatile void * addr)
1622 - unsigned long mask = 1 << (nr & 0x1f);
1623 - int *m = ((int *) addr) + (nr >> 5);
1627 - return (old & mask) != 0;
1631 test_and_change_bit(unsigned long nr, volatile void * addr)
1633 @@ -220,6 +218,20 @@
1638 + * WARNING: non atomic version.
1640 +static __inline__ int
1641 +__test_and_change_bit(unsigned long nr, volatile void * addr)
1643 + unsigned long mask = 1 << (nr & 0x1f);
1644 + int *m = ((int *) addr) + (nr >> 5);
1648 + return (old & mask) != 0;
1652 test_bit(int nr, volatile void * addr)
1654 @@ -235,12 +247,15 @@
1656 static inline unsigned long ffz_b(unsigned long x)
1658 - unsigned long sum = 0;
1659 + unsigned long sum, x1, x2, x4;
1661 x = ~x & -~x; /* set first 0 bit, clear others */
1662 - if (x & 0xF0) sum += 4;
1663 - if (x & 0xCC) sum += 2;
1664 - if (x & 0xAA) sum += 1;
1669 + sum += (x4 != 0) * 4;
1674 @@ -257,24 +272,46 @@
1676 __asm__("cmpbge %1,%2,%0" : "=r"(bits) : "r"(word), "r"(~0UL));
1678 - __asm__("extbl %1,%2,%0" : "=r"(bits) : "r"(word), "r"(qofs));
1679 + bits = __kernel_extbl(word, qofs);
1682 return qofs*8 + bofs;
1687 + * __ffs = Find First set bit in word. Undefined if no set bit exists.
1689 +static inline unsigned long __ffs(unsigned long word)
1691 +#if defined(__alpha_cix__) && defined(__alpha_fix__)
1692 + /* Whee. EV67 can calculate it directly. */
1693 + unsigned long result;
1694 + __asm__("cttz %1,%0" : "=r"(result) : "r"(word));
1697 + unsigned long bits, qofs, bofs;
1699 + __asm__("cmpbge $31,%1,%0" : "=r"(bits) : "r"(word));
1700 + qofs = ffz_b(bits);
1701 + bits = __kernel_extbl(word, qofs);
1702 + bofs = ffz_b(~bits);
1704 + return qofs*8 + bofs;
1711 * ffs: find first bit set. This is defined the same way as
1712 * the libc and compiler builtin ffs routines, therefore
1713 - * differs in spirit from the above ffz (man ffs).
1714 + * differs in spirit from the above __ffs.
1717 static inline int ffs(int word)
1719 - int result = ffz(~word);
1720 + int result = __ffs(word);
1721 return word ? result+1 : 0;
1724 @@ -316,6 +353,14 @@
1725 #define hweight16(x) hweight64((x) & 0xfffful)
1726 #define hweight8(x) hweight64((x) & 0xfful)
1728 +static inline unsigned long hweight64(unsigned long w)
1730 + unsigned long result;
1731 + for (result = 0; w ; w >>= 1)
1732 + result += (w & 1);
1736 #define hweight32(x) generic_hweight32(x)
1737 #define hweight16(x) generic_hweight16(x)
1738 #define hweight8(x) generic_hweight8(x)
1739 @@ -365,13 +410,77 @@
1743 - * The optimizer actually does good code for this case..
1744 + * Find next one bit in a bitmap reasonably efficiently.
1746 +static inline unsigned long
1747 +find_next_bit(void * addr, unsigned long size, unsigned long offset)
1749 + unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
1750 + unsigned long result = offset & ~63UL;
1751 + unsigned long tmp;
1753 + if (offset >= size)
1759 + tmp &= ~0UL << offset;
1763 + goto found_middle;
1767 + while (size & ~63UL) {
1768 + if ((tmp = *(p++)))
1769 + goto found_middle;
1777 + tmp &= ~0UL >> (64 - size);
1779 + return result + size;
1781 + return result + __ffs(tmp);
1785 + * The optimizer actually does good code for this case.
1787 #define find_first_zero_bit(addr, size) \
1788 find_next_zero_bit((addr), (size), 0)
1789 +#define find_first_bit(addr, size) \
1790 + find_next_bit((addr), (size), 0)
1795 + * Every architecture must define this function. It's the fastest
1796 + * way of searching a 140-bit bitmap where the first 100 bits are
1797 + * unlikely to be set. It's guaranteed that at least one of the 140
1800 +static inline unsigned long
1801 +sched_find_first_bit(unsigned long b[3])
1803 + unsigned long b0 = b[0], b1 = b[1], b2 = b[2];
1804 + unsigned long ofs;
1806 + ofs = (b1 ? 64 : 128);
1807 + b1 = (b1 ? b1 : b2);
1808 + ofs = (b0 ? 0 : ofs);
1809 + b0 = (b0 ? b0 : b1);
1811 + return __ffs(b0) + ofs;
1815 #define ext2_set_bit __test_and_set_bit
1816 #define ext2_clear_bit __test_and_clear_bit
1817 #define ext2_test_bit test_bit
1818 diff -urN linux-2.4.24.org/include/asm-alpha/smp.h linux-2.4.24/include/asm-alpha/smp.h
1819 --- linux-2.4.24.org/include/asm-alpha/smp.h 2004-02-04 20:47:46.648821326 +0100
1820 +++ linux-2.4.24/include/asm-alpha/smp.h 2004-02-04 20:52:53.540988567 +0100
1822 #define cpu_logical_map(cpu) __cpu_logical_map[cpu]
1824 #define hard_smp_processor_id() __hard_smp_processor_id()
1825 -#define smp_processor_id() (current->processor)
1826 +#define smp_processor_id() (current->cpu)
1828 extern unsigned long cpu_present_mask;
1829 #define cpu_online_map cpu_present_mask
1830 diff -urN linux-2.4.24.org/include/asm-alpha/system.h linux-2.4.24/include/asm-alpha/system.h
1831 --- linux-2.4.24.org/include/asm-alpha/system.h 2004-02-04 20:47:45.924971887 +0100
1832 +++ linux-2.4.24/include/asm-alpha/system.h 2004-02-04 20:52:53.545987527 +0100
1834 extern void halt(void) __attribute__((noreturn));
1835 #define __halt() __asm__ __volatile__ ("call_pal %0 #halt" : : "i" (PAL_halt))
1837 -#define prepare_to_switch() do { } while(0)
1838 #define switch_to(prev,next,last) \
1840 unsigned long pcbb; \
1841 diff -urN linux-2.4.24.org/include/asm-arm/bitops.h linux-2.4.24/include/asm-arm/bitops.h
1842 --- linux-2.4.24.org/include/asm-arm/bitops.h 2004-02-04 20:48:05.614876374 +0100
1843 +++ linux-2.4.24/include/asm-arm/bitops.h 2004-02-04 20:52:53.589978377 +0100
1845 * Copyright 1995, Russell King.
1846 * Various bits and pieces copyrights include:
1847 * Linus Torvalds (test_bit).
1848 + * Big endian support: Copyright 2001, Nicolas Pitre
1849 + * reworked by rmk.
1851 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
1853 @@ -17,81 +19,271 @@
1857 +#include <asm/system.h>
1859 #define smp_mb__before_clear_bit() do { } while (0)
1860 #define smp_mb__after_clear_bit() do { } while (0)
1863 - * Function prototypes to keep gcc -Wall happy.
1864 + * These functions are the basis of our bit ops.
1865 + * First, the atomic bitops.
1867 + * The endian issue for these functions is handled by the macros below.
1869 -extern void set_bit(int nr, volatile void * addr);
1871 +____atomic_set_bit_mask(unsigned int mask, volatile unsigned char *p)
1873 + unsigned long flags;
1875 + local_irq_save(flags);
1877 + local_irq_restore(flags);
1881 +____atomic_clear_bit_mask(unsigned int mask, volatile unsigned char *p)
1883 + unsigned long flags;
1885 + local_irq_save(flags);
1887 + local_irq_restore(flags);
1891 +____atomic_change_bit_mask(unsigned int mask, volatile unsigned char *p)
1893 + unsigned long flags;
1895 + local_irq_save(flags);
1897 + local_irq_restore(flags);
1900 -static inline void __set_bit(int nr, volatile void *addr)
1902 +____atomic_test_and_set_bit_mask(unsigned int mask, volatile unsigned char *p)
1904 - ((unsigned char *) addr)[nr >> 3] |= (1U << (nr & 7));
1905 + unsigned long flags;
1908 + local_irq_save(flags);
1911 + local_irq_restore(flags);
1913 + return res & mask;
1916 -extern void clear_bit(int nr, volatile void * addr);
1918 +____atomic_test_and_clear_bit_mask(unsigned int mask, volatile unsigned char *p)
1920 + unsigned long flags;
1923 + local_irq_save(flags);
1926 + local_irq_restore(flags);
1928 + return res & mask;
1931 -static inline void __clear_bit(int nr, volatile void *addr)
1933 +____atomic_test_and_change_bit_mask(unsigned int mask, volatile unsigned char *p)
1935 - ((unsigned char *) addr)[nr >> 3] &= ~(1U << (nr & 7));
1936 + unsigned long flags;
1939 + local_irq_save(flags);
1942 + local_irq_restore(flags);
1944 + return res & mask;
1947 -extern void change_bit(int nr, volatile void * addr);
1949 + * Now the non-atomic variants. We let the compiler handle all optimisations
1952 +static inline void ____nonatomic_set_bit(int nr, volatile void *p)
1954 + ((unsigned char *) p)[nr >> 3] |= (1U << (nr & 7));
1957 -static inline void __change_bit(int nr, volatile void *addr)
1958 +static inline void ____nonatomic_clear_bit(int nr, volatile void *p)
1960 - ((unsigned char *) addr)[nr >> 3] ^= (1U << (nr & 7));
1961 + ((unsigned char *) p)[nr >> 3] &= ~(1U << (nr & 7));
1964 -extern int test_and_set_bit(int nr, volatile void * addr);
1965 +static inline void ____nonatomic_change_bit(int nr, volatile void *p)
1967 + ((unsigned char *) p)[nr >> 3] ^= (1U << (nr & 7));
1970 -static inline int __test_and_set_bit(int nr, volatile void *addr)
1971 +static inline int ____nonatomic_test_and_set_bit(int nr, volatile void *p)
1973 unsigned int mask = 1 << (nr & 7);
1974 unsigned int oldval;
1976 - oldval = ((unsigned char *) addr)[nr >> 3];
1977 - ((unsigned char *) addr)[nr >> 3] = oldval | mask;
1978 + oldval = ((unsigned char *) p)[nr >> 3];
1979 + ((unsigned char *) p)[nr >> 3] = oldval | mask;
1980 return oldval & mask;
1983 -extern int test_and_clear_bit(int nr, volatile void * addr);
1985 -static inline int __test_and_clear_bit(int nr, volatile void *addr)
1986 +static inline int ____nonatomic_test_and_clear_bit(int nr, volatile void *p)
1988 unsigned int mask = 1 << (nr & 7);
1989 unsigned int oldval;
1991 - oldval = ((unsigned char *) addr)[nr >> 3];
1992 - ((unsigned char *) addr)[nr >> 3] = oldval & ~mask;
1993 + oldval = ((unsigned char *) p)[nr >> 3];
1994 + ((unsigned char *) p)[nr >> 3] = oldval & ~mask;
1995 return oldval & mask;
1998 -extern int test_and_change_bit(int nr, volatile void * addr);
2000 -static inline int __test_and_change_bit(int nr, volatile void *addr)
2001 +static inline int ____nonatomic_test_and_change_bit(int nr, volatile void *p)
2003 unsigned int mask = 1 << (nr & 7);
2004 unsigned int oldval;
2006 - oldval = ((unsigned char *) addr)[nr >> 3];
2007 - ((unsigned char *) addr)[nr >> 3] = oldval ^ mask;
2008 + oldval = ((unsigned char *) p)[nr >> 3];
2009 + ((unsigned char *) p)[nr >> 3] = oldval ^ mask;
2010 return oldval & mask;
2013 -extern int find_first_zero_bit(void * addr, unsigned size);
2014 -extern int find_next_zero_bit(void * addr, int size, int offset);
2017 * This routine doesn't need to be atomic.
2019 -static inline int test_bit(int nr, const void * addr)
2020 +static inline int ____test_bit(int nr, const void * p)
2022 - return (((unsigned char *) addr)[nr >> 3] >> (nr & 7)) & 1;
2023 + return (((volatile unsigned char *) p)[nr >> 3] >> (nr & 7)) & 1;
2027 + * A note about Endian-ness.
2028 + * -------------------------
2030 + * When the ARM is put into big endian mode via CR15, the processor
2031 + * merely swaps the order of bytes within words, thus:
2033 + * ------------ physical data bus bits -----------
2034 + * D31 ... D24 D23 ... D16 D15 ... D8 D7 ... D0
2035 + * little byte 3 byte 2 byte 1 byte 0
2036 + * big byte 0 byte 1 byte 2 byte 3
2038 + * This means that reading a 32-bit word at address 0 returns the same
2039 + * value irrespective of the endian mode bit.
2041 + * Peripheral devices should be connected with the data bus reversed in
2042 + * "Big Endian" mode. ARM Application Note 61 is applicable, and is
2043 + * available from http://www.arm.com/.
2045 + * The following assumes that the data bus connectivity for big endian
2046 + * mode has been followed.
2048 + * Note that bit 0 is defined to be 32-bit word bit 0, not byte 0 bit 0.
2052 + * Little endian assembly bitops. nr = 0 -> byte 0 bit 0.
2054 +extern void _set_bit_le(int nr, volatile void * p);
2055 +extern void _clear_bit_le(int nr, volatile void * p);
2056 +extern void _change_bit_le(int nr, volatile void * p);
2057 +extern int _test_and_set_bit_le(int nr, volatile void * p);
2058 +extern int _test_and_clear_bit_le(int nr, volatile void * p);
2059 +extern int _test_and_change_bit_le(int nr, volatile void * p);
2060 +extern int _find_first_zero_bit_le(void * p, unsigned size);
2061 +extern int _find_next_zero_bit_le(void * p, int size, int offset);
2064 + * Big endian assembly bitops. nr = 0 -> byte 3 bit 0.
2066 +extern void _set_bit_be(int nr, volatile void * p);
2067 +extern void _clear_bit_be(int nr, volatile void * p);
2068 +extern void _change_bit_be(int nr, volatile void * p);
2069 +extern int _test_and_set_bit_be(int nr, volatile void * p);
2070 +extern int _test_and_clear_bit_be(int nr, volatile void * p);
2071 +extern int _test_and_change_bit_be(int nr, volatile void * p);
2072 +extern int _find_first_zero_bit_be(void * p, unsigned size);
2073 +extern int _find_next_zero_bit_be(void * p, int size, int offset);
2077 + * The __* form of bitops are non-atomic and may be reordered.
2079 +#define ATOMIC_BITOP_LE(name,nr,p) \
2080 + (__builtin_constant_p(nr) ? \
2081 + ____atomic_##name##_mask(1 << ((nr) & 7), \
2082 + ((unsigned char *)(p)) + ((nr) >> 3)) : \
2083 + _##name##_le(nr,p))
2085 +#define ATOMIC_BITOP_BE(name,nr,p) \
2086 + (__builtin_constant_p(nr) ? \
2087 + ____atomic_##name##_mask(1 << ((nr) & 7), \
2088 + ((unsigned char *)(p)) + (((nr) >> 3) ^ 3)) : \
2089 + _##name##_be(nr,p))
2091 +#define NONATOMIC_BITOP_LE(name,nr,p) \
2092 + (____nonatomic_##name(nr, p))
2094 +#define NONATOMIC_BITOP_BE(name,nr,p) \
2095 + (____nonatomic_##name(nr ^ 0x18, p))
2099 + * These are the little endian, atomic definitions.
2101 +#define set_bit(nr,p) ATOMIC_BITOP_LE(set_bit,nr,p)
2102 +#define clear_bit(nr,p) ATOMIC_BITOP_LE(clear_bit,nr,p)
2103 +#define change_bit(nr,p) ATOMIC_BITOP_LE(change_bit,nr,p)
2104 +#define test_and_set_bit(nr,p) ATOMIC_BITOP_LE(test_and_set_bit,nr,p)
2105 +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_LE(test_and_clear_bit,nr,p)
2106 +#define test_and_change_bit(nr,p) ATOMIC_BITOP_LE(test_and_change_bit,nr,p)
2107 +#define test_bit(nr,p) ____test_bit(nr,p)
2108 +#define find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz)
2109 +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off)
2112 + * These are the little endian, non-atomic definitions.
2114 +#define __set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p)
2115 +#define __clear_bit(nr,p) NONATOMIC_BITOP_LE(clear_bit,nr,p)
2116 +#define __change_bit(nr,p) NONATOMIC_BITOP_LE(change_bit,nr,p)
2117 +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p)
2118 +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p)
2119 +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_LE(test_and_change_bit,nr,p)
2120 +#define __test_bit(nr,p) ____test_bit(nr,p)
2125 + * These are the big endian, atomic definitions.
2127 +#define set_bit(nr,p) ATOMIC_BITOP_BE(set_bit,nr,p)
2128 +#define clear_bit(nr,p) ATOMIC_BITOP_BE(clear_bit,nr,p)
2129 +#define change_bit(nr,p) ATOMIC_BITOP_BE(change_bit,nr,p)
2130 +#define test_and_set_bit(nr,p) ATOMIC_BITOP_BE(test_and_set_bit,nr,p)
2131 +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_BE(test_and_clear_bit,nr,p)
2132 +#define test_and_change_bit(nr,p) ATOMIC_BITOP_BE(test_and_change_bit,nr,p)
2133 +#define test_bit(nr,p) ____test_bit((nr) ^ 0x18, p)
2134 +#define find_first_zero_bit(p,sz) _find_first_zero_bit_be(p,sz)
2135 +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_be(p,sz,off)
2138 + * These are the big endian, non-atomic definitions.
2140 +#define __set_bit(nr,p) NONATOMIC_BITOP_BE(set_bit,nr,p)
2141 +#define __clear_bit(nr,p) NONATOMIC_BITOP_BE(clear_bit,nr,p)
2142 +#define __change_bit(nr,p) NONATOMIC_BITOP_BE(change_bit,nr,p)
2143 +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_BE(test_and_set_bit,nr,p)
2144 +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_BE(test_and_clear_bit,nr,p)
2145 +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_BE(test_and_change_bit,nr,p)
2146 +#define __test_bit(nr,p) ____test_bit((nr) ^ 0x18, p)
2151 * ffz = Find First Zero in word. Undefined if no zero exists,
2152 * so code should check against ~0UL first..
2154 @@ -110,6 +302,29 @@
2158 + * ffz = Find First Zero in word. Undefined if no zero exists,
2159 + * so code should check against ~0UL first..
2161 +static inline unsigned long __ffs(unsigned long word)
2166 + if (word & 0x0000ffff) { k -= 16; word <<= 16; }
2167 + if (word & 0x00ff0000) { k -= 8; word <<= 8; }
2168 + if (word & 0x0f000000) { k -= 4; word <<= 4; }
2169 + if (word & 0x30000000) { k -= 2; word <<= 2; }
2170 + if (word & 0x40000000) { k -= 1; }
2175 + * fls: find last bit set.
2178 +#define fls(x) generic_fls(x)
2181 * ffs: find first bit set. This is defined the same way as
2182 * the libc and compiler builtin ffs routines, therefore
2183 * differs in spirit from the above ffz (man ffs).
2184 @@ -118,6 +333,22 @@
2185 #define ffs(x) generic_ffs(x)
2188 + * Find first bit set in a 168-bit bitmap, where the first
2189 + * 128 bits are unlikely to be set.
2191 +static inline int sched_find_first_bit(unsigned long *b)
2196 + for (off = 0; v = b[off], off < 4; off++) {
2200 + return __ffs(v) + off * 32;
2204 * hweightN: returns the hamming weight (i.e. the number
2205 * of bits set) of a N-bit word
2207 @@ -126,18 +357,25 @@
2208 #define hweight16(x) generic_hweight16(x)
2209 #define hweight8(x) generic_hweight8(x)
2211 -#define ext2_set_bit test_and_set_bit
2212 -#define ext2_clear_bit test_and_clear_bit
2213 -#define ext2_test_bit test_bit
2214 -#define ext2_find_first_zero_bit find_first_zero_bit
2215 -#define ext2_find_next_zero_bit find_next_zero_bit
2217 -/* Bitmap functions for the minix filesystem. */
2218 -#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr)
2219 -#define minix_set_bit(nr,addr) set_bit(nr,addr)
2220 -#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr)
2221 -#define minix_test_bit(nr,addr) test_bit(nr,addr)
2222 -#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
2224 + * Ext2 is defined to use little-endian byte ordering.
2225 + * These do not need to be atomic.
2227 +#define ext2_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p)
2228 +#define ext2_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p)
2229 +#define ext2_test_bit(nr,p) __test_bit(nr,p)
2230 +#define ext2_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz)
2231 +#define ext2_find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off)
2234 + * Minix is defined to use little-endian byte ordering.
2235 + * These do not need to be atomic.
2237 +#define minix_set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p)
2238 +#define minix_test_bit(nr,p) __test_bit(nr,p)
2239 +#define minix_test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p)
2240 +#define minix_test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p)
2241 +#define minix_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz)
2243 #endif /* __KERNEL__ */
2245 diff -urN linux-2.4.24.org/include/asm-cris/bitops.h linux-2.4.24/include/asm-cris/bitops.h
2246 --- linux-2.4.24.org/include/asm-cris/bitops.h 2004-02-04 20:48:26.679494929 +0100
2247 +++ linux-2.4.24/include/asm-cris/bitops.h 2004-02-04 20:52:53.595977130 +0100
2249 /* We use generic_ffs so get it; include guards resolve the possible
2250 mutually inclusion. */
2251 #include <linux/bitops.h>
2252 +#include <linux/compiler.h>
2255 * Some hacks to defeat gcc over-optimizations..
2257 #define set_bit(nr, addr) (void)test_and_set_bit(nr, addr)
2258 #define __set_bit(nr, addr) (void)__test_and_set_bit(nr, addr)
2260 +#define __set_bit(nr, addr) (void)__test_and_set_bit(nr, addr)
2263 * clear_bit - Clears a bit in memory
2266 #define clear_bit(nr, addr) (void)test_and_clear_bit(nr, addr)
2267 #define __clear_bit(nr, addr) (void)__test_and_clear_bit(nr, addr)
2269 +#define __clear_bit(nr, addr) (void)__test_and_clear_bit(nr, addr)
2272 * change_bit - Toggle a bit in memory
2273 * @nr: Bit to change
2275 * It also implies a memory barrier.
2278 -extern __inline__ int test_and_set_bit(int nr, void *addr)
2279 +extern inline int test_and_set_bit(int nr, void *addr)
2281 unsigned int mask, retval;
2282 unsigned long flags;
2283 @@ -119,6 +124,18 @@
2287 +extern inline int __test_and_set_bit(int nr, void *addr)
2289 + unsigned int mask, retval;
2290 + unsigned int *adr = (unsigned int *)addr;
2293 + mask = 1 << (nr & 0x1f);
2294 + retval = (mask & *adr) != 0;
2300 * clear_bit() doesn't provide any barrier for the compiler.
2303 * It also implies a memory barrier.
2306 -extern __inline__ int test_and_clear_bit(int nr, void *addr)
2307 +extern inline int test_and_clear_bit(int nr, void *addr)
2309 unsigned int mask, retval;
2310 unsigned long flags;
2312 * but actually fail. You must protect multiple accesses with a lock.
2315 -extern __inline__ int __test_and_clear_bit(int nr, void *addr)
2316 +extern inline int __test_and_clear_bit(int nr, void *addr)
2318 unsigned int mask, retval;
2319 unsigned int *adr = (unsigned int *)addr;
2321 * It also implies a memory barrier.
2324 -extern __inline__ int test_and_change_bit(int nr, void *addr)
2325 +extern inline int test_and_change_bit(int nr, void *addr)
2327 unsigned int mask, retval;
2328 unsigned long flags;
2331 /* WARNING: non atomic and it can be reordered! */
2333 -extern __inline__ int __test_and_change_bit(int nr, void *addr)
2334 +extern inline int __test_and_change_bit(int nr, void *addr)
2336 unsigned int mask, retval;
2337 unsigned int *adr = (unsigned int *)addr;
2339 * This routine doesn't need to be atomic.
2342 -extern __inline__ int test_bit(int nr, const void *addr)
2343 +extern inline int test_bit(int nr, const void *addr)
2346 unsigned int *adr = (unsigned int *)addr;
2348 * number. They differ in that the first function also inverts all bits
2351 -extern __inline__ unsigned long cris_swapnwbrlz(unsigned long w)
2352 +extern inline unsigned long cris_swapnwbrlz(unsigned long w)
2354 /* Let's just say we return the result in the same register as the
2355 input. Saying we clobber the input but can return the result
2360 -extern __inline__ unsigned long cris_swapwbrlz(unsigned long w)
2361 +extern inline unsigned long cris_swapwbrlz(unsigned long w)
2364 __asm__ ("swapwbr %0 \n\t"
2366 * ffz = Find First Zero in word. Undefined if no zero exists,
2367 * so code should check against ~0UL first..
2369 -extern __inline__ unsigned long ffz(unsigned long w)
2370 +extern inline unsigned long ffz(unsigned long w)
2372 /* The generic_ffs function is used to avoid the asm when the
2373 argument is a constant. */
2375 * Somewhat like ffz but the equivalent of generic_ffs: in contrast to
2376 * ffz we return the first one-bit *plus one*.
2378 -extern __inline__ unsigned long kernel_ffs(unsigned long w)
2379 +extern inline unsigned long kernel_ffs(unsigned long w)
2381 /* The generic_ffs function is used to avoid the asm when the
2382 argument is a constant. */
2384 * @offset: The bitnumber to start searching at
2385 * @size: The maximum size to search
2387 -extern __inline__ int find_next_zero_bit (void * addr, int size, int offset)
2388 +extern inline int find_next_zero_bit (void * addr, int size, int offset)
2390 unsigned long *p = ((unsigned long *) addr) + (offset >> 5);
2391 unsigned long result = offset & ~31UL;
2392 @@ -375,7 +392,45 @@
2393 #define minix_test_bit(nr,addr) test_bit(nr,addr)
2394 #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
2396 -#endif /* __KERNEL__ */
2398 +/* TODO: see below */
2399 +#define sched_find_first_zero_bit(addr) find_first_zero_bit(addr, 168)
2402 +/* TODO: left out pending where to put it.. (there are .h dependencies) */
2405 + * Every architecture must define this function. It's the fastest
2406 + * way of searching a 168-bit bitmap where the first 128 bits are
2407 + * unlikely to be set. It's guaranteed that at least one of the 168
2408 + * bits is cleared.
2411 +#if MAX_RT_PRIO != 128 || MAX_PRIO != 168
2412 +# error update this function.
2415 +#define MAX_RT_PRIO 128
2416 +#define MAX_PRIO 168
2419 +static inline int sched_find_first_zero_bit(char *bitmap)
2421 + unsigned int *b = (unsigned int *)bitmap;
2424 + rt = b[0] & b[1] & b[2] & b[3];
2425 + if (unlikely(rt != 0xffffffff))
2426 + return find_first_zero_bit(bitmap, MAX_RT_PRIO);
2429 + return ffz(b[4]) + MAX_RT_PRIO;
2430 + return ffz(b[5]) + 32 + MAX_RT_PRIO;
2436 +#endif /* __KERNEL__ */
2438 #endif /* _CRIS_BITOPS_H */
2439 diff -urN linux-2.4.24.org/include/asm-generic/bitops.h linux-2.4.24/include/asm-generic/bitops.h
2440 --- linux-2.4.24.org/include/asm-generic/bitops.h 2004-02-04 20:47:40.855026441 +0100
2441 +++ linux-2.4.24/include/asm-generic/bitops.h 2004-02-04 20:52:53.630969851 +0100
2443 return ((mask & *addr) != 0);
2447 + * fls: find last bit set.
2450 +#define fls(x) generic_fls(x)
2455 diff -urN linux-2.4.24.org/include/asm-i386/bitops.h linux-2.4.24/include/asm-i386/bitops.h
2456 --- linux-2.4.24.org/include/asm-i386/bitops.h 2004-02-04 20:47:40.983999614 +0100
2457 +++ linux-2.4.24/include/asm-i386/bitops.h 2004-02-04 20:52:53.655964653 +0100
2461 #include <linux/config.h>
2462 +#include <linux/compiler.h>
2465 * These have to be done with inline assembly: that way the bit-setting
2471 +static __inline__ void __clear_bit(int nr, volatile void * addr)
2473 + __asm__ __volatile__(
2478 #define smp_mb__before_clear_bit() barrier()
2479 #define smp_mb__after_clear_bit() barrier()
2481 @@ -284,6 +293,34 @@
2485 + * find_first_bit - find the first set bit in a memory region
2486 + * @addr: The address to start the search at
2487 + * @size: The maximum size to search
2489 + * Returns the bit-number of the first set bit, not the number of the byte
2490 + * containing a bit.
2492 +static __inline__ int find_first_bit(void * addr, unsigned size)
2497 + /* This looks at memory. Mark it volatile to tell gcc not to move it around */
2498 + __asm__ __volatile__(
2499 + "xorl %%eax,%%eax\n\t"
2502 + "leal -4(%%edi),%%edi\n\t"
2503 + "bsfl (%%edi),%%eax\n"
2504 + "1:\tsubl %%ebx,%%edi\n\t"
2505 + "shll $3,%%edi\n\t"
2506 + "addl %%edi,%%eax"
2507 + :"=a" (res), "=&c" (d0), "=&D" (d1)
2508 + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr));
2513 * find_next_zero_bit - find the first zero bit in a memory region
2514 * @addr: The address to base the search on
2515 * @offset: The bitnumber to start searching at
2520 - * Look for zero in first byte
2521 + * Look for zero in the first 32 bits.
2523 __asm__("bsfl %1,%0\n\t"
2525 @@ -317,6 +354,39 @@
2529 + * find_next_bit - find the first set bit in a memory region
2530 + * @addr: The address to base the search on
2531 + * @offset: The bitnumber to start searching at
2532 + * @size: The maximum size to search
2534 +static __inline__ int find_next_bit (void * addr, int size, int offset)
2536 + unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
2537 + int set = 0, bit = offset & 31, res;
2541 + * Look for nonzero in the first 32 bits:
2543 + __asm__("bsfl %1,%0\n\t"
2548 + : "r" (*p >> bit));
2549 + if (set < (32 - bit))
2550 + return set + offset;
2555 + * No set bit yet, search remaining full words for a bit
2557 + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
2558 + return (offset + set + res);
2562 * ffz - find first zero in word.
2563 * @word: The word to search
2565 @@ -330,8 +400,41 @@
2570 + * __ffs - find first bit in word.
2571 + * @word: The word to search
2572 + * Undefined if no bit exists, so code should check against 0 first.
2574 +static __inline__ unsigned long __ffs(unsigned long word)
2576 + __asm__("bsfl %1,%0"
2581 +#define fls(x) generic_fls(x)
2586 + * Every architecture must define this function. It's the fastest
2587 + * way of searching a 140-bit bitmap where the first 100 bits are
2588 + * unlikely to be set. It's guaranteed that at least one of the 140
2589 + * bits is cleared.
2591 +static inline int sched_find_first_bit(unsigned long *b)
2593 + if (unlikely(b[0]))
2594 + return __ffs(b[0]);
2595 + if (unlikely(b[1]))
2596 + return __ffs(b[1]) + 32;
2597 + if (unlikely(b[2]))
2598 + return __ffs(b[2]) + 64;
2600 + return __ffs(b[3]) + 96;
2601 + return __ffs(b[4]) + 128;
2605 * ffs - find first bit set
2606 * @x: the word to search
2607 diff -urN linux-2.4.24.org/include/asm-i386/mmu_context.h linux-2.4.24/include/asm-i386/mmu_context.h
2608 --- linux-2.4.24.org/include/asm-i386/mmu_context.h 2004-02-04 20:47:41.355922254 +0100
2609 +++ linux-2.4.24/include/asm-i386/mmu_context.h 2004-02-04 20:57:00.389646201 +0100
2612 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
2614 - if (prev != next) {
2615 + if (likely(prev != next)) {
2616 /* stop flush ipis for the previous mm */
2617 clear_bit(cpu, &prev->cpu_vm_mask);
2620 /* load_LDT, if either the previous or next thread
2621 * has a non-default LDT.
2623 - if (next->context.size+prev->context.size)
2624 + if (unlikely(next->context.size+prev->context.size))
2625 load_LDT(&next->context);
2628 diff -urN linux-2.4.24.org/include/asm-i386/processor.h linux-2.4.24/include/asm-i386/processor.h
2629 --- linux-2.4.24.org/include/asm-i386/processor.h 2004-02-04 20:47:40.967003150 +0100
2630 +++ linux-2.4.24/include/asm-i386/processor.h 2004-02-04 20:52:53.702954879 +0100
2633 #define cpu_relax() rep_nop()
2635 +#define ARCH_HAS_SMP_BALANCE
2637 /* Prefetch instructions for Pentium III and AMD Athlon */
2638 #if defined(CONFIG_MPENTIUMIII) || defined (CONFIG_MPENTIUM4)
2640 diff -urN linux-2.4.24.org/include/asm-i386/smp_balance.h linux-2.4.24/include/asm-i386/smp_balance.h
2641 --- linux-2.4.24.org/include/asm-i386/smp_balance.h 1970-01-01 01:00:00.000000000 +0100
2642 +++ linux-2.4.24/include/asm-i386/smp_balance.h 2004-02-04 20:52:53.705954255 +0100
2644 +#ifndef _ASM_SMP_BALANCE_H
2645 +#define _ASM_SMP_BALANCE_H
2648 + * We have an architecture-specific SMP load balancer to improve
2649 + * scheduling behavior on hyperthreaded CPUs. Since only P4s have
2650 + * HT, maybe this should be conditional on CONFIG_MPENTIUM4...
2655 + * Find any idle processor package (i.e. both virtual processors are idle)
2657 +static inline int find_idle_package(int this_cpu)
2661 + this_cpu = cpu_number_map(this_cpu);
2663 + for (i = (this_cpu + 1) % smp_num_cpus;
2665 + i = (i + 1) % smp_num_cpus) {
2666 + int physical = cpu_logical_map(i);
2667 + int sibling = cpu_sibling_map[physical];
2669 + if (idle_cpu(physical) && idle_cpu(sibling))
2672 + return -1; /* not found */
2675 +static inline int arch_reschedule_idle_override(task_t * p, int idle)
2677 + if (unlikely(smp_num_siblings > 1) && !idle_cpu(cpu_sibling_map[idle])) {
2678 + int true_idle = find_idle_package(idle);
2679 + if (true_idle >= 0) {
2680 + if (likely(p->cpus_allowed & (1UL << true_idle)))
2683 + true_idle = cpu_sibling_map[true_idle];
2684 + if (p->cpus_allowed & (1UL << true_idle))
2693 +static inline int arch_load_balance(int this_cpu, int idle)
2695 + /* Special hack for hyperthreading */
2696 + if (unlikely(smp_num_siblings > 1 && idle == 2 && !idle_cpu(cpu_sibling_map[this_cpu]))) {
2698 + struct runqueue *rq_target;
2700 + if ((found = find_idle_package(this_cpu)) >= 0 ) {
2701 + rq_target = cpu_rq(found);
2702 + resched_task(rq_target->idle);
2709 +#endif /* _ASM_SMP_BALANCE_H */
2710 diff -urN linux-2.4.24.org/include/asm-i386/smp.h linux-2.4.24/include/asm-i386/smp.h
2711 --- linux-2.4.24.org/include/asm-i386/smp.h 2004-02-04 20:47:41.153964261 +0100
2712 +++ linux-2.4.24/include/asm-i386/smp.h 2004-02-04 20:52:53.733948432 +0100
2714 extern void smp_flush_tlb(void);
2715 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
2716 extern void fastcall smp_send_reschedule(int cpu);
2717 +extern void smp_send_reschedule_all(void);
2718 extern void smp_invalidate_rcv(void); /* Process an NMI */
2719 extern void (*mtrr_hook) (void);
2720 extern void zap_low_mappings (void);
2722 * so this is correct in the x86 case.
2725 -#define smp_processor_id() (current->processor)
2726 +#define smp_processor_id() (current->cpu)
2728 static __inline int hard_smp_processor_id(void)
2732 #define NO_PROC_ID 0xFF /* No processor magic marker */
2735 - * This magic constant controls our willingness to transfer
2736 - * a process across CPUs. Such a transfer incurs misses on the L1
2737 - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
2738 - * gut feeling is this will vary by board in value. For a board
2739 - * with separate L2 cache it probably depends also on the RSS, and
2740 - * for a board with shared L2 cache it ought to decay fast as other
2741 - * processes are run.
2744 -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
2748 diff -urN linux-2.4.24.org/include/asm-i386/system.h linux-2.4.24/include/asm-i386/system.h
2749 --- linux-2.4.24.org/include/asm-i386/system.h 2004-02-04 20:47:40.963003981 +0100
2750 +++ linux-2.4.24/include/asm-i386/system.h 2004-02-04 20:52:53.759943026 +0100
2752 struct task_struct; /* one of the stranger aspects of C forward declarations.. */
2753 extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
2755 -#define prepare_to_switch() do { } while(0)
2756 #define switch_to(prev,next,last) do { \
2757 asm volatile("pushl %%esi\n\t" \
2760 "movl %%esp,%0\n\t" /* save ESP */ \
2761 - "movl %3,%%esp\n\t" /* restore ESP */ \
2762 + "movl %2,%%esp\n\t" /* restore ESP */ \
2763 "movl $1f,%1\n\t" /* save EIP */ \
2764 - "pushl %4\n\t" /* restore EIP */ \
2765 + "pushl %3\n\t" /* restore EIP */ \
2766 "jmp __switch_to\n" \
2771 - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
2773 + :"=m" (prev->thread.esp),"=m" (prev->thread.eip) \
2774 :"m" (next->thread.esp),"m" (next->thread.eip), \
2775 - "a" (prev), "d" (next), \
2777 + "a" (prev), "d" (next)); \
2780 #define _set_base(addr,base) do { unsigned long __pr; \
2781 diff -urN linux-2.4.24.org/include/asm-ia64/bitops.h linux-2.4.24/include/asm-ia64/bitops.h
2782 --- linux-2.4.24.org/include/asm-ia64/bitops.h 2004-02-04 20:48:16.659579072 +0100
2783 +++ linux-2.4.24/include/asm-ia64/bitops.h 2004-02-04 20:52:53.793935955 +0100
2786 * Copyright (C) 1998-2003 Hewlett-Packard Co
2787 * David Mosberger-Tang <davidm@hpl.hp.com>
2789 + * 02/06/02 find_next_bit() and find_first_bit() added from Erich Focht's ia64 O(1)
2793 #include <linux/types.h>
2798 + * __clear_bit - Clears a bit in memory (non-atomic version)
2800 +static __inline__ void
2801 +__clear_bit (int nr, volatile void *addr)
2803 + volatile __u32 *p = (__u32 *) addr + (nr >> 5);
2804 + __u32 m = 1 << (nr & 31);
2809 * change_bit - Toggle a bit in memory
2811 * @addr: Address to start counting from
2812 @@ -266,12 +280,11 @@
2816 - * ffz - find the first zero bit in a memory region
2817 - * @x: The address to start the search at
2818 + * ffz - find the first zero bit in a long word
2819 + * @x: The long word to find the bit in
2821 - * Returns the bit-number (0..63) of the first (least significant) zero bit, not
2822 - * the number of the byte containing a bit. Undefined if no zero exists, so
2823 - * code should check against ~0UL first...
2824 + * Returns the bit-number (0..63) of the first (least significant) zero bit. Undefined if
2825 + * no zero exists, so code should check against ~0UL first...
2827 static inline unsigned long
2828 ffz (unsigned long x)
2829 @@ -297,6 +310,21 @@
2834 + * __ffs - find first bit in word.
2835 + * @x: The word to search
2837 + * Undefined if no bit exists, so code should check against 0 first.
2839 +static __inline__ unsigned long
2840 +__ffs (unsigned long x)
2842 + unsigned long result;
2844 + __asm__ ("popcnt %0=%1" : "=r" (result) : "r" ((x - 1) & ~x));
2851 @@ -313,6 +341,12 @@
2852 return exp - 0xffff;
2858 + return ia64_fls((unsigned int) x);
2862 * ffs: find first bit set. This is defined the same way as the libc and compiler builtin
2863 * ffs routines, therefore differs in spirit from the above ffz (man ffs): it operates on
2864 @@ -385,8 +419,53 @@
2866 #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
2869 + * Find next bit in a bitmap reasonably efficiently..
2872 +find_next_bit (void *addr, unsigned long size, unsigned long offset)
2874 + unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
2875 + unsigned long result = offset & ~63UL;
2876 + unsigned long tmp;
2878 + if (offset >= size)
2884 + tmp &= ~0UL << offset;
2888 + goto found_middle;
2892 + while (size & ~63UL) {
2893 + if ((tmp = *(p++)))
2894 + goto found_middle;
2902 + tmp &= ~0UL >> (64-size);
2903 + if (tmp == 0UL) /* Are any bits set? */
2904 + return result + size; /* Nope. */
2906 + return result + __ffs(tmp);
2909 +#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
2913 +#define __clear_bit(nr, addr) clear_bit(nr, addr)
2915 #define ext2_set_bit test_and_set_bit
2916 #define ext2_clear_bit test_and_clear_bit
2917 #define ext2_test_bit test_bit
2918 @@ -400,6 +479,16 @@
2919 #define minix_test_bit(nr,addr) test_bit(nr,addr)
2920 #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
2923 +sched_find_first_bit (unsigned long *b)
2925 + if (unlikely(b[0]))
2926 + return __ffs(b[0]);
2927 + if (unlikely(b[1]))
2928 + return 64 + __ffs(b[1]);
2929 + return __ffs(b[2]) + 128;
2932 #endif /* __KERNEL__ */
2934 #endif /* _ASM_IA64_BITOPS_H */
2935 diff -urN linux-2.4.24.org/include/asm-m68k/bitops.h linux-2.4.24/include/asm-m68k/bitops.h
2936 --- linux-2.4.24.org/include/asm-m68k/bitops.h 2004-02-04 20:47:47.882564706 +0100
2937 +++ linux-2.4.24/include/asm-m68k/bitops.h 2004-02-04 20:52:53.798934916 +0100
2939 (__builtin_constant_p(nr) ? \
2940 __constant_clear_bit(nr, vaddr) : \
2941 __generic_clear_bit(nr, vaddr))
2942 +#define __clear_bit(nr,vaddr) clear_bit(nr,vaddr)
2944 static inline void __constant_clear_bit(int nr, volatile void *vaddr)
2946 @@ -238,6 +239,28 @@
2950 +#define __ffs(x) (ffs(x) - 1)
2954 + * Every architecture must define this function. It's the fastest
2955 + * way of searching a 140-bit bitmap where the first 100 bits are
2956 + * unlikely to be set. It's guaranteed that at least one of the 140
2957 + * bits is cleared.
2959 +static inline int sched_find_first_bit(unsigned long *b)
2961 + if (unlikely(b[0]))
2962 + return __ffs(b[0]);
2963 + if (unlikely(b[1]))
2964 + return __ffs(b[1]) + 32;
2965 + if (unlikely(b[2]))
2966 + return __ffs(b[2]) + 64;
2968 + return __ffs(b[3]) + 96;
2969 + return __ffs(b[4]) + 128;
2974 * hweightN: returns the hamming weight (i.e. the number
2975 diff -urN linux-2.4.24.org/include/asm-mips/bitops.h linux-2.4.24/include/asm-mips/bitops.h
2976 --- linux-2.4.24.org/include/asm-mips/bitops.h 2004-02-04 20:47:43.266524847 +0100
2977 +++ linux-2.4.24/include/asm-mips/bitops.h 2004-02-04 20:52:53.820930341 +0100
2980 #ifdef CONFIG_CPU_HAS_LLSC
2982 +#include <asm/mipsregs.h>
2985 * These functions for MIPS ISA > 1 are interrupt and SMP proof and
2986 * interrupt friendly
2987 @@ -593,21 +595,30 @@
2989 * Undefined if no zero exists, so code should check against ~0UL first.
2991 -static __inline__ unsigned long ffz(unsigned long word)
2992 +extern __inline__ unsigned long ffz(unsigned long word)
2995 + unsigned int __res;
2996 + unsigned int mask = 1;
2999 - s = 16; if (word << 16 != 0) s = 0; b += s; word >>= s;
3000 - s = 8; if (word << 24 != 0) s = 0; b += s; word >>= s;
3001 - s = 4; if (word << 28 != 0) s = 0; b += s; word >>= s;
3002 - s = 2; if (word << 30 != 0) s = 0; b += s; word >>= s;
3003 - s = 1; if (word << 31 != 0) s = 0; b += s;
3005 + ".set\tnoreorder\n\t"
3008 + "1:\tand\t$1,%2,%1\n\t"
3016 + : "=&r" (__res), "=r" (mask)
3017 + : "r" (word), "1" (mask)
3028 diff -urN linux-2.4.24.org/include/asm-mips64/bitops.h linux-2.4.24/include/asm-mips64/bitops.h
3029 --- linux-2.4.24.org/include/asm-mips64/bitops.h 2004-02-04 20:48:21.702530138 +0100
3030 +++ linux-2.4.24/include/asm-mips64/bitops.h 2004-02-04 20:52:53.873919319 +0100
3033 #include <asm/system.h>
3034 #include <asm/sgidefs.h>
3035 +#include <asm/mipsregs.h>
3038 * set_bit - Atomically set a bit in memory
3040 * Note that @nr may be almost arbitrarily large; this function is not
3041 * restricted to acting on a single-word quantity.
3043 -static inline void set_bit(unsigned long nr, volatile void *addr)
3044 +extern __inline__ void
3045 +set_bit(unsigned long nr, volatile void *addr)
3047 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3050 * If it's called on the same region of memory simultaneously, the effect
3051 * may be that only one operation succeeds.
3053 -static inline void __set_bit(int nr, volatile void * addr)
3054 +extern __inline__ void __set_bit(int nr, volatile void * addr)
3056 unsigned long * m = ((unsigned long *) addr) + (nr >> 6);
3059 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
3060 * in order to ensure changes are visible on other processors.
3062 -static inline void clear_bit(unsigned long nr, volatile void *addr)
3063 +extern __inline__ void
3064 +clear_bit(unsigned long nr, volatile void *addr)
3066 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3069 * Note that @nr may be almost arbitrarily large; this function is not
3070 * restricted to acting on a single-word quantity.
3072 -static inline void change_bit(unsigned long nr, volatile void *addr)
3073 +extern __inline__ void
3074 +change_bit(unsigned long nr, volatile void *addr)
3076 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3079 * If it's called on the same region of memory simultaneously, the effect
3080 * may be that only one operation succeeds.
3082 -static inline void __change_bit(int nr, volatile void * addr)
3083 +extern __inline__ void __change_bit(int nr, volatile void * addr)
3085 unsigned long * m = ((unsigned long *) addr) + (nr >> 6);
3088 * This operation is atomic and cannot be reordered.
3089 * It also implies a memory barrier.
3091 -static inline unsigned long test_and_set_bit(unsigned long nr,
3092 - volatile void *addr)
3093 +extern __inline__ unsigned long
3094 +test_and_set_bit(unsigned long nr, volatile void *addr)
3096 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3097 unsigned long temp, res;
3099 * If two examples of this operation race, one can appear to succeed
3100 * but actually fail. You must protect multiple accesses with a lock.
3102 -static inline int __test_and_set_bit(int nr, volatile void *addr)
3103 +extern __inline__ int
3104 +__test_and_set_bit(int nr, volatile void * addr)
3106 unsigned long mask, retval;
3107 long *a = (unsigned long *) addr;
3109 * This operation is atomic and cannot be reordered.
3110 * It also implies a memory barrier.
3112 -static inline unsigned long test_and_clear_bit(unsigned long nr,
3113 - volatile void *addr)
3114 +extern __inline__ unsigned long
3115 +test_and_clear_bit(unsigned long nr, volatile void *addr)
3117 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3118 unsigned long temp, res;
3120 * If two examples of this operation race, one can appear to succeed
3121 * but actually fail. You must protect multiple accesses with a lock.
3123 -static inline int __test_and_clear_bit(int nr, volatile void * addr)
3124 +extern __inline__ int
3125 +__test_and_clear_bit(int nr, volatile void * addr)
3127 unsigned long mask, retval;
3128 unsigned long *a = (unsigned long *) addr;
3130 * This operation is atomic and cannot be reordered.
3131 * It also implies a memory barrier.
3133 -static inline unsigned long test_and_change_bit(unsigned long nr,
3134 - volatile void *addr)
3135 +extern __inline__ unsigned long
3136 +test_and_change_bit(unsigned long nr, volatile void *addr)
3138 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3139 unsigned long temp, res;
3141 * If two examples of this operation race, one can appear to succeed
3142 * but actually fail. You must protect multiple accesses with a lock.
3144 -static inline int __test_and_change_bit(int nr, volatile void *addr)
3145 +extern __inline__ int
3146 +__test_and_change_bit(int nr, volatile void * addr)
3148 unsigned long mask, retval;
3149 unsigned long *a = (unsigned long *) addr;
3151 * @nr: bit number to test
3152 * @addr: Address to start counting from
3154 -static inline int test_bit(int nr, volatile void * addr)
3155 +extern __inline__ unsigned long
3156 +test_bit(int nr, volatile void * addr)
3158 return 1UL & (((const volatile unsigned long *) addr)[nr >> SZLONG_LOG] >> (nr & SZLONG_MASK));
3160 @@ -313,19 +321,20 @@
3162 * Undefined if no zero exists, so code should check against ~0UL first.
3164 -static __inline__ unsigned long ffz(unsigned long word)
3165 +extern __inline__ unsigned long ffz(unsigned long word)
3171 - s = 32; if (word << 32 != 0) s = 0; b += s; word >>= s;
3172 - s = 16; if (word << 48 != 0) s = 0; b += s; word >>= s;
3173 - s = 8; if (word << 56 != 0) s = 0; b += s; word >>= s;
3174 - s = 4; if (word << 60 != 0) s = 0; b += s; word >>= s;
3175 - s = 2; if (word << 62 != 0) s = 0; b += s; word >>= s;
3176 - s = 1; if (word << 63 != 0) s = 0; b += s;
3178 + if (word & 0x00000000ffffffffUL) { k -= 32; word <<= 32; }
3179 + if (word & 0x0000ffff00000000UL) { k -= 16; word <<= 16; }
3180 + if (word & 0x00ff000000000000UL) { k -= 8; word <<= 8; }
3181 + if (word & 0x0f00000000000000UL) { k -= 4; word <<= 4; }
3182 + if (word & 0x3000000000000000UL) { k -= 2; word <<= 2; }
3183 + if (word & 0x4000000000000000UL) { k -= 1; }
3191 * @offset: The bitnumber to start searching at
3192 * @size: The maximum size to search
3194 -static inline unsigned long find_next_zero_bit(void *addr, unsigned long size,
3195 - unsigned long offset)
3196 +extern __inline__ unsigned long
3197 +find_next_zero_bit(void *addr, unsigned long size, unsigned long offset)
3199 unsigned long *p = ((unsigned long *) addr) + (offset >> SZLONG_LOG);
3200 unsigned long result = offset & ~SZLONG_MASK;
3202 #define hweight16(x) generic_hweight16(x)
3203 #define hweight8(x) generic_hweight8(x)
3205 -static inline int __test_and_set_le_bit(unsigned long nr, void * addr)
3207 +__test_and_set_le_bit(unsigned long nr, void * addr
3209 unsigned char *ADDR = (unsigned char *) addr;
3215 -static inline int __test_and_clear_le_bit(unsigned long nr, void * addr)
3217 +__test_and_clear_le_bit(unsigned long nr, void * addr)
3219 unsigned char *ADDR = (unsigned char *) addr;
3225 -static inline int test_le_bit(unsigned long nr, const void * addr)
3227 +test_le_bit(unsigned long nr, const void * addr)
3229 const unsigned char *ADDR = (const unsigned char *) addr;
3235 -static inline unsigned long find_next_zero_le_bit(void *addr,
3236 +extern inline unsigned long find_next_zero_le_bit(void *addr,
3237 unsigned long size, unsigned long offset)
3239 unsigned int *p = ((unsigned int *) addr) + (offset >> 5);
3240 diff -urN linux-2.4.24.org/include/asm-ppc/bitops.h linux-2.4.24/include/asm-ppc/bitops.h
3241 --- linux-2.4.24.org/include/asm-ppc/bitops.h 2004-02-04 20:47:57.992461840 +0100
3242 +++ linux-2.4.24/include/asm-ppc/bitops.h 2004-02-04 20:52:53.902913289 +0100
3244 #define _PPC_BITOPS_H
3246 #include <linux/config.h>
3247 +#include <linux/compiler.h>
3248 #include <asm/byteorder.h>
3249 #include <asm/atomic.h>
3252 * These used to be if'd out here because using : "cc" as a constraint
3253 * resulted in errors from egcs. Things appear to be OK with gcc-2.95.
3255 -static __inline__ void set_bit(int nr, volatile void * addr)
3256 +static __inline__ void set_bit(int nr, volatile unsigned long * addr)
3259 unsigned long mask = 1 << (nr & 0x1f);
3262 * non-atomic version
3264 -static __inline__ void __set_bit(int nr, volatile void *addr)
3265 +static __inline__ void __set_bit(int nr, volatile unsigned long *addr)
3267 unsigned long mask = 1 << (nr & 0x1f);
3268 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3270 #define smp_mb__before_clear_bit() smp_mb()
3271 #define smp_mb__after_clear_bit() smp_mb()
3273 -static __inline__ void clear_bit(int nr, volatile void *addr)
3274 +static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
3277 unsigned long mask = 1 << (nr & 0x1f);
3280 * non-atomic version
3282 -static __inline__ void __clear_bit(int nr, volatile void *addr)
3283 +static __inline__ void __clear_bit(int nr, volatile unsigned long *addr)
3285 unsigned long mask = 1 << (nr & 0x1f);
3286 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3291 -static __inline__ void change_bit(int nr, volatile void *addr)
3292 +static __inline__ void change_bit(int nr, volatile unsigned long *addr)
3295 unsigned long mask = 1 << (nr & 0x1f);
3298 * non-atomic version
3300 -static __inline__ void __change_bit(int nr, volatile void *addr)
3301 +static __inline__ void __change_bit(int nr, volatile unsigned long *addr)
3303 unsigned long mask = 1 << (nr & 0x1f);
3304 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3307 * test_and_*_bit do imply a memory barrier (?)
3309 -static __inline__ int test_and_set_bit(int nr, volatile void *addr)
3310 +static __inline__ int test_and_set_bit(int nr, volatile unsigned long *addr)
3312 unsigned int old, t;
3313 unsigned int mask = 1 << (nr & 0x1f);
3316 * non-atomic version
3318 -static __inline__ int __test_and_set_bit(int nr, volatile void *addr)
3319 +static __inline__ int __test_and_set_bit(int nr, volatile unsigned long *addr)
3321 unsigned long mask = 1 << (nr & 0x1f);
3322 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3324 return (old & mask) != 0;
3327 -static __inline__ int test_and_clear_bit(int nr, volatile void *addr)
3328 +static __inline__ int test_and_clear_bit(int nr, volatile unsigned long *addr)
3330 unsigned int old, t;
3331 unsigned int mask = 1 << (nr & 0x1f);
3334 * non-atomic version
3336 -static __inline__ int __test_and_clear_bit(int nr, volatile void *addr)
3337 +static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr)
3339 unsigned long mask = 1 << (nr & 0x1f);
3340 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3342 return (old & mask) != 0;
3345 -static __inline__ int test_and_change_bit(int nr, volatile void *addr)
3346 +static __inline__ int test_and_change_bit(int nr, volatile unsigned long *addr)
3348 unsigned int old, t;
3349 unsigned int mask = 1 << (nr & 0x1f);
3352 * non-atomic version
3354 -static __inline__ int __test_and_change_bit(int nr, volatile void *addr)
3355 +static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr)
3357 unsigned long mask = 1 << (nr & 0x1f);
3358 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3360 return (old & mask) != 0;
3363 -static __inline__ int test_bit(int nr, __const__ volatile void *addr)
3364 +static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr)
3366 __const__ unsigned int *p = (__const__ unsigned int *) addr;
3371 /* Return the bit position of the most significant 1 bit in a word */
3372 -static __inline__ int __ilog2(unsigned int x)
3373 +static __inline__ int __ilog2(unsigned long x)
3377 @@ -234,13 +235,18 @@
3381 -static __inline__ int ffz(unsigned int x)
3382 +static __inline__ int ffz(unsigned long x)
3386 return __ilog2(x & -x);
3389 +static inline int __ffs(unsigned long x)
3391 + return __ilog2(x & -x);
3395 * ffs: find first bit set. This is defined the same way as
3396 * the libc and compiler builtin ffs routines, therefore
3397 @@ -252,6 +258,18 @@
3401 + * fls: find last (most-significant) bit set.
3402 + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
3404 +static __inline__ int fls(unsigned int x)
3408 + asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x));
3413 * hweightN: returns the hamming weight (i.e. the number
3414 * of bits set) of a N-bit word
3416 @@ -261,13 +279,86 @@
3417 #define hweight8(x) generic_hweight8(x)
3420 + * Find the first bit set in a 140-bit bitmap.
3421 + * The first 100 bits are unlikely to be set.
3423 +static inline int sched_find_first_bit(unsigned long *b)
3425 + if (unlikely(b[0]))
3426 + return __ffs(b[0]);
3427 + if (unlikely(b[1]))
3428 + return __ffs(b[1]) + 32;
3429 + if (unlikely(b[2]))
3430 + return __ffs(b[2]) + 64;
3432 + return __ffs(b[3]) + 96;
3433 + return __ffs(b[4]) + 128;
3437 + * find_next_bit - find the next set bit in a memory region
3438 + * @addr: The address to base the search on
3439 + * @offset: The bitnumber to start searching at
3440 + * @size: The maximum size to search
3442 +static __inline__ unsigned long find_next_bit(unsigned long *addr,
3443 + unsigned long size, unsigned long offset)
3445 + unsigned int *p = ((unsigned int *) addr) + (offset >> 5);
3446 + unsigned int result = offset & ~31UL;
3449 + if (offset >= size)
3455 + tmp &= ~0UL << offset;
3459 + goto found_middle;
3463 + while (size >= 32) {
3464 + if ((tmp = *p++) != 0)
3465 + goto found_middle;
3474 + tmp &= ~0UL >> (32 - size);
3475 + if (tmp == 0UL) /* Are any bits set? */
3476 + return result + size; /* Nope. */
3478 + return result + __ffs(tmp);
3482 + * find_first_bit - find the first set bit in a memory region
3483 + * @addr: The address to start the search at
3484 + * @size: The maximum size to search
3486 + * Returns the bit-number of the first set bit, not the number of the byte
3487 + * containing a bit.
3489 +#define find_first_bit(addr, size) \
3490 + find_next_bit((addr), (size), 0)
3493 * This implementation of find_{first,next}_zero_bit was stolen from
3494 * Linus' asm-alpha/bitops.h.
3496 #define find_first_zero_bit(addr, size) \
3497 find_next_zero_bit((addr), (size), 0)
3499 -static __inline__ unsigned long find_next_zero_bit(void * addr,
3500 +static __inline__ unsigned long find_next_zero_bit(unsigned long * addr,
3501 unsigned long size, unsigned long offset)
3503 unsigned int * p = ((unsigned int *) addr) + (offset >> 5);
3508 -#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, addr)
3509 -#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, addr)
3510 +#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
3511 +#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
3513 static __inline__ int ext2_test_bit(int nr, __const__ void * addr)
3515 diff -urN linux-2.4.24.org/include/asm-ppc/smp.h linux-2.4.24/include/asm-ppc/smp.h
3516 --- linux-2.4.24.org/include/asm-ppc/smp.h 2004-02-04 20:47:58.116436054 +0100
3517 +++ linux-2.4.24/include/asm-ppc/smp.h 2004-02-04 20:52:53.906912457 +0100
3519 #define cpu_logical_map(cpu) (cpu)
3520 #define cpu_number_map(x) (x)
3522 -#define smp_processor_id() (current->processor)
3523 +#define smp_processor_id() (current->cpu)
3525 extern int smp_hw_index[NR_CPUS];
3526 #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()])
3527 diff -urN linux-2.4.24.org/include/asm-ppc64/bitops.h linux-2.4.24/include/asm-ppc64/bitops.h
3528 --- linux-2.4.24.org/include/asm-ppc64/bitops.h 2004-02-04 20:47:31.682934246 +0100
3529 +++ linux-2.4.24/include/asm-ppc64/bitops.h 2004-02-04 20:52:53.961901020 +0100
3531 #define smp_mb__before_clear_bit() smp_mb()
3532 #define smp_mb__after_clear_bit() smp_mb()
3534 -static __inline__ int test_bit(unsigned long nr, __const__ volatile void *addr)
3535 +static __inline__ int test_bit(unsigned long nr, __const__ volatile unsigned long *addr)
3537 return (1UL & (((__const__ long *) addr)[nr >> 6] >> (nr & 63)));
3540 -static __inline__ void set_bit(unsigned long nr, volatile void *addr)
3541 +static __inline__ void set_bit(unsigned long nr, volatile unsigned long *addr)
3544 unsigned long mask = 1UL << (nr & 0x3f);
3549 -static __inline__ void clear_bit(unsigned long nr, volatile void *addr)
3550 +static __inline__ void clear_bit(unsigned long nr, volatile unsigned long *addr)
3553 unsigned long mask = 1UL << (nr & 0x3f);
3558 -static __inline__ void change_bit(unsigned long nr, volatile void *addr)
3559 +static __inline__ void change_bit(unsigned long nr, volatile unsigned long *addr)
3562 unsigned long mask = 1UL << (nr & 0x3f);
3567 -static __inline__ int test_and_set_bit(unsigned long nr, volatile void *addr)
3568 +static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
3570 unsigned long old, t;
3571 unsigned long mask = 1UL << (nr & 0x3f);
3573 return (old & mask) != 0;
3576 -static __inline__ int test_and_clear_bit(unsigned long nr, volatile void *addr)
3577 +static __inline__ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
3579 unsigned long old, t;
3580 unsigned long mask = 1UL << (nr & 0x3f);
3582 return (old & mask) != 0;
3585 -static __inline__ int test_and_change_bit(unsigned long nr, volatile void *addr)
3586 +static __inline__ int test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
3588 unsigned long old, t;
3589 unsigned long mask = 1UL << (nr & 0x3f);
3592 * non-atomic versions
3594 -static __inline__ void __set_bit(unsigned long nr, volatile void *addr)
3595 +static __inline__ void __set_bit(unsigned long nr, volatile unsigned long *addr)
3597 unsigned long mask = 1UL << (nr & 0x3f);
3598 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3603 -static __inline__ void __clear_bit(unsigned long nr, volatile void *addr)
3604 +static __inline__ void __clear_bit(unsigned long nr, volatile unsigned long *addr)
3606 unsigned long mask = 1UL << (nr & 0x3f);
3607 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3612 -static __inline__ void __change_bit(unsigned long nr, volatile void *addr)
3613 +static __inline__ void __change_bit(unsigned long nr, volatile unsigned long *addr)
3615 unsigned long mask = 1UL << (nr & 0x3f);
3616 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3621 -static __inline__ int __test_and_set_bit(unsigned long nr, volatile void *addr)
3622 +static __inline__ int __test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
3624 unsigned long mask = 1UL << (nr & 0x3f);
3625 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3627 return (old & mask) != 0;
3630 -static __inline__ int __test_and_clear_bit(unsigned long nr, volatile void *addr)
3631 +static __inline__ int __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
3633 unsigned long mask = 1UL << (nr & 0x3f);
3634 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3636 return (old & mask) != 0;
3639 -static __inline__ int __test_and_change_bit(unsigned long nr, volatile void *addr)
3640 +static __inline__ int __test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
3642 unsigned long mask = 1UL << (nr & 0x3f);
3643 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3644 diff -urN linux-2.4.24.org/include/asm-s390/bitops.h linux-2.4.24/include/asm-s390/bitops.h
3645 --- linux-2.4.24.org/include/asm-s390/bitops.h 2004-02-04 20:48:24.809883809 +0100
3646 +++ linux-2.4.24/include/asm-s390/bitops.h 2004-02-04 20:52:53.990894989 +0100
3647 @@ -47,272 +47,217 @@
3648 extern const char _oi_bitmap[];
3649 extern const char _ni_bitmap[];
3650 extern const char _zb_findmap[];
3651 +extern const char _sb_findmap[];
3655 * SMP save set_bit routine based on compare and swap (CS)
3657 -static __inline__ void set_bit_cs(int nr, volatile void * addr)
3658 +static inline void set_bit_cs(int nr, volatile void *ptr)
3660 - unsigned long bits, mask;
3661 - __asm__ __volatile__(
3662 + unsigned long addr, old, new, mask;
3664 + addr = (unsigned long) ptr;
3666 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3667 - " nr %2,%1\n" /* isolate last 2 bits of address */
3668 - " xr %1,%2\n" /* make addr % 4 == 0 */
3670 - " ar %0,%2\n" /* add alignement to bitnr */
3671 + addr ^= addr & 3; /* align address to 4 */
3672 + nr += (addr & 3) << 3; /* add alignment to bit number */
3675 - " nr %2,%0\n" /* make shift value */
3679 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3680 - " sll %3,0(%2)\n" /* make OR mask */
3682 - "0: lr %2,%0\n" /* CS loop starts here */
3683 - " or %2,%3\n" /* set bit */
3684 - " cs %0,%2,0(%1)\n"
3686 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
3687 - : "cc", "memory" );
3688 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3689 + mask = 1UL << (nr & 31); /* make OR mask */
3694 + " cs %0,%1,0(%4)\n"
3696 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3697 + : "d" (mask), "a" (addr)
3702 * SMP save clear_bit routine based on compare and swap (CS)
3704 -static __inline__ void clear_bit_cs(int nr, volatile void * addr)
3705 +static inline void clear_bit_cs(int nr, volatile void *ptr)
3707 - static const int minusone = -1;
3708 - unsigned long bits, mask;
3709 - __asm__ __volatile__(
3710 + unsigned long addr, old, new, mask;
3712 + addr = (unsigned long) ptr;
3714 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3715 - " nr %2,%1\n" /* isolate last 2 bits of address */
3716 - " xr %1,%2\n" /* make addr % 4 == 0 */
3718 - " ar %0,%2\n" /* add alignement to bitnr */
3719 + addr ^= addr & 3; /* align address to 4 */
3720 + nr += (addr & 3) << 3; /* add alignment to bit number */
3723 - " nr %2,%0\n" /* make shift value */
3727 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3729 - " x %3,%4\n" /* make AND mask */
3731 - "0: lr %2,%0\n" /* CS loop starts here */
3732 - " nr %2,%3\n" /* clear bit */
3733 - " cs %0,%2,0(%1)\n"
3735 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask)
3736 - : "m" (minusone) : "cc", "memory" );
3737 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3738 + mask = ~(1UL << (nr & 31)); /* make AND mask */
3743 + " cs %0,%1,0(%4)\n"
3745 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3746 + : "d" (mask), "a" (addr)
3751 * SMP save change_bit routine based on compare and swap (CS)
3753 -static __inline__ void change_bit_cs(int nr, volatile void * addr)
3754 +static inline void change_bit_cs(int nr, volatile void *ptr)
3756 - unsigned long bits, mask;
3757 - __asm__ __volatile__(
3758 + unsigned long addr, old, new, mask;
3760 + addr = (unsigned long) ptr;
3762 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3763 - " nr %2,%1\n" /* isolate last 2 bits of address */
3764 - " xr %1,%2\n" /* make addr % 4 == 0 */
3766 - " ar %0,%2\n" /* add alignement to bitnr */
3767 + addr ^= addr & 3; /* align address to 4 */
3768 + nr += (addr & 3) << 3; /* add alignment to bit number */
3771 - " nr %2,%0\n" /* make shift value */
3775 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3776 - " sll %3,0(%2)\n" /* make XR mask */
3778 - "0: lr %2,%0\n" /* CS loop starts here */
3779 - " xr %2,%3\n" /* change bit */
3780 - " cs %0,%2,0(%1)\n"
3782 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
3783 - : "cc", "memory" );
3784 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3785 + mask = 1UL << (nr & 31); /* make XOR mask */
3790 + " cs %0,%1,0(%4)\n"
3792 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3793 + : "d" (mask), "a" (addr)
3798 * SMP save test_and_set_bit routine based on compare and swap (CS)
3800 -static __inline__ int test_and_set_bit_cs(int nr, volatile void * addr)
3801 +static inline int test_and_set_bit_cs(int nr, volatile void *ptr)
3803 - unsigned long bits, mask;
3804 - __asm__ __volatile__(
3805 + unsigned long addr, old, new, mask;
3807 + addr = (unsigned long) ptr;
3809 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3810 - " nr %2,%1\n" /* isolate last 2 bits of address */
3811 - " xr %1,%2\n" /* make addr % 4 == 0 */
3813 - " ar %0,%2\n" /* add alignement to bitnr */
3814 + addr ^= addr & 3; /* align address to 4 */
3815 + nr += (addr & 3) << 3; /* add alignment to bit number */
3818 - " nr %2,%0\n" /* make shift value */
3822 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3823 - " sll %3,0(%2)\n" /* make OR mask */
3825 - "0: lr %2,%0\n" /* CS loop starts here */
3826 - " or %2,%3\n" /* set bit */
3827 - " cs %0,%2,0(%1)\n"
3829 - " nr %0,%3\n" /* isolate old bit */
3830 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
3831 - : "cc", "memory" );
3833 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3834 + mask = 1UL << (nr & 31); /* make OR/test mask */
3839 + " cs %0,%1,0(%4)\n"
3841 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3842 + : "d" (mask), "a" (addr)
3844 + return (old & mask) != 0;
3848 * SMP save test_and_clear_bit routine based on compare and swap (CS)
3850 -static __inline__ int test_and_clear_bit_cs(int nr, volatile void * addr)
3851 +static inline int test_and_clear_bit_cs(int nr, volatile void *ptr)
3853 - static const int minusone = -1;
3854 - unsigned long bits, mask;
3855 - __asm__ __volatile__(
3856 + unsigned long addr, old, new, mask;
3858 + addr = (unsigned long) ptr;
3860 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3861 - " nr %2,%1\n" /* isolate last 2 bits of address */
3862 - " xr %1,%2\n" /* make addr % 4 == 0 */
3864 - " ar %0,%2\n" /* add alignement to bitnr */
3865 + addr ^= addr & 3; /* align address to 4 */
3866 + nr += (addr & 3) << 3; /* add alignment to bit number */
3869 - " nr %2,%0\n" /* make shift value */
3873 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3876 - " x %3,%4\n" /* make AND mask */
3877 - "0: lr %2,%0\n" /* CS loop starts here */
3878 - " nr %2,%3\n" /* clear bit */
3879 - " cs %0,%2,0(%1)\n"
3882 - " nr %0,%3\n" /* isolate old bit */
3883 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask)
3884 - : "m" (minusone) : "cc", "memory" );
3886 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3887 + mask = ~(1UL << (nr & 31)); /* make AND mask */
3892 + " cs %0,%1,0(%4)\n"
3894 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3895 + : "d" (mask), "a" (addr)
3897 + return (old ^ new) != 0;
3901 * SMP save test_and_change_bit routine based on compare and swap (CS)
3903 -static __inline__ int test_and_change_bit_cs(int nr, volatile void * addr)
3904 +static inline int test_and_change_bit_cs(int nr, volatile void *ptr)
3906 - unsigned long bits, mask;
3907 - __asm__ __volatile__(
3908 + unsigned long addr, old, new, mask;
3910 + addr = (unsigned long) ptr;
3912 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3913 - " nr %2,%1\n" /* isolate last 2 bits of address */
3914 - " xr %1,%2\n" /* make addr % 4 == 0 */
3916 - " ar %0,%2\n" /* add alignement to bitnr */
3917 + addr ^= addr & 3; /* align address to 4 */
3918 + nr += (addr & 3) << 3; /* add alignment to bit number */
3921 - " nr %2,%0\n" /* make shift value */
3925 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3926 - " sll %3,0(%2)\n" /* make OR mask */
3928 - "0: lr %2,%0\n" /* CS loop starts here */
3929 - " xr %2,%3\n" /* change bit */
3930 - " cs %0,%2,0(%1)\n"
3932 - " nr %0,%3\n" /* isolate old bit */
3933 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
3934 - : "cc", "memory" );
3936 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3937 + mask = 1UL << (nr & 31); /* make XOR mask */
3942 + " cs %0,%1,0(%4)\n"
3944 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3945 + : "d" (mask), "a" (addr)
3947 + return (old & mask) != 0;
3949 #endif /* CONFIG_SMP */
3952 * fast, non-SMP set_bit routine
3954 -static __inline__ void __set_bit(int nr, volatile void * addr)
3955 +static inline void __set_bit(int nr, volatile void *ptr)
3957 - unsigned long reg1, reg2;
3958 - __asm__ __volatile__(
3964 - " la %1,0(%1,%3)\n"
3965 - " la %0,0(%0,%4)\n"
3966 - " oc 0(1,%1),0(%0)"
3967 - : "=&a" (reg1), "=&a" (reg2)
3968 - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
3971 -static __inline__ void
3972 -__constant_set_bit(const int nr, volatile void * addr)
3976 - __asm__ __volatile__ ("la 1,%0\n\t"
3978 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
3979 - : : "1", "cc", "memory");
3982 - __asm__ __volatile__ ("la 1,%0\n\t"
3984 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
3985 - : : "1", "cc", "memory" );
3988 - __asm__ __volatile__ ("la 1,%0\n\t"
3990 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
3991 - : : "1", "cc", "memory" );
3994 - __asm__ __volatile__ ("la 1,%0\n\t"
3996 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
3997 - : : "1", "cc", "memory" );
4000 - __asm__ __volatile__ ("la 1,%0\n\t"
4002 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4003 - : : "1", "cc", "memory" );
4006 - __asm__ __volatile__ ("la 1,%0\n\t"
4008 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4009 - : : "1", "cc", "memory" );
4012 - __asm__ __volatile__ ("la 1,%0\n\t"
4014 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4015 - : : "1", "cc", "memory" );
4018 - __asm__ __volatile__ ("la 1,%0\n\t"
4020 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4021 - : : "1", "cc", "memory" );
4024 + unsigned long addr;
4026 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4027 + asm volatile("oc 0(1,%1),0(%2)"
4028 + : "+m" (*(char *) addr)
4029 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
4034 +__constant_set_bit(const int nr, volatile void *ptr)
4036 + unsigned long addr;
4038 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3);
4041 + asm volatile ("oi 0(%1),0x01"
4042 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4045 + asm volatile ("oi 0(%1),0x02"
4046 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4049 + asm volatile ("oi 0(%1),0x04"
4050 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4053 + asm volatile ("oi 0(%1),0x08"
4054 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4057 + asm volatile ("oi 0(%1),0x10"
4058 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4061 + asm volatile ("oi 0(%1),0x20"
4062 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4065 + asm volatile ("oi 0(%1),0x40"
4066 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4069 + asm volatile ("oi 0(%1),0x80"
4070 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4075 #define set_bit_simple(nr,addr) \
4076 @@ -323,76 +268,58 @@
4078 * fast, non-SMP clear_bit routine
4080 -static __inline__ void
4081 -__clear_bit(int nr, volatile void * addr)
4083 +__clear_bit(int nr, volatile void *ptr)
4085 - unsigned long reg1, reg2;
4086 - __asm__ __volatile__(
4092 - " la %1,0(%1,%3)\n"
4093 - " la %0,0(%0,%4)\n"
4094 - " nc 0(1,%1),0(%0)"
4095 - : "=&a" (reg1), "=&a" (reg2)
4096 - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" );
4099 -static __inline__ void
4100 -__constant_clear_bit(const int nr, volatile void * addr)
4104 - __asm__ __volatile__ ("la 1,%0\n\t"
4106 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4107 - : : "1", "cc", "memory" );
4110 - __asm__ __volatile__ ("la 1,%0\n\t"
4112 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4113 - : : "1", "cc", "memory" );
4116 - __asm__ __volatile__ ("la 1,%0\n\t"
4118 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4119 - : : "1", "cc", "memory" );
4122 - __asm__ __volatile__ ("la 1,%0\n\t"
4124 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4125 - : : "1", "cc", "memory" );
4128 - __asm__ __volatile__ ("la 1,%0\n\t"
4130 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4131 - : : "cc", "memory" );
4134 - __asm__ __volatile__ ("la 1,%0\n\t"
4136 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4137 - : : "1", "cc", "memory" );
4140 - __asm__ __volatile__ ("la 1,%0\n\t"
4142 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4143 - : : "1", "cc", "memory" );
4146 - __asm__ __volatile__ ("la 1,%0\n\t"
4148 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4149 - : : "1", "cc", "memory" );
4152 + unsigned long addr;
4154 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4155 + asm volatile("nc 0(1,%1),0(%2)"
4156 + : "+m" (*(char *) addr)
4157 + : "a" (addr), "a" (_ni_bitmap + (nr & 7))
4162 +__constant_clear_bit(const int nr, volatile void *ptr)
4164 + unsigned long addr;
4166 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3);
4169 + asm volatile ("ni 0(%1),0xFE"
4170 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4173 + asm volatile ("ni 0(%1),0xFD"
4174 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4177 + asm volatile ("ni 0(%1),0xFB"
4178 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4181 + asm volatile ("ni 0(%1),0xF7"
4182 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4185 + asm volatile ("ni 0(%1),0xEF"
4186 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4189 + asm volatile ("ni 0(%1),0xDF"
4190 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4193 + asm volatile ("ni 0(%1),0xBF"
4194 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4197 + asm volatile ("ni 0(%1),0x7F"
4198 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4203 #define clear_bit_simple(nr,addr) \
4204 @@ -403,75 +330,57 @@
4206 * fast, non-SMP change_bit routine
4208 -static __inline__ void __change_bit(int nr, volatile void * addr)
4209 +static inline void __change_bit(int nr, volatile void *ptr)
4211 - unsigned long reg1, reg2;
4212 - __asm__ __volatile__(
4218 - " la %1,0(%1,%3)\n"
4219 - " la %0,0(%0,%4)\n"
4220 - " xc 0(1,%1),0(%0)"
4221 - : "=&a" (reg1), "=&a" (reg2)
4222 - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
4225 -static __inline__ void
4226 -__constant_change_bit(const int nr, volatile void * addr)
4230 - __asm__ __volatile__ ("la 1,%0\n\t"
4232 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4233 - : : "cc", "memory" );
4236 - __asm__ __volatile__ ("la 1,%0\n\t"
4238 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4239 - : : "cc", "memory" );
4242 - __asm__ __volatile__ ("la 1,%0\n\t"
4244 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4245 - : : "cc", "memory" );
4248 - __asm__ __volatile__ ("la 1,%0\n\t"
4250 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4251 - : : "cc", "memory" );
4254 - __asm__ __volatile__ ("la 1,%0\n\t"
4256 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4257 - : : "cc", "memory" );
4260 - __asm__ __volatile__ ("la 1,%0\n\t"
4262 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4263 - : : "1", "cc", "memory" );
4266 - __asm__ __volatile__ ("la 1,%0\n\t"
4268 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4269 - : : "1", "cc", "memory" );
4272 - __asm__ __volatile__ ("la 1,%0\n\t"
4274 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4275 - : : "1", "cc", "memory" );
4278 + unsigned long addr;
4280 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4281 + asm volatile("xc 0(1,%1),0(%2)"
4282 + : "+m" (*(char *) addr)
4283 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
4288 +__constant_change_bit(const int nr, volatile void *ptr)
4290 + unsigned long addr;
4292 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3);
4295 + asm volatile ("xi 0(%1),0x01"
4296 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4299 + asm volatile ("xi 0(%1),0x02"
4300 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4303 + asm volatile ("xi 0(%1),0x04"
4304 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4307 + asm volatile ("xi 0(%1),0x08"
4308 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4311 + asm volatile ("xi 0(%1),0x10"
4312 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4315 + asm volatile ("xi 0(%1),0x20"
4316 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4319 + asm volatile ("xi 0(%1),0x40"
4320 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4323 + asm volatile ("xi 0(%1),0x80"
4324 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4329 #define change_bit_simple(nr,addr) \
4330 @@ -482,74 +391,54 @@
4332 * fast, non-SMP test_and_set_bit routine
4334 -static __inline__ int test_and_set_bit_simple(int nr, volatile void * addr)
4335 +static inline int test_and_set_bit_simple(int nr, volatile void *ptr)
4337 - unsigned long reg1, reg2;
4339 - __asm__ __volatile__(
4345 - " la %1,0(%1,%4)\n"
4348 - " la %2,0(%2,%5)\n"
4349 - " oc 0(1,%1),0(%2)"
4350 - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2)
4351 - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
4352 - return oldbit & 1;
4353 + unsigned long addr;
4356 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4357 + ch = *(unsigned char *) addr;
4358 + asm volatile("oc 0(1,%1),0(%2)"
4359 + : "+m" (*(char *) addr)
4360 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
4362 + return (ch >> (nr & 7)) & 1;
4364 #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y)
4367 * fast, non-SMP test_and_clear_bit routine
4369 -static __inline__ int test_and_clear_bit_simple(int nr, volatile void * addr)
4370 +static inline int test_and_clear_bit_simple(int nr, volatile void *ptr)
4372 - unsigned long reg1, reg2;
4374 + unsigned long addr;
4377 - __asm__ __volatile__(
4383 - " la %1,0(%1,%4)\n"
4386 - " la %2,0(%2,%5)\n"
4387 - " nc 0(1,%1),0(%2)"
4388 - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2)
4389 - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" );
4390 - return oldbit & 1;
4391 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4392 + ch = *(unsigned char *) addr;
4393 + asm volatile("nc 0(1,%1),0(%2)"
4394 + : "+m" (*(char *) addr)
4395 + : "a" (addr), "a" (_ni_bitmap + (nr & 7))
4397 + return (ch >> (nr & 7)) & 1;
4399 #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y)
4402 * fast, non-SMP test_and_change_bit routine
4404 -static __inline__ int test_and_change_bit_simple(int nr, volatile void * addr)
4405 +static inline int test_and_change_bit_simple(int nr, volatile void *ptr)
4407 - unsigned long reg1, reg2;
4409 + unsigned long addr;
4412 - __asm__ __volatile__(
4418 - " la %1,0(%1,%4)\n"
4421 - " la %2,0(%2,%5)\n"
4422 - " xc 0(1,%1),0(%2)"
4423 - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2)
4424 - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
4425 - return oldbit & 1;
4426 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4427 + ch = *(unsigned char *) addr;
4428 + asm volatile("xc 0(1,%1),0(%2)"
4429 + : "+m" (*(char *) addr)
4430 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
4432 + return (ch >> (nr & 7)) & 1;
4434 #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y)
4436 @@ -574,25 +463,17 @@
4437 * This routine doesn't need to be atomic.
4440 -static __inline__ int __test_bit(int nr, volatile void * addr)
4441 +static inline int __test_bit(int nr, volatile void *ptr)
4443 - unsigned long reg1, reg2;
4445 + unsigned long addr;
4448 - __asm__ __volatile__(
4454 - " ic %0,0(%2,%4)\n"
4456 - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2)
4457 - : "r" (nr), "a" (addr) : "cc" );
4458 - return oldbit & 1;
4459 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4460 + ch = *(unsigned char *) addr;
4461 + return (ch >> (nr & 7)) & 1;
4464 -static __inline__ int __constant_test_bit(int nr, volatile void * addr) {
4465 +static inline int __constant_test_bit(int nr, volatile void * addr) {
4466 return (((volatile char *) addr)[(nr>>3)^3] & (1<<(nr&7))) != 0;
4471 * Find-bit routines..
4473 -static __inline__ int find_first_zero_bit(void * addr, unsigned size)
4474 +static inline int find_first_zero_bit(void * addr, unsigned size)
4476 unsigned long cmp, count;
4478 @@ -642,7 +523,45 @@
4479 return (res < size) ? res : size;
4482 -static __inline__ int find_next_zero_bit (void * addr, int size, int offset)
4483 +static inline int find_first_bit(void * addr, unsigned size)
4485 + unsigned long cmp, count;
4490 + __asm__(" slr %1,%1\n"
4495 + "0: c %1,0(%0,%4)\n"
4501 + "1: l %2,0(%0,%4)\n"
4504 + " tml %2,0xffff\n"
4508 + "2: tml %2,0x00ff\n"
4513 + " ic %2,0(%2,%5)\n"
4516 + : "=&a" (res), "=&d" (cmp), "=&a" (count)
4517 + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" );
4518 + return (res < size) ? res : size;
4521 +static inline int find_next_zero_bit (void * addr, int size, int offset)
4523 unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
4524 unsigned long bitvec, reg;
4525 @@ -680,11 +599,49 @@
4526 return (offset + res);
4529 +static inline int find_next_bit (void * addr, int size, int offset)
4531 + unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
4532 + unsigned long bitvec, reg;
4533 + int set, bit = offset & 31, res;
4537 + * Look for set bit in first word
4539 + bitvec = (*p) >> bit;
4540 + __asm__(" slr %0,%0\n"
4542 + " tml %1,0xffff\n"
4546 + "0: tml %1,0x00ff\n"
4551 + " ic %1,0(%1,%3)\n"
4553 + : "=&d" (set), "+a" (bitvec), "=&d" (reg)
4554 + : "a" (&_sb_findmap) : "cc" );
4555 + if (set < (32 - bit))
4556 + return set + offset;
4557 + offset += 32 - bit;
4561 + * No set bit yet, search remaining full words for a bit
4563 + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
4564 + return (offset + res);
4568 * ffz = Find First Zero in word. Undefined if no zero exists,
4569 * so code should check against ~0UL first..
4571 -static __inline__ unsigned long ffz(unsigned long word)
4572 +static inline unsigned long ffz(unsigned long word)
4576 @@ -708,40 +665,109 @@
4580 + * __ffs = find first bit in word. Undefined if no bit exists,
4581 + * so code should check against 0UL first..
4583 +static inline unsigned long __ffs(unsigned long word)
4585 + unsigned long reg, result;
4587 + __asm__(" slr %0,%0\n"
4589 + " tml %1,0xffff\n"
4593 + "0: tml %1,0x00ff\n"
4598 + " ic %1,0(%1,%3)\n"
4600 + : "=&d" (result), "+a" (word), "=&d" (reg)
4601 + : "a" (&_sb_findmap) : "cc" );
4606 + * Every architecture must define this function. It's the fastest
4607 + * way of searching a 140-bit bitmap where the first 100 bits are
4608 + * unlikely to be set. It's guaranteed that at least one of the 140
4609 + * bits is cleared.
4611 +static inline int sched_find_first_bit(unsigned long *b)
4613 + return find_first_bit(b, 140);
4617 * ffs: find first bit set. This is defined the same way as
4618 * the libc and compiler builtin ffs routines, therefore
4619 * differs in spirit from the above ffz (man ffs).
4622 -extern int __inline__ ffs (int x)
4623 +extern int inline ffs (int x)
4630 - __asm__(" slr %0,%0\n"
4631 - " tml %1,0xffff\n"
4633 + __asm__(" tml %1,0xffff\n"
4638 "0: tml %1,0x00ff\n"
4643 "1: tml %1,0x000f\n"
4648 "2: tml %1,0x0003\n"
4653 "3: tml %1,0x0001\n"
4657 : "=&d" (r), "+d" (x) : : "cc" );
4663 + * fls: find last bit set.
4665 +extern __inline__ int fls(int x)
4671 + __asm__(" tmh %1,0xffff\n"
4675 + "0: tmh %1,0xff00\n"
4679 + "1: tmh %1,0xf000\n"
4683 + "2: tmh %1,0xc000\n"
4687 + "3: tmh %1,0x8000\n"
4691 + : "+d" (r), "+d" (x) : : "cc" );
4697 #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^24, addr)
4698 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^24, addr)
4699 #define ext2_test_bit(nr, addr) test_bit((nr)^24, addr)
4700 -static __inline__ int ext2_find_first_zero_bit(void *vaddr, unsigned size)
4701 +static inline int ext2_find_first_zero_bit(void *vaddr, unsigned size)
4703 unsigned long cmp, count;
4706 return (res < size) ? res : size;
4709 -static __inline__ int
4711 ext2_find_next_zero_bit(void *vaddr, unsigned size, unsigned offset)
4713 unsigned long *addr = vaddr;
4714 diff -urN linux-2.4.24.org/include/asm-s390x/bitops.h linux-2.4.24/include/asm-s390x/bitops.h
4715 --- linux-2.4.24.org/include/asm-s390x/bitops.h 2004-02-04 20:48:28.470122479 +0100
4716 +++ linux-2.4.24/include/asm-s390x/bitops.h 2004-02-04 20:52:54.030886671 +0100
4717 @@ -51,271 +51,220 @@
4718 extern const char _oi_bitmap[];
4719 extern const char _ni_bitmap[];
4720 extern const char _zb_findmap[];
4721 +extern const char _sb_findmap[];
4725 * SMP save set_bit routine based on compare and swap (CS)
4727 -static __inline__ void set_bit_cs(unsigned long nr, volatile void * addr)
4728 +static inline void set_bit_cs(unsigned long nr, volatile void *ptr)
4730 - unsigned long bits, mask;
4731 - __asm__ __volatile__(
4732 + unsigned long addr, old, new, mask;
4734 + addr = (unsigned long) ptr;
4736 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4737 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4738 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4740 - " agr %0,%2\n" /* add alignement to bitnr */
4741 + addr ^= addr & 7; /* align address to 8 */
4742 + nr += (addr & 7) << 3; /* add alignment to bit number */
4745 - " nr %2,%0\n" /* make shift value */
4749 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4750 - " sllg %3,%3,0(%2)\n" /* make OR mask */
4752 - "0: lgr %2,%0\n" /* CS loop starts here */
4753 - " ogr %2,%3\n" /* set bit */
4754 - " csg %0,%2,0(%1)\n"
4756 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4757 - : "cc", "memory" );
4758 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4759 + mask = 1UL << (nr & 63); /* make OR mask */
4764 + " csg %0,%1,0(%4)\n"
4766 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4767 + : "d" (mask), "a" (addr)
4772 * SMP save clear_bit routine based on compare and swap (CS)
4774 -static __inline__ void clear_bit_cs(unsigned long nr, volatile void * addr)
4775 +static inline void clear_bit_cs(unsigned long nr, volatile void *ptr)
4777 - unsigned long bits, mask;
4778 - __asm__ __volatile__(
4779 + unsigned long addr, old, new, mask;
4781 + addr = (unsigned long) ptr;
4783 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4784 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4785 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4787 - " agr %0,%2\n" /* add alignement to bitnr */
4788 + addr ^= addr & 7; /* align address to 8 */
4789 + nr += (addr & 7) << 3; /* add alignment to bit number */
4792 - " nr %2,%0\n" /* make shift value */
4796 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4798 - " rllg %3,%3,0(%2)\n" /* make AND mask */
4800 - "0: lgr %2,%0\n" /* CS loop starts here */
4801 - " ngr %2,%3\n" /* clear bit */
4802 - " csg %0,%2,0(%1)\n"
4804 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4805 - : "cc", "memory" );
4806 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4807 + mask = ~(1UL << (nr & 63)); /* make AND mask */
4812 + " csg %0,%1,0(%4)\n"
4814 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4815 + : "d" (mask), "a" (addr)
4820 * SMP save change_bit routine based on compare and swap (CS)
4822 -static __inline__ void change_bit_cs(unsigned long nr, volatile void * addr)
4823 +static inline void change_bit_cs(unsigned long nr, volatile void *ptr)
4825 - unsigned long bits, mask;
4826 - __asm__ __volatile__(
4827 + unsigned long addr, old, new, mask;
4829 + addr = (unsigned long) ptr;
4831 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4832 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4833 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4835 - " agr %0,%2\n" /* add alignement to bitnr */
4836 + addr ^= addr & 7; /* align address to 8 */
4837 + nr += (addr & 7) << 3; /* add alignment to bit number */
4840 - " nr %2,%0\n" /* make shift value */
4844 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4845 - " sllg %3,%3,0(%2)\n" /* make XR mask */
4847 - "0: lgr %2,%0\n" /* CS loop starts here */
4848 - " xgr %2,%3\n" /* change bit */
4849 - " csg %0,%2,0(%1)\n"
4851 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4852 - : "cc", "memory" );
4853 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4854 + mask = 1UL << (nr & 63); /* make XOR mask */
4859 + " csg %0,%1,0(%4)\n"
4861 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4862 + : "d" (mask), "a" (addr)
4867 * SMP save test_and_set_bit routine based on compare and swap (CS)
4869 -static __inline__ int
4870 -test_and_set_bit_cs(unsigned long nr, volatile void * addr)
4872 +test_and_set_bit_cs(unsigned long nr, volatile void *ptr)
4874 - unsigned long bits, mask;
4875 - __asm__ __volatile__(
4876 + unsigned long addr, old, new, mask;
4878 + addr = (unsigned long) ptr;
4880 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4881 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4882 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4884 - " agr %0,%2\n" /* add alignement to bitnr */
4885 + addr ^= addr & 7; /* align address to 8 */
4886 + nr += (addr & 7) << 3; /* add alignment to bit number */
4889 - " nr %2,%0\n" /* make shift value */
4893 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4894 - " sllg %3,%3,0(%2)\n" /* make OR mask */
4896 - "0: lgr %2,%0\n" /* CS loop starts here */
4897 - " ogr %2,%3\n" /* set bit */
4898 - " csg %0,%2,0(%1)\n"
4900 - " ngr %0,%3\n" /* isolate old bit */
4901 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4902 - : "cc", "memory" );
4904 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4905 + mask = 1UL << (nr & 63); /* make OR/test mask */
4910 + " csg %0,%1,0(%4)\n"
4912 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4913 + : "d" (mask), "a" (addr)
4915 + return (old & mask) != 0;
4919 * SMP save test_and_clear_bit routine based on compare and swap (CS)
4921 -static __inline__ int
4922 -test_and_clear_bit_cs(unsigned long nr, volatile void * addr)
4924 +test_and_clear_bit_cs(unsigned long nr, volatile void *ptr)
4926 - unsigned long bits, mask;
4927 - __asm__ __volatile__(
4928 + unsigned long addr, old, new, mask;
4930 + addr = (unsigned long) ptr;
4932 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4933 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4934 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4936 - " agr %0,%2\n" /* add alignement to bitnr */
4937 + addr ^= addr & 7; /* align address to 8 */
4938 + nr += (addr & 7) << 3; /* add alignment to bit number */
4941 - " nr %2,%0\n" /* make shift value */
4945 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4946 - " rllg %3,%3,0(%2)\n" /* make AND mask */
4948 - "0: lgr %2,%0\n" /* CS loop starts here */
4949 - " ngr %2,%3\n" /* clear bit */
4950 - " csg %0,%2,0(%1)\n"
4952 - " xgr %0,%2\n" /* isolate old bit */
4953 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4954 - : "cc", "memory" );
4956 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4957 + mask = ~(1UL << (nr & 63)); /* make AND mask */
4962 + " csg %0,%1,0(%4)\n"
4964 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4965 + : "d" (mask), "a" (addr)
4967 + return (old ^ new) != 0;
4971 * SMP save test_and_change_bit routine based on compare and swap (CS)
4973 -static __inline__ int
4974 -test_and_change_bit_cs(unsigned long nr, volatile void * addr)
4976 +test_and_change_bit_cs(unsigned long nr, volatile void *ptr)
4978 - unsigned long bits, mask;
4979 - __asm__ __volatile__(
4980 + unsigned long addr, old, new, mask;
4982 + addr = (unsigned long) ptr;
4984 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4985 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4986 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4988 - " agr %0,%2\n" /* add alignement to bitnr */
4989 + addr ^= addr & 7; /* align address to 8 */
4990 + nr += (addr & 7) << 3; /* add alignment to bit number */
4993 - " nr %2,%0\n" /* make shift value */
4997 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4998 - " sllg %3,%3,0(%2)\n" /* make OR mask */
5000 - "0: lgr %2,%0\n" /* CS loop starts here */
5001 - " xgr %2,%3\n" /* change bit */
5002 - " csg %0,%2,0(%1)\n"
5004 - " ngr %0,%3\n" /* isolate old bit */
5005 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
5006 - : "cc", "memory" );
5008 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
5009 + mask = 1UL << (nr & 63); /* make XOR mask */
5014 + " csg %0,%1,0(%4)\n"
5016 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
5017 + : "d" (mask), "a" (addr)
5019 + return (old & mask) != 0;
5021 #endif /* CONFIG_SMP */
5024 * fast, non-SMP set_bit routine
5026 -static __inline__ void __set_bit(unsigned long nr, volatile void * addr)
5027 +static inline void __set_bit(unsigned long nr, volatile void *ptr)
5029 - unsigned long reg1, reg2;
5030 - __asm__ __volatile__(
5036 - " la %1,0(%1,%3)\n"
5037 - " la %0,0(%0,%4)\n"
5038 - " oc 0(1,%1),0(%0)"
5039 - : "=&a" (reg1), "=&a" (reg2)
5040 - : "a" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
5043 -static __inline__ void
5044 -__constant_set_bit(const unsigned long nr, volatile void * addr)
5048 - __asm__ __volatile__ ("la 1,%0\n\t"
5050 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5051 - : : "1", "cc", "memory");
5054 - __asm__ __volatile__ ("la 1,%0\n\t"
5056 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5057 - : : "1", "cc", "memory" );
5060 - __asm__ __volatile__ ("la 1,%0\n\t"
5062 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5063 - : : "1", "cc", "memory" );
5066 - __asm__ __volatile__ ("la 1,%0\n\t"
5068 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5069 - : : "1", "cc", "memory" );
5072 - __asm__ __volatile__ ("la 1,%0\n\t"
5074 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5075 - : : "1", "cc", "memory" );
5078 - __asm__ __volatile__ ("la 1,%0\n\t"
5080 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5081 - : : "1", "cc", "memory" );
5084 - __asm__ __volatile__ ("la 1,%0\n\t"
5086 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5087 - : : "1", "cc", "memory" );
5090 - __asm__ __volatile__ ("la 1,%0\n\t"
5092 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5093 - : : "1", "cc", "memory" );
5096 + unsigned long addr;
5098 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5099 + asm volatile("oc 0(1,%1),0(%2)"
5100 + : "+m" (*(char *) addr)
5101 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
5106 +__constant_set_bit(const unsigned long nr, volatile void *ptr)
5108 + unsigned long addr;
5110 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7);
5113 + asm volatile ("oi 0(%1),0x01"
5114 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5117 + asm volatile ("oi 0(%1),0x02"
5118 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5121 + asm volatile ("oi 0(%1),0x04"
5122 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5125 + asm volatile ("oi 0(%1),0x08"
5126 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5129 + asm volatile ("oi 0(%1),0x10"
5130 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5133 + asm volatile ("oi 0(%1),0x20"
5134 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5137 + asm volatile ("oi 0(%1),0x40"
5138 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5141 + asm volatile ("oi 0(%1),0x80"
5142 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5147 #define set_bit_simple(nr,addr) \
5148 @@ -326,76 +275,58 @@
5150 * fast, non-SMP clear_bit routine
5152 -static __inline__ void
5153 -__clear_bit(unsigned long nr, volatile void * addr)
5155 +__clear_bit(unsigned long nr, volatile void *ptr)
5157 - unsigned long reg1, reg2;
5158 - __asm__ __volatile__(
5164 - " la %1,0(%1,%3)\n"
5165 - " la %0,0(%0,%4)\n"
5166 - " nc 0(1,%1),0(%0)"
5167 - : "=&a" (reg1), "=&a" (reg2)
5168 - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" );
5171 -static __inline__ void
5172 -__constant_clear_bit(const unsigned long nr, volatile void * addr)
5176 - __asm__ __volatile__ ("la 1,%0\n\t"
5178 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5179 - : : "1", "cc", "memory" );
5182 - __asm__ __volatile__ ("la 1,%0\n\t"
5184 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5185 - : : "1", "cc", "memory" );
5188 - __asm__ __volatile__ ("la 1,%0\n\t"
5190 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5191 - : : "1", "cc", "memory" );
5194 - __asm__ __volatile__ ("la 1,%0\n\t"
5196 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5197 - : : "1", "cc", "memory" );
5200 - __asm__ __volatile__ ("la 1,%0\n\t"
5202 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5203 - : : "cc", "memory" );
5206 - __asm__ __volatile__ ("la 1,%0\n\t"
5208 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5209 - : : "1", "cc", "memory" );
5212 - __asm__ __volatile__ ("la 1,%0\n\t"
5214 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5215 - : : "1", "cc", "memory" );
5218 - __asm__ __volatile__ ("la 1,%0\n\t"
5220 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5221 - : : "1", "cc", "memory" );
5224 + unsigned long addr;
5226 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5227 + asm volatile("nc 0(1,%1),0(%2)"
5228 + : "+m" (*(char *) addr)
5229 + : "a" (addr), "a" (_ni_bitmap + (nr & 7))
5234 +__constant_clear_bit(const unsigned long nr, volatile void *ptr)
5236 + unsigned long addr;
5238 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7);
5241 + asm volatile ("ni 0(%1),0xFE"
5242 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5245 + asm volatile ("ni 0(%1),0xFD"
5246 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5249 + asm volatile ("ni 0(%1),0xFB"
5250 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5253 + asm volatile ("ni 0(%1),0xF7"
5254 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5257 + asm volatile ("ni 0(%1),0xEF"
5258 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5261 + asm volatile ("ni 0(%1),0xDF"
5262 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5265 + asm volatile ("ni 0(%1),0xBF"
5266 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5269 + asm volatile ("ni 0(%1),0x7F"
5270 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5275 #define clear_bit_simple(nr,addr) \
5276 @@ -406,75 +337,57 @@
5278 * fast, non-SMP change_bit routine
5280 -static __inline__ void __change_bit(unsigned long nr, volatile void * addr)
5281 +static inline void __change_bit(unsigned long nr, volatile void *ptr)
5283 - unsigned long reg1, reg2;
5284 - __asm__ __volatile__(
5290 - " la %1,0(%1,%3)\n"
5291 - " la %0,0(%0,%4)\n"
5292 - " xc 0(1,%1),0(%0)"
5293 - : "=&a" (reg1), "=&a" (reg2)
5294 - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
5297 -static __inline__ void
5298 -__constant_change_bit(const unsigned long nr, volatile void * addr)
5302 - __asm__ __volatile__ ("la 1,%0\n\t"
5304 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5305 - : : "cc", "memory" );
5308 - __asm__ __volatile__ ("la 1,%0\n\t"
5310 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5311 - : : "cc", "memory" );
5314 - __asm__ __volatile__ ("la 1,%0\n\t"
5316 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5317 - : : "cc", "memory" );
5320 - __asm__ __volatile__ ("la 1,%0\n\t"
5322 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5323 - : : "cc", "memory" );
5326 - __asm__ __volatile__ ("la 1,%0\n\t"
5328 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5329 - : : "cc", "memory" );
5332 - __asm__ __volatile__ ("la 1,%0\n\t"
5334 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5335 - : : "1", "cc", "memory" );
5338 - __asm__ __volatile__ ("la 1,%0\n\t"
5340 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5341 - : : "1", "cc", "memory" );
5344 - __asm__ __volatile__ ("la 1,%0\n\t"
5346 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5347 - : : "1", "cc", "memory" );
5350 + unsigned long addr;
5352 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5353 + asm volatile("xc 0(1,%1),0(%2)"
5354 + : "+m" (*(char *) addr)
5355 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
5360 +__constant_change_bit(const unsigned long nr, volatile void *ptr)
5362 + unsigned long addr;
5364 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7);
5367 + asm volatile ("xi 0(%1),0x01"
5368 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5371 + asm volatile ("xi 0(%1),0x02"
5372 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5375 + asm volatile ("xi 0(%1),0x04"
5376 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5379 + asm volatile ("xi 0(%1),0x08"
5380 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5383 + asm volatile ("xi 0(%1),0x10"
5384 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5387 + asm volatile ("xi 0(%1),0x20"
5388 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5391 + asm volatile ("xi 0(%1),0x40"
5392 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5395 + asm volatile ("xi 0(%1),0x80"
5396 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5401 #define change_bit_simple(nr,addr) \
5402 @@ -485,77 +398,57 @@
5404 * fast, non-SMP test_and_set_bit routine
5406 -static __inline__ int
5407 -test_and_set_bit_simple(unsigned long nr, volatile void * addr)
5409 +test_and_set_bit_simple(unsigned long nr, volatile void *ptr)
5411 - unsigned long reg1, reg2;
5413 - __asm__ __volatile__(
5419 - " la %1,0(%1,%4)\n"
5422 - " la %2,0(%2,%5)\n"
5423 - " oc 0(1,%1),0(%2)"
5424 - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2)
5425 - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
5426 - return oldbit & 1;
5427 + unsigned long addr;
5430 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5431 + ch = *(unsigned char *) addr;
5432 + asm volatile("oc 0(1,%1),0(%2)"
5433 + : "+m" (*(char *) addr)
5434 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
5436 + return (ch >> (nr & 7)) & 1;
5438 #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y)
5441 * fast, non-SMP test_and_clear_bit routine
5443 -static __inline__ int
5444 -test_and_clear_bit_simple(unsigned long nr, volatile void * addr)
5446 +test_and_clear_bit_simple(unsigned long nr, volatile void *ptr)
5448 - unsigned long reg1, reg2;
5450 + unsigned long addr;
5453 - __asm__ __volatile__(
5459 - " la %1,0(%1,%4)\n"
5462 - " la %2,0(%2,%5)\n"
5463 - " nc 0(1,%1),0(%2)"
5464 - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2)
5465 - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" );
5466 - return oldbit & 1;
5467 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5468 + ch = *(unsigned char *) addr;
5469 + asm volatile("nc 0(1,%1),0(%2)"
5470 + : "+m" (*(char *) addr)
5471 + : "a" (addr), "a" (_ni_bitmap + (nr & 7))
5473 + return (ch >> (nr & 7)) & 1;
5475 #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y)
5478 * fast, non-SMP test_and_change_bit routine
5480 -static __inline__ int
5481 -test_and_change_bit_simple(unsigned long nr, volatile void * addr)
5483 +test_and_change_bit_simple(unsigned long nr, volatile void *ptr)
5485 - unsigned long reg1, reg2;
5487 + unsigned long addr;
5490 - __asm__ __volatile__(
5496 - " la %1,0(%1,%4)\n"
5499 - " la %2,0(%2,%5)\n"
5500 - " xc 0(1,%1),0(%2)"
5501 - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2)
5502 - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
5503 - return oldbit & 1;
5504 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5505 + ch = *(unsigned char *) addr;
5506 + asm volatile("xc 0(1,%1),0(%2)"
5507 + : "+m" (*(char *) addr)
5508 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
5510 + return (ch >> (nr & 7)) & 1;
5512 #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y)
5514 @@ -580,26 +473,18 @@
5515 * This routine doesn't need to be atomic.
5518 -static __inline__ int __test_bit(unsigned long nr, volatile void * addr)
5519 +static inline int __test_bit(unsigned long nr, volatile void *ptr)
5521 - unsigned long reg1, reg2;
5523 + unsigned long addr;
5526 - __asm__ __volatile__(
5532 - " ic %0,0(%2,%4)\n"
5534 - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2)
5535 - : "d" (nr), "a" (addr) : "cc" );
5536 - return oldbit & 1;
5537 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5538 + ch = *(unsigned char *) addr;
5539 + return (ch >> (nr & 7)) & 1;
5542 -static __inline__ int
5543 -__constant_test_bit(unsigned long nr, volatile void * addr) {
5545 +__constant_test_bit(unsigned long nr, volatile void *addr) {
5546 return (((volatile char *) addr)[(nr>>3)^7] & (1<<(nr&7))) != 0;
5551 * Find-bit routines..
5553 -static __inline__ unsigned long
5554 +static inline unsigned long
5555 find_first_zero_bit(void * addr, unsigned long size)
5557 unsigned long res, cmp, count;
5558 @@ -653,7 +538,49 @@
5559 return (res < size) ? res : size;
5562 -static __inline__ unsigned long
5563 +static inline unsigned long
5564 +find_first_bit(void * addr, unsigned long size)
5566 + unsigned long res, cmp, count;
5570 + __asm__(" slgr %1,%1\n"
5575 + "0: cg %1,0(%0,%4)\n"
5581 + "1: lg %2,0(%0,%4)\n"
5586 + " srlg %2,%2,32\n"
5587 + "2: lghi %1,0xff\n"
5588 + " tmll %2,0xffff\n"
5592 + "3: tmll %2,0x00ff\n"
5597 + " ic %2,0(%2,%5)\n"
5600 + : "=&a" (res), "=&d" (cmp), "=&a" (count)
5601 + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" );
5602 + return (res < size) ? res : size;
5605 +static inline unsigned long
5606 find_next_zero_bit (void * addr, unsigned long size, unsigned long offset)
5608 unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
5609 @@ -697,14 +624,56 @@
5610 return (offset + res);
5613 +static inline unsigned long
5614 +find_next_bit (void * addr, unsigned long size, unsigned long offset)
5616 + unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
5617 + unsigned long bitvec, reg;
5618 + unsigned long set, bit = offset & 63, res;
5622 + * Look for zero in first word
5624 + bitvec = (*p) >> bit;
5625 + __asm__(" slgr %0,%0\n"
5629 + " srlg %1,%1,32\n"
5630 + "0: lghi %2,0xff\n"
5631 + " tmll %1,0xffff\n"
5634 + " srlg %1,%1,16\n"
5635 + "1: tmll %1,0x00ff\n"
5640 + " ic %1,0(%1,%3)\n"
5642 + : "=&d" (set), "+a" (bitvec), "=&d" (reg)
5643 + : "a" (&_sb_findmap) : "cc" );
5644 + if (set < (64 - bit))
5645 + return set + offset;
5646 + offset += 64 - bit;
5650 + * No set bit yet, search remaining full words for a bit
5652 + res = find_first_bit (p, size - 64 * (p - (unsigned long *) addr));
5653 + return (offset + res);
5657 * ffz = Find First Zero in word. Undefined if no zero exists,
5658 * so code should check against ~0UL first..
5660 -static __inline__ unsigned long ffz(unsigned long word)
5661 +static inline unsigned long ffz(unsigned long word)
5663 - unsigned long reg;
5665 + unsigned long reg, result;
5667 __asm__(" lhi %2,-1\n"
5669 @@ -730,40 +699,112 @@
5673 + * __ffs = find first bit in word. Undefined if no bit exists,
5674 + * so code should check against 0UL first..
5676 +static inline unsigned long __ffs (unsigned long word)
5678 + unsigned long reg, result;
5680 + __asm__(" slgr %0,%0\n"
5684 + " srlg %1,%1,32\n"
5685 + "0: lghi %2,0xff\n"
5686 + " tmll %1,0xffff\n"
5689 + " srlg %1,%1,16\n"
5690 + "1: tmll %1,0x00ff\n"
5695 + " ic %1,0(%1,%3)\n"
5697 + : "=&d" (result), "+a" (word), "=&d" (reg)
5698 + : "a" (&_sb_findmap) : "cc" );
5703 + * Every architecture must define this function. It's the fastest
5704 + * way of searching a 140-bit bitmap where the first 100 bits are
5705 + * unlikely to be set. It's guaranteed that at least one of the 140
5706 + * bits is cleared.
5708 +static inline int sched_find_first_bit(unsigned long *b)
5710 + return find_first_bit(b, 140);
5714 * ffs: find first bit set. This is defined the same way as
5715 * the libc and compiler builtin ffs routines, therefore
5716 * differs in spirit from the above ffz (man ffs).
5719 -extern int __inline__ ffs (int x)
5720 +extern int inline ffs (int x)
5727 - __asm__(" slr %0,%0\n"
5728 - " tml %1,0xffff\n"
5730 + __asm__(" tml %1,0xffff\n"
5735 "0: tml %1,0x00ff\n"
5740 "1: tml %1,0x000f\n"
5745 "2: tml %1,0x0003\n"
5750 "3: tml %1,0x0001\n"
5754 : "=&d" (r), "+d" (x) : : "cc" );
5760 + * fls: find last bit set.
5762 +extern __inline__ int fls(int x)
5768 + __asm__(" tmh %1,0xffff\n"
5772 + "0: tmh %1,0xff00\n"
5776 + "1: tmh %1,0xf000\n"
5780 + "2: tmh %1,0xc000\n"
5784 + "3: tmh %1,0x8000\n"
5788 + : "+d" (r), "+d" (x) : : "cc" );
5794 #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^56, addr)
5795 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^56, addr)
5796 #define ext2_test_bit(nr, addr) test_bit((nr)^56, addr)
5797 -static __inline__ unsigned long
5798 +static inline unsigned long
5799 ext2_find_first_zero_bit(void *vaddr, unsigned long size)
5801 unsigned long res, cmp, count;
5803 return (res < size) ? res : size;
5806 -static __inline__ unsigned long
5807 +static inline unsigned long
5808 ext2_find_next_zero_bit(void *vaddr, unsigned long size, unsigned long offset)
5810 unsigned long *addr = vaddr;
5811 diff -urN linux-2.4.24.org/include/asm-sparc/bitops.h linux-2.4.24/include/asm-sparc/bitops.h
5812 --- linux-2.4.24.org/include/asm-sparc/bitops.h 2004-02-04 20:47:50.760965997 +0100
5813 +++ linux-2.4.24/include/asm-sparc/bitops.h 2004-02-04 20:52:54.074877521 +0100
5814 @@ -231,6 +231,63 @@
5819 + * __ffs - find first bit in word.
5820 + * @word: The word to search
5822 + * Undefined if no bit exists, so code should check against 0 first.
5824 +static __inline__ int __ffs(unsigned long word)
5828 + if ((word & 0xffff) == 0) {
5832 + if ((word & 0xff) == 0) {
5836 + if ((word & 0xf) == 0) {
5840 + if ((word & 0x3) == 0) {
5844 + if ((word & 0x1) == 0)
5850 + * Every architecture must define this function. It's the fastest
5851 + * way of searching a 140-bit bitmap where the first 100 bits are
5852 + * unlikely to be set. It's guaranteed that at least one of the 140
5853 + * bits is cleared.
5855 +static __inline__ int sched_find_first_bit(unsigned long *b)
5858 + if (unlikely(b[0]))
5859 + return __ffs(b[0]);
5860 + if (unlikely(b[1]))
5861 + return __ffs(b[1]) + 32;
5862 + if (unlikely(b[2]))
5863 + return __ffs(b[2]) + 64;
5865 + return __ffs(b[3]) + 96;
5866 + return __ffs(b[4]) + 128;
5870 + * fls: find last bit set.
5873 +#define fls(x) generic_fls(x)
5876 * ffs: find first bit set. This is defined the same way as
5877 * the libc and compiler builtin ffs routines, therefore
5878 @@ -296,6 +353,32 @@
5879 #define find_first_zero_bit(addr, size) \
5880 find_next_zero_bit((addr), (size), 0)
5883 + * find_next_bit - find the first set bit in a memory region
5884 + * @addr: The address to base the search on
5885 + * @offset: The bitnumber to start searching at
5886 + * @size: The maximum size to search
5888 + * Scheduler induced bitop, do not use.
5890 +static inline int find_next_bit(unsigned long *addr, int size, int offset)
5892 + unsigned long *p = addr + (offset >> 5);
5893 + int num = offset & ~0x1f;
5894 + unsigned long word;
5897 + word &= ~((1 << (offset & 0x1f)) - 1);
5898 + while (num < size) {
5900 + return __ffs(word) + num;
5908 static inline int test_le_bit(int nr, __const__ void * addr)
5910 __const__ unsigned char *ADDR = (__const__ unsigned char *) addr;
5911 diff -urN linux-2.4.24.org/include/asm-sparc/system.h linux-2.4.24/include/asm-sparc/system.h
5912 --- linux-2.4.24.org/include/asm-sparc/system.h 2004-02-04 20:47:50.644990120 +0100
5913 +++ linux-2.4.24/include/asm-sparc/system.h 2004-02-04 20:52:54.110870035 +0100
5916 * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work)
5918 -#define prepare_to_switch() do { \
5919 +#define prepare_arch_switch(rq, next) do { \
5920 __asm__ __volatile__( \
5921 ".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \
5922 "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \
5924 "save %sp, -0x40, %sp\n\t" \
5925 "restore; restore; restore; restore; restore; restore; restore"); \
5927 +#define finish_arch_switch(rq, next) do{ }while(0)
5928 +#define task_running(rq, p) ((rq)->curr == (p))
5930 /* Much care has gone into this code, do not touch it.
5932 diff -urN linux-2.4.24.org/include/asm-sparc64/bitops.h linux-2.4.24/include/asm-sparc64/bitops.h
5933 --- linux-2.4.24.org/include/asm-sparc64/bitops.h 2004-02-04 20:48:02.155595906 +0100
5934 +++ linux-2.4.24/include/asm-sparc64/bitops.h 2004-02-04 20:52:54.137864420 +0100
5938 * bitops.h: Bit string operations on the V9.
5940 * Copyright 1996, 1997 David S. Miller (davem@caip.rutgers.edu)
5942 #ifndef _SPARC64_BITOPS_H
5943 #define _SPARC64_BITOPS_H
5945 +#include <linux/compiler.h>
5946 #include <asm/byteorder.h>
5948 -extern long ___test_and_set_bit(unsigned long nr, volatile void *addr);
5949 -extern long ___test_and_clear_bit(unsigned long nr, volatile void *addr);
5950 -extern long ___test_and_change_bit(unsigned long nr, volatile void *addr);
5951 +extern long ___test_and_set_bit(unsigned long nr, volatile unsigned long *addr);
5952 +extern long ___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr);
5953 +extern long ___test_and_change_bit(unsigned long nr, volatile unsigned long *addr);
5955 #define test_and_set_bit(nr,addr) ({___test_and_set_bit(nr,addr)!=0;})
5956 #define test_and_clear_bit(nr,addr) ({___test_and_clear_bit(nr,addr)!=0;})
5957 @@ -21,109 +22,132 @@
5958 #define change_bit(nr,addr) ((void)___test_and_change_bit(nr,addr))
5960 /* "non-atomic" versions... */
5961 -#define __set_bit(X,Y) \
5962 -do { unsigned long __nr = (X); \
5963 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5964 - *__m |= (1UL << (__nr & 63)); \
5966 -#define __clear_bit(X,Y) \
5967 -do { unsigned long __nr = (X); \
5968 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5969 - *__m &= ~(1UL << (__nr & 63)); \
5971 -#define __change_bit(X,Y) \
5972 -do { unsigned long __nr = (X); \
5973 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5974 - *__m ^= (1UL << (__nr & 63)); \
5976 -#define __test_and_set_bit(X,Y) \
5977 -({ unsigned long __nr = (X); \
5978 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5979 - long __old = *__m; \
5980 - long __mask = (1UL << (__nr & 63)); \
5981 - *__m = (__old | __mask); \
5982 - ((__old & __mask) != 0); \
5984 -#define __test_and_clear_bit(X,Y) \
5985 -({ unsigned long __nr = (X); \
5986 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5987 - long __old = *__m; \
5988 - long __mask = (1UL << (__nr & 63)); \
5989 - *__m = (__old & ~__mask); \
5990 - ((__old & __mask) != 0); \
5992 -#define __test_and_change_bit(X,Y) \
5993 -({ unsigned long __nr = (X); \
5994 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5995 - long __old = *__m; \
5996 - long __mask = (1UL << (__nr & 63)); \
5997 - *__m = (__old ^ __mask); \
5998 - ((__old & __mask) != 0); \
6001 +static __inline__ void __set_bit(int nr, volatile unsigned long *addr)
6003 + volatile unsigned long *m = addr + (nr >> 6);
6005 + *m |= (1UL << (nr & 63));
6008 +static __inline__ void __clear_bit(int nr, volatile unsigned long *addr)
6010 + volatile unsigned long *m = addr + (nr >> 6);
6012 + *m &= ~(1UL << (nr & 63));
6015 +static __inline__ void __change_bit(int nr, volatile unsigned long *addr)
6017 + volatile unsigned long *m = addr + (nr >> 6);
6019 + *m ^= (1UL << (nr & 63));
6022 +static __inline__ int __test_and_set_bit(int nr, volatile unsigned long *addr)
6024 + volatile unsigned long *m = addr + (nr >> 6);
6026 + long mask = (1UL << (nr & 63));
6028 + *m = (old | mask);
6029 + return ((old & mask) != 0);
6032 +static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr)
6034 + volatile unsigned long *m = addr + (nr >> 6);
6036 + long mask = (1UL << (nr & 63));
6038 + *m = (old & ~mask);
6039 + return ((old & mask) != 0);
6042 +static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr)
6044 + volatile unsigned long *m = addr + (nr >> 6);
6046 + long mask = (1UL << (nr & 63));
6048 + *m = (old ^ mask);
6049 + return ((old & mask) != 0);
6052 #define smp_mb__before_clear_bit() do { } while(0)
6053 #define smp_mb__after_clear_bit() do { } while(0)
6055 -extern __inline__ int test_bit(int nr, __const__ void *addr)
6056 +static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr)
6058 - return (1UL & (((__const__ long *) addr)[nr >> 6] >> (nr & 63))) != 0UL;
6059 + return (1UL & ((addr)[nr >> 6] >> (nr & 63))) != 0UL;
6062 /* The easy/cheese version for now. */
6063 -extern __inline__ unsigned long ffz(unsigned long word)
6064 +static __inline__ unsigned long ffz(unsigned long word)
6066 unsigned long result;
6068 -#ifdef ULTRA_HAS_POPULATION_COUNT /* Thanks for nothing Sun... */
6069 - __asm__ __volatile__(
6072 -" xnor %0, %%g1, %%g2\n"
6074 -"1: " : "=&r" (result)
6078 -#if 1 /* def EASY_CHEESE_VERSION */
6085 - unsigned long tmp;
6090 - tmp = ~word & -~word;
6091 - if (!(unsigned)tmp) {
6095 - if (!(unsigned short)tmp) {
6099 - if (!(unsigned char)tmp) {
6103 + * __ffs - find first bit in word.
6104 + * @word: The word to search
6106 + * Undefined if no bit exists, so code should check against 0 first.
6108 +static __inline__ unsigned long __ffs(unsigned long word)
6110 + unsigned long result = 0;
6112 + while (!(word & 1UL)) {
6116 - if (tmp & 0xf0) result += 4;
6117 - if (tmp & 0xcc) result += 2;
6118 - if (tmp & 0xaa) result ++;
6125 + * fls: find last bit set.
6128 +#define fls(x) generic_fls(x)
6133 + * Every architecture must define this function. It's the fastest
6134 + * way of searching a 140-bit bitmap where the first 100 bits are
6135 + * unlikely to be set. It's guaranteed that at least one of the 140
6136 + * bits is cleared.
6138 +static inline int sched_find_first_bit(unsigned long *b)
6140 + if (unlikely(b[0]))
6141 + return __ffs(b[0]);
6142 + if (unlikely(((unsigned int)b[1])))
6143 + return __ffs(b[1]) + 64;
6145 + return __ffs(b[1] >> 32) + 96;
6146 + return __ffs(b[2]) + 128;
6150 * ffs: find first bit set. This is defined the same way as
6151 * the libc and compiler builtin ffs routines, therefore
6152 * differs in spirit from the above ffz (man ffs).
6155 -#define ffs(x) generic_ffs(x)
6156 +static __inline__ int ffs(int x)
6160 + return __ffs((unsigned long)x);
6164 * hweightN: returns the hamming weight (i.e. the number
6167 #ifdef ULTRA_HAS_POPULATION_COUNT
6169 -extern __inline__ unsigned int hweight32(unsigned int w)
6170 +static __inline__ unsigned int hweight32(unsigned int w)
6178 -extern __inline__ unsigned int hweight16(unsigned int w)
6179 +static __inline__ unsigned int hweight16(unsigned int w)
6187 -extern __inline__ unsigned int hweight8(unsigned int w)
6188 +static __inline__ unsigned int hweight8(unsigned int w)
6192 @@ -165,14 +189,69 @@
6194 #endif /* __KERNEL__ */
6197 + * find_next_bit - find the next set bit in a memory region
6198 + * @addr: The address to base the search on
6199 + * @offset: The bitnumber to start searching at
6200 + * @size: The maximum size to search
6202 +static __inline__ unsigned long find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset)
6204 + unsigned long *p = addr + (offset >> 6);
6205 + unsigned long result = offset & ~63UL;
6206 + unsigned long tmp;
6208 + if (offset >= size)
6214 + tmp &= (~0UL << offset);
6218 + goto found_middle;
6222 + while (size & ~63UL) {
6223 + if ((tmp = *(p++)))
6224 + goto found_middle;
6233 + tmp &= (~0UL >> (64 - size));
6234 + if (tmp == 0UL) /* Are any bits set? */
6235 + return result + size; /* Nope. */
6237 + return result + __ffs(tmp);
6241 + * find_first_bit - find the first set bit in a memory region
6242 + * @addr: The address to start the search at
6243 + * @size: The maximum size to search
6245 + * Returns the bit-number of the first set bit, not the number of the byte
6246 + * containing a bit.
6248 +#define find_first_bit(addr, size) \
6249 + find_next_bit((addr), (size), 0)
6251 /* find_next_zero_bit() finds the first zero bit in a bit string of length
6252 * 'size' bits, starting the search at bit 'offset'. This is largely based
6253 * on Linus's ALPHA routines, which are pretty portable BTW.
6256 -extern __inline__ unsigned long find_next_zero_bit(void *addr, unsigned long size, unsigned long offset)
6257 +static __inline__ unsigned long find_next_zero_bit(unsigned long *addr, unsigned long size, unsigned long offset)
6259 - unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
6260 + unsigned long *p = addr + (offset >> 6);
6261 unsigned long result = offset & ~63UL;
6264 @@ -211,15 +290,15 @@
6265 #define find_first_zero_bit(addr, size) \
6266 find_next_zero_bit((addr), (size), 0)
6268 -extern long ___test_and_set_le_bit(int nr, volatile void *addr);
6269 -extern long ___test_and_clear_le_bit(int nr, volatile void *addr);
6270 +extern long ___test_and_set_le_bit(int nr, volatile unsigned long *addr);
6271 +extern long ___test_and_clear_le_bit(int nr, volatile unsigned long *addr);
6273 #define test_and_set_le_bit(nr,addr) ({___test_and_set_le_bit(nr,addr)!=0;})
6274 #define test_and_clear_le_bit(nr,addr) ({___test_and_clear_le_bit(nr,addr)!=0;})
6275 #define set_le_bit(nr,addr) ((void)___test_and_set_le_bit(nr,addr))
6276 #define clear_le_bit(nr,addr) ((void)___test_and_clear_le_bit(nr,addr))
6278 -extern __inline__ int test_le_bit(int nr, __const__ void * addr)
6279 +static __inline__ int test_le_bit(int nr, __const__ unsigned long * addr)
6282 __const__ unsigned char *ADDR = (__const__ unsigned char *) addr;
6284 #define find_first_zero_le_bit(addr, size) \
6285 find_next_zero_le_bit((addr), (size), 0)
6287 -extern __inline__ unsigned long find_next_zero_le_bit(void *addr, unsigned long size, unsigned long offset)
6288 +static __inline__ unsigned long find_next_zero_le_bit(unsigned long *addr, unsigned long size, unsigned long offset)
6290 - unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
6291 + unsigned long *p = addr + (offset >> 6);
6292 unsigned long result = offset & ~63UL;
6295 @@ -271,18 +350,22 @@
6299 -#define ext2_set_bit test_and_set_le_bit
6300 -#define ext2_clear_bit test_and_clear_le_bit
6301 -#define ext2_test_bit test_le_bit
6302 -#define ext2_find_first_zero_bit find_first_zero_le_bit
6303 -#define ext2_find_next_zero_bit find_next_zero_le_bit
6304 +#define ext2_set_bit(nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
6305 +#define ext2_clear_bit(nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
6306 +#define ext2_test_bit(nr,addr) test_le_bit((nr),(unsigned long *)(addr))
6307 +#define ext2_find_first_zero_bit(addr, size) \
6308 + find_first_zero_le_bit((unsigned long *)(addr), (size))
6309 +#define ext2_find_next_zero_bit(addr, size, off) \
6310 + find_next_zero_le_bit((unsigned long *)(addr), (size), (off))
6312 /* Bitmap functions for the minix filesystem. */
6313 -#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr)
6314 -#define minix_set_bit(nr,addr) set_bit(nr,addr)
6315 -#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr)
6316 -#define minix_test_bit(nr,addr) test_bit(nr,addr)
6317 -#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
6318 +#define minix_test_and_set_bit(nr,addr) test_and_set_bit((nr),(unsigned long *)(addr))
6319 +#define minix_set_bit(nr,addr) set_bit((nr),(unsigned long *)(addr))
6320 +#define minix_test_and_clear_bit(nr,addr) \
6321 + test_and_clear_bit((nr),(unsigned long *)(addr))
6322 +#define minix_test_bit(nr,addr) test_bit((nr),(unsigned long *)(addr))
6323 +#define minix_find_first_zero_bit(addr,size) \
6324 + find_first_zero_bit((unsigned long *)(addr),(size))
6326 #endif /* __KERNEL__ */
6328 diff -urN linux-2.4.24.org/include/asm-sparc64/smp.h linux-2.4.24/include/asm-sparc64/smp.h
6329 --- linux-2.4.24.org/include/asm-sparc64/smp.h 2004-02-04 20:48:01.767676594 +0100
6330 +++ linux-2.4.24/include/asm-sparc64/smp.h 2004-02-04 20:52:54.175856518 +0100
6335 -#define smp_processor_id() (current->processor)
6336 +#define smp_processor_id() (current->cpu)
6338 /* This needn't do anything as we do not sleep the cpu
6339 * inside of the idler task, so an interrupt is not needed
6340 diff -urN linux-2.4.24.org/include/asm-sparc64/system.h linux-2.4.24/include/asm-sparc64/system.h
6341 --- linux-2.4.24.org/include/asm-sparc64/system.h 2004-02-04 20:48:01.898649351 +0100
6342 +++ linux-2.4.24/include/asm-sparc64/system.h 2004-02-04 20:52:54.208849656 +0100
6343 @@ -154,7 +154,18 @@
6345 #define flush_user_windows flushw_user
6346 #define flush_register_windows flushw_all
6347 -#define prepare_to_switch flushw_all
6349 +#define prepare_arch_schedule(prev) task_lock(prev)
6350 +#define finish_arch_schedule(prev) task_unlock(prev)
6351 +#define prepare_arch_switch(rq, next) \
6352 +do { spin_lock(&(next)->switch_lock); \
6353 + spin_unlock(&(rq)->lock); \
6357 +#define finish_arch_switch(rq, prev) \
6358 +do { spin_unlock_irq(&(prev)->switch_lock); \
6361 #ifndef CONFIG_DEBUG_SPINLOCK
6362 #define CHECK_LOCKS(PREV) do { } while(0)
6363 diff -urN linux-2.4.24.org/include/linux/bitops.h linux-2.4.24/include/linux/bitops.h
6364 --- linux-2.4.24.org/include/linux/bitops.h 2004-02-04 20:47:38.725469391 +0100
6365 +++ linux-2.4.24/include/linux/bitops.h 2004-02-04 20:52:54.244842170 +0100
6367 #ifndef _LINUX_BITOPS_H
6368 #define _LINUX_BITOPS_H
6371 + * fls: find last bit set.
6374 +extern __inline__ int generic_fls(int x)
6380 + if (!(x & 0xffff0000u)) {
6384 + if (!(x & 0xff000000u)) {
6388 + if (!(x & 0xf0000000u)) {
6392 + if (!(x & 0xc0000000u)) {
6396 + if (!(x & 0x80000000u)) {
6404 * ffs: find first bit set. This is defined the same way as
6405 diff -urN linux-2.4.24.org/include/linux/kernel_stat.h linux-2.4.24/include/linux/kernel_stat.h
6406 --- linux-2.4.24.org/include/linux/kernel_stat.h 2004-02-04 20:47:34.063439098 +0100
6407 +++ linux-2.4.24/include/linux/kernel_stat.h 2004-02-04 20:52:54.297831148 +0100
6409 #elif !defined(CONFIG_ARCH_S390)
6410 unsigned int irqs[NR_CPUS][NR_IRQS];
6412 - unsigned int context_swtch;
6415 extern struct kernel_stat kstat;
6416 diff -urN linux-2.4.24.org/include/linux/sched.h linux-2.4.24/include/linux/sched.h
6417 --- linux-2.4.24.org/include/linux/sched.h 2004-02-04 20:47:32.755711107 +0100
6418 +++ linux-2.4.24/include/linux/sched.h 2004-02-04 20:52:54.755735907 +0100
6420 extern unsigned long event;
6422 #include <linux/config.h>
6423 +#include <linux/compiler.h>
6424 #include <linux/binfmts.h>
6425 #include <linux/threads.h>
6426 #include <linux/kernel.h>
6428 #include <asm/mmu.h>
6430 #include <linux/smp.h>
6431 -#include <linux/tty.h>
6432 +//#include <linux/tty.h>
6433 #include <linux/sem.h>
6434 #include <linux/signal.h>
6435 #include <linux/securebits.h>
6437 #define CT_TO_SECS(x) ((x) / HZ)
6438 #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
6440 -extern int nr_running, nr_threads;
6441 +extern int nr_threads;
6442 extern int last_pid;
6443 +extern unsigned long nr_running(void);
6444 +extern unsigned long nr_uninterruptible(void);
6446 -#include <linux/fs.h>
6447 +//#include <linux/fs.h>
6448 #include <linux/time.h>
6449 #include <linux/param.h>
6450 #include <linux/resource.h>
6451 @@ -109,12 +112,6 @@
6452 #define SCHED_FIFO 1
6456 - * This is an additional bit set when we want to
6457 - * yield the CPU for one re-schedule..
6459 -#define SCHED_YIELD 0x10
6461 struct sched_param {
6464 @@ -132,17 +129,21 @@
6467 extern rwlock_t tasklist_lock;
6468 -extern spinlock_t runqueue_lock;
6469 extern spinlock_t mmlist_lock;
6471 +typedef struct task_struct task_t;
6473 extern void sched_init(void);
6474 -extern void init_idle(void);
6475 +extern void init_idle(task_t *idle, int cpu);
6476 extern void show_state(void);
6477 extern void cpu_init (void);
6478 extern void trap_init(void);
6479 extern void update_process_times(int user);
6480 -extern void update_one_process(struct task_struct *p, unsigned long user,
6481 +extern void update_one_process(task_t *p, unsigned long user,
6482 unsigned long system, int cpu);
6483 +extern void scheduler_tick(int user_tick, int system);
6484 +extern void migration_init(void);
6485 +extern unsigned long cache_decay_ticks;
6487 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
6488 extern signed long FASTCALL(schedule_timeout(signed long timeout));
6489 @@ -152,6 +153,28 @@
6490 extern void flush_scheduled_tasks(void);
6491 extern int start_context_thread(void);
6492 extern int current_is_keventd(void);
6493 +extern void FASTCALL(sched_exit(task_t * p));
6494 +extern int FASTCALL(idle_cpu(int cpu));
6497 + * Priority of a process goes from 0..MAX_PRIO-1, valid RT
6498 + * priority is 0..MAX_RT_PRIO-1, and SCHED_OTHER tasks are
6499 + * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
6500 + * are inverted: lower p->prio value means higher priority.
6502 + * The MAX_RT_USER_PRIO value allows the actual maximum
6503 + * RT priority to be separate from the value exported to
6504 + * user-space. This allows kernel threads to set their
6505 + * priority to a value higher than any user task. Note:
6506 + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
6508 + * Both values are configurable at compile-time.
6511 +#define MAX_USER_RT_PRIO 100
6512 +#define MAX_RT_PRIO MAX_USER_RT_PRIO
6514 +#define MAX_PRIO (MAX_RT_PRIO + 40)
6517 extern void set_cpus_allowed(struct task_struct *p, unsigned long new_mask);
6519 extern struct user_struct root_user;
6520 #define INIT_USER (&root_user)
6522 +typedef struct prio_array prio_array_t;
6524 struct task_struct {
6526 * offsets of these are hardcoded elsewhere - touch with care
6527 @@ -297,35 +322,26 @@
6529 int lock_depth; /* Lock depth */
6532 - * offset 32 begins here on 32-bit platforms. We keep
6533 - * all fields in a single cacheline that are needed for
6534 - * the goodness() loop in schedule().
6538 - unsigned long policy;
6539 - struct mm_struct *mm;
6542 - * cpus_runnable is ~0 if the process is not running on any
6543 - * CPU. It's (1 << cpu) if it's running on a CPU. This mask
6544 - * is updated under the runqueue lock.
6546 - * To determine whether a process might run on a CPU, this
6547 - * mask is AND-ed with cpus_allowed.
6549 - unsigned long cpus_runnable, cpus_allowed;
6551 - * (only the 'next' pointer fits into the cacheline, but
6552 - * that's just fine.)
6553 + * offset 32 begins here on 32-bit platforms.
6556 + int prio, static_prio;
6557 struct list_head run_list;
6558 - unsigned long sleep_time;
6559 + prio_array_t *array;
6561 - struct task_struct *next_task, *prev_task;
6562 - struct mm_struct *active_mm;
6563 + unsigned long sleep_avg;
6564 + unsigned long sleep_timestamp;
6566 + unsigned long policy;
6567 + unsigned long cpus_allowed;
6568 + unsigned int time_slice, first_time_slice;
6570 + task_t *next_task, *prev_task;
6572 + struct mm_struct *mm, *active_mm;
6573 struct list_head local_pages;
6575 unsigned int allocation_order, nr_local_pages;
6578 @@ -348,12 +364,12 @@
6579 * older sibling, respectively. (p->father can be replaced with
6582 - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
6583 + task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
6584 struct list_head thread_group;
6586 /* PID hash table linkage. */
6587 - struct task_struct *pidhash_next;
6588 - struct task_struct **pidhash_pprev;
6589 + task_t *pidhash_next;
6590 + task_t **pidhash_pprev;
6592 wait_queue_head_t wait_chldexit; /* for wait4() */
6593 struct completion *vfork_done; /* for vfork() */
6596 /* Protection of (de-)allocation: mm, files, fs, tty */
6597 spinlock_t alloc_lock;
6598 +/* context-switch lock */
6599 + spinlock_t switch_lock;
6601 /* journalling filesystem info */
6603 @@ -454,9 +472,15 @@
6605 #define _STK_LIM (8*1024*1024)
6607 -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */
6608 -#define MAX_COUNTER (20*HZ/100)
6609 -#define DEF_NICE (0)
6611 +extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
6613 +#define set_cpus_allowed(p, new_mask) do { } while (0)
6616 +extern void set_user_nice(task_t *p, long nice);
6617 +extern int task_prio(task_t *p);
6618 +extern int task_nice(task_t *p);
6620 extern void yield(void);
6622 @@ -477,14 +501,14 @@
6623 addr_limit: KERNEL_DS, \
6624 exec_domain: &default_exec_domain, \
6626 - counter: DEF_COUNTER, \
6628 + prio: MAX_PRIO-20, \
6629 + static_prio: MAX_PRIO-20, \
6630 policy: SCHED_OTHER, \
6631 + cpus_allowed: ~0UL, \
6633 active_mm: &init_mm, \
6634 - cpus_runnable: ~0UL, \
6635 - cpus_allowed: ~0UL, \
6636 run_list: LIST_HEAD_INIT(tsk.run_list), \
6642 pending: { NULL, &tsk.pending.head, {{0}}}, \
6644 alloc_lock: SPIN_LOCK_UNLOCKED, \
6645 + switch_lock: SPIN_LOCK_UNLOCKED, \
6646 journal_info: NULL, \
6649 @@ -518,24 +543,23 @@
6653 - struct task_struct task;
6655 unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
6658 extern union task_union init_task_union;
6660 extern struct mm_struct init_mm;
6661 -extern struct task_struct *init_tasks[NR_CPUS];
6663 /* PID hashing. (shouldnt this be dynamic?) */
6664 #define PIDHASH_SZ (4096 >> 2)
6665 -extern struct task_struct *pidhash[PIDHASH_SZ];
6666 +extern task_t *pidhash[PIDHASH_SZ];
6668 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
6670 -static inline void hash_pid(struct task_struct *p)
6671 +static inline void hash_pid(task_t *p)
6673 - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
6674 + task_t **htable = &pidhash[pid_hashfn(p->pid)];
6676 if((p->pidhash_next = *htable) != NULL)
6677 (*htable)->pidhash_pprev = &p->pidhash_next;
6678 @@ -543,16 +567,16 @@
6679 p->pidhash_pprev = htable;
6682 -static inline void unhash_pid(struct task_struct *p)
6683 +static inline void unhash_pid(task_t *p)
6686 p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
6687 *p->pidhash_pprev = p->pidhash_next;
6690 -static inline struct task_struct *find_task_by_pid(int pid)
6691 +static inline task_t *find_task_by_pid(int pid)
6693 - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
6694 + task_t *p, **htable = &pidhash[pid_hashfn(pid)];
6696 for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
6698 @@ -560,19 +584,6 @@
6702 -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
6704 -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
6706 - tsk->processor = cpu;
6707 - tsk->cpus_runnable = 1UL << cpu;
6710 -static inline void task_release_cpu(struct task_struct *tsk)
6712 - tsk->cpus_runnable = ~0UL;
6715 /* per-UID process charging. */
6716 extern struct user_struct * alloc_uid(uid_t);
6717 extern void free_uid(struct user_struct *);
6718 @@ -600,47 +611,50 @@
6719 extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
6720 extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
6721 signed long timeout));
6722 -extern int FASTCALL(wake_up_process(struct task_struct * tsk));
6723 +extern int FASTCALL(wake_up_process(task_t * p));
6724 +extern void FASTCALL(wake_up_forked_process(task_t * p));
6726 #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
6727 #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
6728 #define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0)
6729 -#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
6730 -#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
6731 #define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1)
6732 #define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr)
6733 #define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0)
6734 -#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
6735 -#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr)
6737 +#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
6739 +#define wake_up_interruptible_sync(x) __wake_up((x),TASK_INTERRUPTIBLE, 1)
6742 asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
6744 extern int in_group_p(gid_t);
6745 extern int in_egroup_p(gid_t);
6747 extern void proc_caches_init(void);
6748 -extern void flush_signals(struct task_struct *);
6749 -extern void flush_signal_handlers(struct task_struct *);
6750 +extern void flush_signals(task_t *);
6751 +extern void flush_signal_handlers(task_t *);
6752 extern void sig_exit(int, int, struct siginfo *);
6753 extern int dequeue_signal(sigset_t *, siginfo_t *);
6754 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
6756 extern void unblock_all_signals(void);
6757 -extern int send_sig_info(int, struct siginfo *, struct task_struct *);
6758 -extern int force_sig_info(int, struct siginfo *, struct task_struct *);
6759 +extern int send_sig_info(int, struct siginfo *, task_t *);
6760 +extern int force_sig_info(int, struct siginfo *, task_t *);
6761 extern int kill_pg_info(int, struct siginfo *, pid_t);
6762 extern int kill_sl_info(int, struct siginfo *, pid_t);
6763 extern int kill_proc_info(int, struct siginfo *, pid_t);
6764 -extern void notify_parent(struct task_struct *, int);
6765 -extern void do_notify_parent(struct task_struct *, int);
6766 -extern void force_sig(int, struct task_struct *);
6767 -extern int send_sig(int, struct task_struct *, int);
6768 +extern void notify_parent(task_t *, int);
6769 +extern void do_notify_parent(task_t *, int);
6770 +extern void force_sig(int, task_t *);
6771 +extern int send_sig(int, task_t *, int);
6772 extern int kill_pg(pid_t, int, int);
6773 extern int kill_sl(pid_t, int, int);
6774 extern int kill_proc(pid_t, int, int);
6775 extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
6776 extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
6778 -static inline int signal_pending(struct task_struct *p)
6779 +static inline int signal_pending(task_t *p)
6781 return (p->sigpending != 0);
6784 This is required every time the blocked sigset_t changes.
6785 All callers should have t->sigmask_lock. */
6787 -static inline void recalc_sigpending(struct task_struct *t)
6788 +static inline void recalc_sigpending(task_t *t)
6790 t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
6792 @@ -786,16 +800,17 @@
6793 extern int expand_fdset(struct files_struct *, int nr);
6794 extern void free_fdset(fd_set *, int);
6796 -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
6797 +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *);
6798 extern void flush_thread(void);
6799 extern void exit_thread(void);
6801 -extern void exit_mm(struct task_struct *);
6802 -extern void exit_files(struct task_struct *);
6803 -extern void exit_sighand(struct task_struct *);
6804 +extern void exit_mm(task_t *);
6805 +extern void exit_files(task_t *);
6806 +extern void exit_sighand(task_t *);
6808 extern void reparent_to_init(void);
6809 extern void daemonize(void);
6810 +extern task_t *child_reaper;
6812 extern int do_execve(char *, char **, char **, struct pt_regs *);
6813 extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
6816 extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
6818 +extern void wait_task_inactive(task_t * p);
6819 +extern void kick_if_running(task_t * p);
6821 #define __wait_event(wq, condition) \
6823 wait_queue_t __wait; \
6824 @@ -887,27 +905,12 @@
6825 for (task = next_thread(current) ; task != current ; task = next_thread(task))
6827 #define next_thread(p) \
6828 - list_entry((p)->thread_group.next, struct task_struct, thread_group)
6829 + list_entry((p)->thread_group.next, task_t, thread_group)
6831 #define thread_group_leader(p) (p->pid == p->tgid)
6833 -static inline void del_from_runqueue(struct task_struct * p)
6834 +static inline void unhash_process(task_t *p)
6837 - p->sleep_time = jiffies;
6838 - list_del(&p->run_list);
6839 - p->run_list.next = NULL;
6842 -static inline int task_on_runqueue(struct task_struct *p)
6844 - return (p->run_list.next != NULL);
6847 -static inline void unhash_process(struct task_struct *p)
6849 - if (task_on_runqueue(p))
6850 - out_of_line_bug();
6851 write_lock_irq(&tasklist_lock);
6854 @@ -917,12 +920,12 @@
6857 /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
6858 -static inline void task_lock(struct task_struct *p)
6859 +static inline void task_lock(task_t *p)
6861 spin_lock(&p->alloc_lock);
6864 -static inline void task_unlock(struct task_struct *p)
6865 +static inline void task_unlock(task_t *p)
6867 spin_unlock(&p->alloc_lock);
6869 @@ -946,6 +949,26 @@
6873 +static inline void set_need_resched(void)
6875 + current->need_resched = 1;
6878 +static inline void clear_need_resched(void)
6880 + current->need_resched = 0;
6883 +static inline void set_tsk_need_resched(task_t *tsk)
6885 + tsk->need_resched = 1;
6888 +static inline void clear_tsk_need_resched(task_t *tsk)
6890 + tsk->need_resched = 0;
6893 static inline int need_resched(void)
6895 return (unlikely(current->need_resched));
6899 #endif /* __KERNEL__ */
6902 diff -urN linux-2.4.24.org/include/linux/smp_balance.h linux-2.4.24/include/linux/smp_balance.h
6903 --- linux-2.4.24.org/include/linux/smp_balance.h 1970-01-01 01:00:00.000000000 +0100
6904 +++ linux-2.4.24/include/linux/smp_balance.h 2004-02-04 20:52:54.758735283 +0100
6906 +#ifndef _LINUX_SMP_BALANCE_H
6907 +#define _LINUX_SMP_BALANCE_H
6910 + * per-architecture load balancing logic, e.g. for hyperthreading
6913 +#ifdef ARCH_HAS_SMP_BALANCE
6914 +#include <asm/smp_balance.h>
6916 +#define arch_load_balance(x, y) (0)
6917 +#define arch_reschedule_idle_override(x, idle) (idle)
6920 +#endif /* _LINUX_SMP_BALANCE_H */
6921 diff -urN linux-2.4.24.org/include/linux/smp.h linux-2.4.24/include/linux/smp.h
6922 --- linux-2.4.24.org/include/linux/smp.h 2004-02-04 20:47:38.184581896 +0100
6923 +++ linux-2.4.24/include/linux/smp.h 2004-02-04 20:52:54.806725301 +0100
6925 #define cpu_number_map(cpu) 0
6926 #define smp_call_function(func,info,retry,wait) ({ 0; })
6927 #define cpu_online_map 1
6928 +static inline void smp_send_reschedule(int cpu) { }
6929 +static inline void smp_send_reschedule_all(void) { }
6934 + * Common definitions:
6936 +#define cpu() smp_processor_id()
6939 diff -urN linux-2.4.24.org/include/linux/wait.h linux-2.4.24/include/linux/wait.h
6940 --- linux-2.4.24.org/include/linux/wait.h 2004-02-04 20:47:33.472562001 +0100
6941 +++ linux-2.4.24/include/linux/wait.h 2004-02-04 20:52:54.861713864 +0100
6943 # define wq_write_lock_irq write_lock_irq
6944 # define wq_write_lock_irqsave write_lock_irqsave
6945 # define wq_write_unlock_irqrestore write_unlock_irqrestore
6946 +# define wq_write_unlock_irq write_unlock_irq
6947 # define wq_write_unlock write_unlock
6949 # define wq_lock_t spinlock_t
6951 # define wq_write_lock_irq spin_lock_irq
6952 # define wq_write_lock_irqsave spin_lock_irqsave
6953 # define wq_write_unlock_irqrestore spin_unlock_irqrestore
6954 +# define wq_write_unlock_irq spin_unlock_irq
6955 # define wq_write_unlock spin_unlock
6958 diff -urN linux-2.4.24.org/init/main.c linux-2.4.24/init/main.c
6959 --- linux-2.4.24.org/init/main.c 2004-02-04 20:47:26.630985058 +0100
6960 +++ linux-2.4.24/init/main.c 2004-02-04 20:52:54.909703882 +0100
6962 extern void setup_arch(char **);
6963 extern void cpu_idle(void);
6965 -unsigned long wait_init_idle;
6969 #ifdef CONFIG_X86_LOCAL_APIC
6970 @@ -303,34 +301,24 @@
6971 APIC_init_uniprocessor();
6974 -#define smp_init() do { } while (0)
6975 +#define smp_init() do { } while (0)
6981 /* Called by boot processor to activate the rest. */
6982 static void __init smp_init(void)
6984 /* Get other processors into their bootup holding patterns. */
6986 - wait_init_idle = cpu_online_map;
6987 - clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
6989 smp_threads_ready=1;
6992 - /* Wait for the other cpus to set up their idle processes */
6993 - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
6994 - while (wait_init_idle) {
6998 - printk("All processors have done init_idle\n");
7005 * We need to finalize in a non-__init function or else race conditions
7006 * between the root thread and the init thread may cause start_kernel to
7009 kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
7011 - current->need_resched = 1;
7018 * Activate the first processor.
7021 printk("POSIX conformance testing by UNIFIX\n");
7023 + init_idle(current, smp_processor_id());
7025 * We count on the initial thread going ok
7026 * Like idlers init is an unlocked kernel thread, which will
7027 @@ -465,6 +453,10 @@
7029 static void __init do_basic_setup(void)
7031 + /* Start the per-CPU migration threads */
7037 * Tell the world that we're going to be the grim
7038 diff -urN linux-2.4.24.org/kernel/capability.c linux-2.4.24/kernel/capability.c
7039 --- linux-2.4.24.org/kernel/capability.c 2004-02-04 20:47:27.302845310 +0100
7040 +++ linux-2.4.24/kernel/capability.c 2004-02-04 20:52:54.945696396 +0100
7042 #include <linux/mm.h>
7043 #include <asm/uaccess.h>
7045 +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
7047 kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
7049 /* Note: never hold tasklist_lock while spinning for this one */
7050 diff -urN linux-2.4.24.org/kernel/exit.c linux-2.4.24/kernel/exit.c
7051 --- linux-2.4.24.org/kernel/exit.c 2004-02-04 20:47:27.240858204 +0100
7052 +++ linux-2.4.24/kernel/exit.c 2004-02-04 20:52:54.951695148 +0100
7055 static void release_task(struct task_struct * p)
7057 - if (p != current) {
7062 - * Wait to make sure the process isn't on the
7063 - * runqueue (active on some other CPU still)
7067 - if (!task_has_cpu(p))
7073 - } while (task_has_cpu(p));
7076 + wait_task_inactive(p);
7078 - atomic_dec(&p->user->processes);
7079 - free_uid(p->user);
7080 - unhash_process(p);
7082 - release_thread(p);
7083 - current->cmin_flt += p->min_flt + p->cmin_flt;
7084 - current->cmaj_flt += p->maj_flt + p->cmaj_flt;
7085 - current->cnswap += p->nswap + p->cnswap;
7087 - * Potentially available timeslices are retrieved
7088 - * here - this way the parent does not get penalized
7089 - * for creating too many processes.
7091 - * (this cannot be used to artificially 'generate'
7092 - * timeslices, because any timeslice recovered here
7093 - * was given away by the parent in the first place.)
7095 - current->counter += p->counter;
7096 - if (current->counter >= MAX_COUNTER)
7097 - current->counter = MAX_COUNTER;
7099 - free_task_struct(p);
7101 - printk("task releasing itself\n");
7103 + atomic_dec(&p->user->processes);
7104 + free_uid(p->user);
7105 + unhash_process(p);
7107 + release_thread(p);
7108 + current->cmin_flt += p->min_flt + p->cmin_flt;
7109 + current->cmaj_flt += p->maj_flt + p->cmaj_flt;
7110 + current->cnswap += p->nswap + p->cnswap;
7113 + free_task_struct(p);
7117 @@ -150,6 +123,79 @@
7122 + * reparent_to_init() - Reparent the calling kernel thread to the init task.
7124 + * If a kernel thread is launched as a result of a system call, or if
7125 + * it ever exits, it should generally reparent itself to init so that
7126 + * it is correctly cleaned up on exit.
7128 + * The various task state such as scheduling policy and priority may have
7129 + * been inherited from a user process, so we reset them to sane values here.
7131 + * NOTE that reparent_to_init() gives the caller full capabilities.
7133 +void reparent_to_init(void)
7135 + write_lock_irq(&tasklist_lock);
7137 + /* Reparent to init */
7138 + REMOVE_LINKS(current);
7139 + current->p_pptr = child_reaper;
7140 + current->p_opptr = child_reaper;
7141 + SET_LINKS(current);
7143 + /* Set the exit signal to SIGCHLD so we signal init on exit */
7144 + current->exit_signal = SIGCHLD;
7146 + current->ptrace = 0;
7147 + if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
7148 + set_user_nice(current, 0);
7149 + /* cpus_allowed? */
7150 + /* rt_priority? */
7152 + current->cap_effective = CAP_INIT_EFF_SET;
7153 + current->cap_inheritable = CAP_INIT_INH_SET;
7154 + current->cap_permitted = CAP_FULL_SET;
7155 + current->keep_capabilities = 0;
7156 + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
7157 + current->user = INIT_USER;
7159 + write_unlock_irq(&tasklist_lock);
7163 + * Put all the gunge required to become a kernel thread without
7164 + * attached user resources in one place where it belongs.
7167 +void daemonize(void)
7169 + struct fs_struct *fs;
7173 + * If we were started as result of loading a module, close all of the
7174 + * user space pages. We don't need them, and if we didn't close them
7175 + * they would be locked into memory.
7179 + current->session = 1;
7180 + current->pgrp = 1;
7181 + current->tty = NULL;
7183 + /* Become as one with the init task */
7185 + exit_fs(current); /* current->fs->count--; */
7186 + fs = init_task.fs;
7188 + atomic_inc(&fs->count);
7189 + exit_files(current);
7190 + current->files = init_task.files;
7191 + atomic_inc(&current->files->count);
7195 * When we die, we re-parent all our children.
7196 * Try to give them to another thread in our thread
7198 /* Make sure we're not reparenting to ourselves */
7199 p->p_opptr = child_reaper;
7201 + p->first_time_slice = 0;
7202 if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0);
7205 diff -urN linux-2.4.24.org/kernel/fork.c linux-2.4.24/kernel/fork.c
7206 --- linux-2.4.24.org/kernel/fork.c 2004-02-04 20:47:26.750960103 +0100
7207 +++ linux-2.4.24/kernel/fork.c 2004-02-04 20:52:54.987687662 +0100
7210 /* The idle threads do not count.. */
7215 unsigned long total_forks; /* Handle normal Linux uptimes. */
7218 struct task_struct *pidhash[PIDHASH_SZ];
7220 +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
7222 void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
7224 unsigned long flags;
7226 if (p->pid == 0 && current->pid != 0)
7227 goto bad_fork_cleanup;
7229 - p->run_list.next = NULL;
7230 - p->run_list.prev = NULL;
7233 init_waitqueue_head(&p->wait_chldexit);
7234 p->vfork_done = NULL;
7236 init_completion(&vfork);
7238 spin_lock_init(&p->alloc_lock);
7239 + spin_lock_init(&p->switch_lock);
7242 init_sigpending(&p->pending);
7243 @@ -727,11 +726,11 @@
7247 - p->cpus_runnable = ~0UL;
7248 - p->processor = current->processor;
7250 /* ?? should we just memset this ?? */
7251 for(i = 0; i < smp_num_cpus; i++)
7252 - p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
7253 + p->per_cpu_utime[cpu_logical_map(i)] =
7254 + p->per_cpu_stime[cpu_logical_map(i)] = 0;
7255 spin_lock_init(&p->sigmask_lock);
7258 @@ -769,15 +768,27 @@
7259 p->pdeath_signal = 0;
7262 - * "share" dynamic priority between parent and child, thus the
7263 - * total amount of dynamic priorities in the system doesn't change,
7264 - * more scheduling fairness. This is only important in the first
7265 - * timeslice, on the long run the scheduling behaviour is unchanged.
7267 - p->counter = (current->counter + 1) >> 1;
7268 - current->counter >>= 1;
7269 - if (!current->counter)
7270 - current->need_resched = 1;
7271 + * Share the timeslice between parent and child, thus the
7272 + * total amount of pending timeslices in the system doesn't change,
7273 + * resulting in more scheduling fairness.
7276 + if (!current->time_slice)
7278 + p->time_slice = (current->time_slice + 1) >> 1;
7279 + current->time_slice >>= 1;
7280 + p->first_time_slice = 1;
7281 + if (!current->time_slice) {
7283 + * This case is rare, it happens when the parent has only
7284 + * a single jiffy left from its timeslice. Taking the
7285 + * runqueue lock is not a problem.
7287 + current->time_slice = 1;
7288 + scheduler_tick(0,0);
7290 + p->sleep_timestamp = jiffies;
7294 * Ok, add it to the run-queues and make it
7295 @@ -813,11 +824,16 @@
7297 if (p->ptrace & PT_PTRACED)
7298 send_sig(SIGSTOP, p, 1);
7300 - wake_up_process(p); /* do this last */
7301 + wake_up_forked_process(p); /* do this last */
7303 if (clone_flags & CLONE_VFORK)
7304 wait_for_completion(&vfork);
7307 + * Let the child process run first, to avoid most of the
7308 + * COW overhead when the child exec()s afterwards.
7310 + current->need_resched = 1;
7314 diff -urN linux-2.4.24.org/kernel/ksyms.c linux-2.4.24/kernel/ksyms.c
7315 --- linux-2.4.24.org/kernel/ksyms.c 2004-02-04 20:47:26.747960727 +0100
7316 +++ linux-2.4.24/kernel/ksyms.c 2004-02-04 20:52:54.992686623 +0100
7318 /* process management */
7319 EXPORT_SYMBOL(complete_and_exit);
7320 EXPORT_SYMBOL(__wake_up);
7321 -EXPORT_SYMBOL(__wake_up_sync);
7322 EXPORT_SYMBOL(wake_up_process);
7323 EXPORT_SYMBOL(sleep_on);
7324 EXPORT_SYMBOL(sleep_on_timeout);
7327 EXPORT_SYMBOL(yield);
7328 EXPORT_SYMBOL(__cond_resched);
7329 +EXPORT_SYMBOL(set_user_nice);
7330 +EXPORT_SYMBOL(nr_context_switches);
7331 EXPORT_SYMBOL(jiffies);
7332 EXPORT_SYMBOL(xtime);
7333 EXPORT_SYMBOL(do_gettimeofday);
7337 EXPORT_SYMBOL(kstat);
7338 -EXPORT_SYMBOL(nr_running);
7341 EXPORT_SYMBOL(panic);
7342 diff -urN linux-2.4.24.org/kernel/printk.c linux-2.4.24/kernel/printk.c
7343 --- linux-2.4.24.org/kernel/printk.c 2004-02-04 20:47:26.744961351 +0100
7344 +++ linux-2.4.24/kernel/printk.c 2004-02-04 20:52:55.015681840 +0100
7346 #include <linux/module.h>
7347 #include <linux/interrupt.h> /* For in_interrupt() */
7348 #include <linux/config.h>
7349 +#include <linux/delay.h>
7351 #include <asm/uaccess.h>
7353 diff -urN linux-2.4.24.org/kernel/ptrace.c linux-2.4.24/kernel/ptrace.c
7354 --- linux-2.4.24.org/kernel/ptrace.c 2004-02-04 20:47:26.776954696 +0100
7355 +++ linux-2.4.24/kernel/ptrace.c 2004-02-04 20:52:55.029678928 +0100
7357 if (child->state != TASK_STOPPED)
7360 - /* Make sure the child gets off its CPU.. */
7363 - if (!task_has_cpu(child))
7365 - task_unlock(child);
7367 - if (child->state != TASK_STOPPED)
7371 - } while (task_has_cpu(child));
7373 - task_unlock(child);
7374 + wait_task_inactive(child);
7378 diff -urN linux-2.4.24.org/kernel/sched.c linux-2.4.24/kernel/sched.c
7379 --- linux-2.4.24.org/kernel/sched.c 2004-02-04 20:47:26.741961975 +0100
7380 +++ linux-2.4.24/kernel/sched.c 2004-02-04 20:52:55.076669155 +0100
7383 * Kernel scheduler and related syscalls
7385 - * Copyright (C) 1991, 1992 Linus Torvalds
7386 + * Copyright (C) 1991-2002 Linus Torvalds
7388 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
7389 * make semaphores SMP safe
7390 * 1998-11-19 Implemented schedule_timeout() and related stuff
7391 * by Andrea Arcangeli
7392 - * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
7393 + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
7394 + * hybrid priority-list and round-robin design with
7395 + * an array-switch method of distributing timeslices
7396 + * and per-CPU runqueues. Additional code by Davide
7397 + * Libenzi, Robert Love, and Rusty Russell.
7401 - * 'sched.c' is the main kernel file. It contains scheduling primitives
7402 - * (sleep_on, wakeup, schedule etc) as well as a number of simple system
7403 - * call functions (type getpid()), which just extract a field from
7407 -#include <linux/config.h>
7408 #include <linux/mm.h>
7409 -#include <linux/init.h>
7410 -#include <linux/smp_lock.h>
7411 #include <linux/nmi.h>
7412 #include <linux/interrupt.h>
7413 -#include <linux/kernel_stat.h>
7414 -#include <linux/completion.h>
7415 -#include <linux/prefetch.h>
7416 -#include <linux/compiler.h>
7418 +#include <linux/init.h>
7419 #include <asm/uaccess.h>
7420 +#include <linux/smp_lock.h>
7421 #include <asm/mmu_context.h>
7423 -extern void timer_bh(void);
7424 -extern void tqueue_bh(void);
7425 -extern void immediate_bh(void);
7426 +#include <linux/kernel_stat.h>
7427 +#include <linux/completion.h>
7430 - * scheduler variables
7432 + * Convert user-nice values [ -20 ... 0 ... 19 ]
7433 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
7436 +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
7437 +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
7438 +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
7440 -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
7442 -extern void mem_use(void);
7444 + * 'User priority' is the nice value converted to something we
7445 + * can work with better when scaling various scheduler parameters,
7446 + * it's a [ 0 ... 39 ] range.
7448 +#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
7449 +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
7450 +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
7453 - * Scheduling quanta.
7454 + * These are the 'tuning knobs' of the scheduler:
7456 - * NOTE! The unix "nice" value influences how long a process
7457 - * gets. The nice value ranges from -20 to +19, where a -20
7458 - * is a "high-priority" task, and a "+10" is a low-priority
7461 - * We want the time-slice to be around 50ms or so, so this
7462 - * calculation depends on the value of HZ.
7465 -#define TICK_SCALE(x) ((x) >> 2)
7467 -#define TICK_SCALE(x) ((x) >> 1)
7469 -#define TICK_SCALE(x) (x)
7471 -#define TICK_SCALE(x) ((x) << 1)
7473 -#define TICK_SCALE(x) ((x) << 2)
7476 -#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
7478 + * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
7479 + * maximum timeslice is 300 msecs. Timeslices get refilled after
7482 +#define MIN_TIMESLICE ( 10 * HZ / 1000)
7483 +#define MAX_TIMESLICE (300 * HZ / 1000)
7484 +#define CHILD_PENALTY 50
7485 +#define PARENT_PENALTY 100
7486 +#define PRIO_BONUS_RATIO 25
7487 +#define INTERACTIVE_DELTA 2
7488 +#define MAX_SLEEP_AVG (2*HZ)
7489 +#define STARVATION_LIMIT (2*HZ)
7492 - * Init task must be ok at boot for the ix86 as we will check its signals
7493 - * via the SMP irq return path.
7496 -struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
7497 + * If a task is 'interactive' then we reinsert it in the active
7498 + * array after it has expired its current timeslice. (it will not
7499 + * continue to run immediately, it will still roundrobin with
7500 + * other interactive tasks.)
7502 + * This part scales the interactivity limit depending on niceness.
7504 + * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
7505 + * Here are a few examples of different nice levels:
7507 + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
7508 + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
7509 + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
7510 + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
7511 + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
7513 + * (the X axis represents the possible -5 ... 0 ... +5 dynamic
7514 + * priority range a task can explore, a value of '1' means the
7515 + * task is rated interactive.)
7517 + * Ie. nice +19 tasks can never get 'interactive' enough to be
7518 + * reinserted into the active array. And only heavily CPU-hog nice -20
7519 + * tasks will be expired. Default nice 0 tasks are somewhere between,
7520 + * it takes some effort for them to get interactive, but it's not
7524 +#define SCALE(v1,v1_max,v2_max) \
7525 + (v1) * (v2_max) / (v1_max)
7528 + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
7529 + INTERACTIVE_DELTA)
7531 +#define TASK_INTERACTIVE(p) \
7532 + ((p)->prio <= (p)->static_prio - DELTA(p))
7535 - * The tasklist_lock protects the linked list of processes.
7537 - * The runqueue_lock locks the parts that actually access
7538 - * and change the run-queues, and have to be interrupt-safe.
7540 - * If both locks are to be concurrently held, the runqueue_lock
7541 - * nests inside the tasklist_lock.
7542 + * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
7543 + * to time slice values.
7545 - * task->alloc_lock nests inside tasklist_lock.
7546 + * The higher a process's priority, the bigger timeslices
7547 + * it gets during one round of execution. But even the lowest
7548 + * priority process gets MIN_TIMESLICE worth of execution time.
7550 -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
7551 -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
7553 -static LIST_HEAD(runqueue_head);
7554 +#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
7555 + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
7558 - * We align per-CPU scheduling data on cacheline boundaries,
7559 - * to prevent cacheline ping-pong.
7560 + * These are the runqueue data structures:
7563 - struct schedule_data {
7564 - struct task_struct * curr;
7565 - cycles_t last_schedule;
7567 - char __pad [SMP_CACHE_BYTES];
7568 -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
7570 -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
7571 -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
7572 +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
7574 -struct kernel_stat kstat;
7575 -extern struct task_struct *child_reaper;
7576 +typedef struct runqueue runqueue_t;
7579 +struct prio_array {
7581 + unsigned long bitmap[BITMAP_SIZE];
7582 + struct list_head queue[MAX_PRIO];
7585 -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
7586 -#define can_schedule(p,cpu) \
7587 - ((p)->cpus_runnable & (p)->cpus_allowed & (1UL << cpu))
7589 + * This is the main, per-CPU runqueue data structure.
7591 + * Locking rule: those places that want to lock multiple runqueues
7592 + * (such as the load balancing or the process migration code), lock
7593 + * acquire operations must be ordered by ascending &runqueue.
7597 + unsigned long nr_running, nr_switches, expired_timestamp;
7598 + task_t *curr, *idle;
7599 + prio_array_t *active, *expired, arrays[2];
7600 + long nr_uninterruptible;
7603 + int prev_nr_running[NR_CPUS];
7604 + task_t *migration_thread;
7605 + struct list_head migration_queue;
7607 +} ____cacheline_aligned;
7610 +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
7612 -#define idle_task(cpu) (&init_task)
7613 -#define can_schedule(p,cpu) (1)
7614 +#define cpu_rq(cpu) (runqueues + (cpu))
7615 +#define this_rq() cpu_rq(smp_processor_id())
7616 +#define task_rq(p) cpu_rq((p)->cpu)
7617 +#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
7618 +#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
7621 + * Default context-switch locking:
7623 +#ifndef prepare_arch_switch
7624 +# define prepare_arch_switch(rq, next) do { } while(0)
7625 +# define finish_arch_switch(rq, prev) spin_unlock_irq(&(rq)->lock)
7628 -void scheduling_functions_start_here(void) { }
7631 - * This is the function that decides how desirable a process is..
7632 - * You can weigh different processes against each other depending
7633 - * on what CPU they've run on lately etc to try to handle cache
7634 - * and TLB miss penalties.
7637 - * -1000: never select this
7638 - * 0: out of time, recalculate counters (but it might still be
7640 - * +ve: "goodness" value (the larger, the better)
7641 - * +1000: realtime process, select this.
7642 + * task_rq_lock - lock the runqueue a given task resides on and disable
7643 + * interrupts. Note the ordering: we can safely lookup the task_rq without
7644 + * explicitly disabling preemption.
7647 -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
7648 +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
7653 - * select the current process after every other
7654 - * runnable process, but before the idle thread.
7655 - * Also, dont trigger a counter recalculation.
7658 - if (p->policy & SCHED_YIELD)
7660 + struct runqueue *rq;
7663 - * Non-RT process - normal case first.
7665 - if (p->policy == SCHED_OTHER) {
7667 - * Give the process a first-approximation goodness value
7668 - * according to the number of clock-ticks it has left.
7670 - * Don't do any other calculations if the time slice is
7673 - weight = p->counter;
7678 - /* Give a largish advantage to the same processor... */
7679 - /* (this is equivalent to penalizing other processors) */
7680 - if (p->processor == this_cpu)
7681 - weight += PROC_CHANGE_PENALTY;
7684 - /* .. and a slight advantage to the current MM */
7685 - if (p->mm == this_mm || !p->mm)
7687 - weight += 20 - p->nice;
7691 + spin_lock_irqsave(&rq->lock, *flags);
7692 + if (unlikely(rq != task_rq(p))) {
7693 + spin_unlock_irqrestore(&rq->lock, *flags);
7694 + goto repeat_lock_task;
7700 - * Realtime process, select the first one on the
7701 - * runqueue (taking priorities within processes
7704 - weight = 1000 + p->rt_priority;
7707 +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
7709 + spin_unlock_irqrestore(&rq->lock, *flags);
7713 - * the 'goodness value' of replacing a process on a given CPU.
7714 - * positive value means 'replace', zero or negative means 'dont'.
7715 + * Adding/removing a task to/from a priority array:
7717 -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
7718 +static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
7720 - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
7721 + array->nr_active--;
7722 + list_del(&p->run_list);
7723 + if (list_empty(array->queue + p->prio))
7724 + __clear_bit(p->prio, array->bitmap);
7728 - * This is ugly, but reschedule_idle() is very timing-critical.
7729 - * We are called with the runqueue spinlock held and we must
7730 - * not claim the tasklist_lock.
7732 -static FASTCALL(void reschedule_idle(struct task_struct * p));
7733 +#define enqueue_task(p, array) __enqueue_task(p, array, NULL)
7734 +static inline void __enqueue_task(struct task_struct *p, prio_array_t *array, task_t * parent)
7737 + list_add_tail(&p->run_list, array->queue + p->prio);
7738 + __set_bit(p->prio, array->bitmap);
7741 + list_add_tail(&p->run_list, &parent->run_list);
7742 + array = p->array = parent->array;
7744 + array->nr_active++;
7747 -static void fastcall reschedule_idle(struct task_struct * p)
7748 +static inline int effective_prio(task_t *p)
7751 - int this_cpu = smp_processor_id();
7752 - struct task_struct *tsk, *target_tsk;
7753 - int cpu, best_cpu, i, max_prio;
7754 - cycles_t oldest_idle;
7758 - * shortcut if the woken up task's last CPU is
7760 + * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
7761 + * into the -5 ... 0 ... +5 bonus/penalty range.
7763 + * We use 25% of the full 0...39 priority range so that:
7765 + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
7766 + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
7768 + * Both properties are important to certain workloads.
7770 - best_cpu = p->processor;
7771 - if (can_schedule(p, best_cpu)) {
7772 - tsk = idle_task(best_cpu);
7773 - if (cpu_curr(best_cpu) == tsk) {
7777 - * If need_resched == -1 then we can skip sending
7778 - * the IPI altogether, tsk->need_resched is
7779 - * actively watched by the idle thread.
7781 - need_resched = tsk->need_resched;
7782 - tsk->need_resched = 1;
7783 - if ((best_cpu != this_cpu) && !need_resched)
7784 - smp_send_reschedule(best_cpu);
7788 + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
7789 + MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
7792 - * We know that the preferred CPU has a cache-affine current
7793 - * process, lets try to find a new idle CPU for the woken-up
7794 - * process. Select the least recently active idle CPU. (that
7795 - * one will have the least active cache context.) Also find
7796 - * the executing process which has the least priority.
7798 - oldest_idle = (cycles_t) -1;
7799 - target_tsk = NULL;
7801 + prio = p->static_prio - bonus;
7802 + if (prio < MAX_RT_PRIO)
7803 + prio = MAX_RT_PRIO;
7804 + if (prio > MAX_PRIO-1)
7805 + prio = MAX_PRIO-1;
7809 - for (i = 0; i < smp_num_cpus; i++) {
7810 - cpu = cpu_logical_map(i);
7811 - if (!can_schedule(p, cpu))
7813 - tsk = cpu_curr(cpu);
7814 +#define activate_task(p, rq) __activate_task(p, rq, NULL)
7815 +static inline void __activate_task(task_t *p, runqueue_t *rq, task_t * parent)
7817 + unsigned long sleep_time = jiffies - p->sleep_timestamp;
7818 + prio_array_t *array = rq->active;
7820 + if (!parent && !rt_task(p) && sleep_time) {
7822 - * We use the first available idle CPU. This creates
7823 - * a priority list between idle CPUs, but this is not
7825 + * This code gives a bonus to interactive tasks. We update
7826 + * an 'average sleep time' value here, based on
7827 + * sleep_timestamp. The more time a task spends sleeping,
7828 + * the higher the average gets - and the higher the priority
7829 + * boost gets as well.
7831 - if (tsk == idle_task(cpu)) {
7832 -#if defined(__i386__) && defined(CONFIG_SMP)
7834 - * Check if two siblings are idle in the same
7835 - * physical package. Use them if found.
7837 - if (smp_num_siblings == 2) {
7838 - if (cpu_curr(cpu_sibling_map[cpu]) ==
7839 - idle_task(cpu_sibling_map[cpu])) {
7840 - oldest_idle = last_schedule(cpu);
7847 - if (last_schedule(cpu) < oldest_idle) {
7848 - oldest_idle = last_schedule(cpu);
7852 - if (oldest_idle == (cycles_t)-1) {
7853 - int prio = preemption_goodness(tsk, p, cpu);
7855 - if (prio > max_prio) {
7864 - if (oldest_idle != (cycles_t)-1) {
7865 - best_cpu = tsk->processor;
7866 - goto send_now_idle;
7868 - tsk->need_resched = 1;
7869 - if (tsk->processor != this_cpu)
7870 - smp_send_reschedule(tsk->processor);
7871 + p->sleep_timestamp = jiffies;
7872 + p->sleep_avg += sleep_time;
7873 + if (p->sleep_avg > MAX_SLEEP_AVG)
7874 + p->sleep_avg = MAX_SLEEP_AVG;
7875 + p->prio = effective_prio(p);
7879 + __enqueue_task(p, array, parent);
7884 - int this_cpu = smp_processor_id();
7885 - struct task_struct *tsk;
7886 +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
7889 + if (p->state == TASK_UNINTERRUPTIBLE)
7890 + rq->nr_uninterruptible++;
7891 + dequeue_task(p, p->array);
7895 +static inline void resched_task(task_t *p)
7900 - tsk = cpu_curr(this_cpu);
7901 - if (preemption_goodness(tsk, p, this_cpu) > 0)
7902 - tsk->need_resched = 1;
7903 + need_resched = p->need_resched;
7904 + set_tsk_need_resched(p);
7905 + if (!need_resched && (p->cpu != smp_processor_id()))
7906 + smp_send_reschedule(p->cpu);
7908 + set_tsk_need_resched(p);
7917 - * This has to add the process to the _end_ of the
7918 - * run-queue, not the beginning. The goodness value will
7919 - * determine whether this process will run next. This is
7920 - * important to get SCHED_FIFO and SCHED_RR right, where
7921 - * a process that is either pre-empted or its time slice
7922 - * has expired, should be moved to the tail of the run
7923 - * queue for its priority - Bhavesh Davda
7924 + * Wait for a process to unschedule. This is used by the exit() and
7927 -static inline void add_to_runqueue(struct task_struct * p)
7928 +void wait_task_inactive(task_t * p)
7930 - list_add_tail(&p->run_list, &runqueue_head);
7932 + unsigned long flags;
7937 + if (unlikely(rq->curr == p)) {
7942 + rq = task_rq_lock(p, &flags);
7943 + if (unlikely(rq->curr == p)) {
7944 + task_rq_unlock(rq, &flags);
7947 + task_rq_unlock(rq, &flags);
7950 -static inline void move_last_runqueue(struct task_struct * p)
7952 + * Kick the remote CPU if the task is running currently,
7953 + * this code is used by the signal code to signal tasks
7954 + * which are in user-mode as quickly as possible.
7956 + * (Note that we do this lockless - if the task does anything
7957 + * while the message is in flight then it will notice the
7958 + * sigpending condition anyway.)
7960 +void kick_if_running(task_t * p)
7962 - list_del(&p->run_list);
7963 - list_add_tail(&p->run_list, &runqueue_head);
7964 + if (p == task_rq(p)->curr && p->cpu != smp_processor_id())
7970 +static int FASTCALL(reschedule_idle(task_t * p));
7971 +static void FASTCALL(load_balance(runqueue_t *this_rq, int idle));
7976 * Wake up a process. Put it on the run-queue if it's not
7977 @@ -345,429 +338,721 @@
7978 * progress), and as such you're allowed to do the simpler
7979 * "current->state = TASK_RUNNING" to mark yourself runnable
7980 * without the overhead of this.
7982 + * returns failure only if the task is already active.
7984 -static inline int try_to_wake_up(struct task_struct * p, int synchronous)
7985 +static int try_to_wake_up(task_t * p, int sync)
7987 unsigned long flags;
7992 + int migrated_to_idle = 0;
7998 + rq = task_rq_lock(p, &flags);
7999 + old_state = p->state;
8002 + if (likely(rq->curr != p)) {
8004 + if (unlikely(sync)) {
8005 + if (p->cpu != smp_processor_id() &&
8006 + p->cpus_allowed & (1UL << smp_processor_id())) {
8007 + p->cpu = smp_processor_id();
8008 + goto migrated_task;
8011 + if (reschedule_idle(p))
8012 + goto migrated_task;
8016 + if (old_state == TASK_UNINTERRUPTIBLE)
8017 + rq->nr_uninterruptible--;
8018 + activate_task(p, rq);
8019 + if (p->prio < rq->curr->prio)
8020 + resched_task(rq->curr);
8023 + p->state = TASK_RUNNING;
8027 - * We want the common case fall through straight, thus the goto.
8028 + * Subtle: we can load_balance only here (before unlock)
8029 + * because it can internally drop the lock. Claim
8030 + * that the cpu is running so it will be a light rebalance,
8031 + * if this cpu will go idle soon schedule() will trigger the
8032 + * idle rescheduling balancing by itself.
8034 - spin_lock_irqsave(&runqueue_lock, flags);
8035 - p->state = TASK_RUNNING;
8036 - if (task_on_runqueue(p))
8038 - add_to_runqueue(p);
8039 - if (!synchronous || !(p->cpus_allowed & (1UL << smp_processor_id())))
8040 - reschedule_idle(p);
8043 - spin_unlock_irqrestore(&runqueue_lock, flags);
8044 + if (success && migrated_to_idle)
8045 + load_balance(rq, 0);
8048 + task_rq_unlock(rq, &flags);
8054 + task_rq_unlock(rq, &flags);
8055 + migrated_to_idle = 1;
8056 + goto repeat_lock_task;
8060 -inline int fastcall wake_up_process(struct task_struct * p)
8061 +int wake_up_process(task_t * p)
8063 return try_to_wake_up(p, 0);
8066 -static void process_timeout(unsigned long __data)
8067 +void wake_up_forked_process(task_t * p)
8069 - struct task_struct * p = (struct task_struct *) __data;
8071 + task_t * parent = current;
8073 - wake_up_process(p);
8076 + spin_lock_irq(&rq->lock);
8079 - * schedule_timeout - sleep until timeout
8080 - * @timeout: timeout value in jiffies
8082 - * Make the current task sleep until @timeout jiffies have
8083 - * elapsed. The routine will return immediately unless
8084 - * the current task state has been set (see set_current_state()).
8086 - * You can set the task state as follows -
8088 - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
8089 - * pass before the routine returns. The routine will return 0
8091 - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
8092 - * delivered to the current task. In this case the remaining time
8093 - * in jiffies will be returned, or 0 if the timer expired in time
8095 - * The current task state is guaranteed to be TASK_RUNNING when this
8096 - * routine returns.
8098 - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
8099 - * the CPU away without a bound on the timeout. In this case the return
8100 - * value will be %MAX_SCHEDULE_TIMEOUT.
8102 - * In all cases the return value is guaranteed to be non-negative.
8104 -signed long fastcall schedule_timeout(signed long timeout)
8106 - struct timer_list timer;
8107 - unsigned long expire;
8108 + p->state = TASK_RUNNING;
8109 + if (likely(!rt_task(p) && parent->array)) {
8111 + * We decrease the sleep average of forked
8112 + * children, to keep max-interactive tasks
8113 + * from forking tasks that are max-interactive.
8114 + * CHILD_PENALTY is set to 50% since we have
8115 + * no clue if this is still an interactive
8116 + * task like the parent or if this will be a
8117 + * cpu bound task. The parent isn't touched
8118 + * as we don't make assumption about the parent
8119 + * changing behaviour after the child is forked.
8121 + parent->sleep_avg = parent->sleep_avg * PARENT_PENALTY / 100;
8122 + p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
8126 - case MAX_SCHEDULE_TIMEOUT:
8128 - * These two special cases are useful to be comfortable
8129 - * in the caller. Nothing more. We could take
8130 - * MAX_SCHEDULE_TIMEOUT from one of the negative value
8131 - * but I' d like to return a valid offset (>=0) to allow
8132 - * the caller to do everything it want with the retval.
8133 + * For its first schedule keep the child at the same
8134 + * priority (i.e. in the same list) of the parent,
8135 + * activate_forked_task() will take care to put the
8136 + * child in front of the parent (lifo) to guarantee a
8137 + * schedule-child-first behaviour after fork.
8142 + p->prio = parent->prio;
8145 - * Another bit of PARANOID. Note that the retval will be
8146 - * 0 since no piece of kernel is supposed to do a check
8147 - * for a negative retval of schedule_timeout() (since it
8148 - * should never happens anyway). You just have the printk()
8149 - * that will tell you if something is gone wrong and where.
8150 + * Take the usual wakeup path if it's RT or if
8151 + * it's a child of the first idle task (during boot
8156 - printk(KERN_ERR "schedule_timeout: wrong timeout "
8157 - "value %lx from %p\n", timeout,
8158 - __builtin_return_address(0));
8159 - current->state = TASK_RUNNING;
8162 + p->prio = effective_prio(p);
8166 - expire = timeout + jiffies;
8167 + p->cpu = smp_processor_id();
8168 + __activate_task(p, rq, parent);
8169 + spin_unlock_irq(&rq->lock);
8172 - init_timer(&timer);
8173 - timer.expires = expire;
8174 - timer.data = (unsigned long) current;
8175 - timer.function = process_timeout;
8177 + * Potentially available exiting-child timeslices are
8178 + * retrieved here - this way the parent does not get
8179 + * penalized for creating too many processes.
8181 + * (this cannot be used to 'generate' timeslices
8182 + * artificially, because any timeslice recovered here
8183 + * was given away by the parent in the first place.)
8185 +void sched_exit(task_t * p)
8188 + if (p->first_time_slice) {
8189 + current->time_slice += p->time_slice;
8190 + if (unlikely(current->time_slice > MAX_TIMESLICE))
8191 + current->time_slice = MAX_TIMESLICE;
8196 - add_timer(&timer);
8198 - del_timer_sync(&timer);
8200 +asmlinkage void schedule_tail(task_t *prev)
8202 + finish_arch_switch(this_rq(), prev);
8206 +static inline task_t * context_switch(task_t *prev, task_t *next)
8208 + struct mm_struct *mm = next->mm;
8209 + struct mm_struct *oldmm = prev->active_mm;
8211 - timeout = expire - jiffies;
8212 + if (unlikely(!mm)) {
8213 + next->active_mm = oldmm;
8214 + atomic_inc(&oldmm->mm_count);
8215 + enter_lazy_tlb(oldmm, next, smp_processor_id());
8217 + switch_mm(oldmm, mm, next, smp_processor_id());
8220 - return timeout < 0 ? 0 : timeout;
8221 + if (unlikely(!prev->mm)) {
8222 + prev->active_mm = NULL;
8226 + /* Here we just switch the register state and the stack. */
8227 + switch_to(prev, next, prev);
8233 - * schedule_tail() is getting called from the fork return path. This
8234 - * cleans up all remaining scheduler things, without impacting the
8237 -static inline void __schedule_tail(struct task_struct *prev)
8238 +unsigned long nr_running(void)
8242 + unsigned long i, sum = 0;
8245 - * prev->policy can be written from here only before `prev'
8246 - * can be scheduled (before setting prev->cpus_runnable to ~0UL).
8247 - * Of course it must also be read before allowing prev
8248 - * to be rescheduled, but since the write depends on the read
8249 - * to complete, wmb() is enough. (the spin_lock() acquired
8250 - * before setting cpus_runnable is not enough because the spin_lock()
8251 - * common code semantics allows code outside the critical section
8252 - * to enter inside the critical section)
8254 - policy = prev->policy;
8255 - prev->policy = policy & ~SCHED_YIELD;
8257 + for (i = 0; i < smp_num_cpus; i++)
8258 + sum += cpu_rq(cpu_logical_map(i))->nr_running;
8261 - * fast path falls through. We have to clear cpus_runnable before
8262 - * checking prev->state to avoid a wakeup race. Protect against
8263 - * the task exiting early.
8266 - task_release_cpu(prev);
8268 - if (prev->state == TASK_RUNNING)
8269 - goto needs_resched;
8274 - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
8276 +/* Note: the per-cpu information is useful only to get the cumulative result */
8277 +unsigned long nr_uninterruptible(void)
8279 + unsigned long i, sum = 0;
8282 - * Slow path - we 'push' the previous process and
8283 - * reschedule_idle() will attempt to find a new
8284 - * processor for it. (but it might preempt the
8285 - * current process as well.) We must take the runqueue
8286 - * lock and re-check prev->state to be correct. It might
8287 - * still happen that this process has a preemption
8288 - * 'in progress' already - but this is not a problem and
8289 - * might happen in other circumstances as well.
8293 - unsigned long flags;
8294 + for (i = 0; i < smp_num_cpus; i++)
8295 + sum += cpu_rq(cpu_logical_map(i))->nr_uninterruptible;
8298 - * Avoid taking the runqueue lock in cases where
8299 - * no preemption-check is necessery:
8301 - if ((prev == idle_task(smp_processor_id())) ||
8302 - (policy & SCHED_YIELD))
8307 - spin_lock_irqsave(&runqueue_lock, flags);
8308 - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
8309 - reschedule_idle(prev);
8310 - spin_unlock_irqrestore(&runqueue_lock, flags);
8314 - prev->policy &= ~SCHED_YIELD;
8315 -#endif /* CONFIG_SMP */
8316 +unsigned long nr_context_switches(void)
8318 + unsigned long i, sum = 0;
8320 + for (i = 0; i < smp_num_cpus; i++)
8321 + sum += cpu_rq(cpu_logical_map(i))->nr_switches;
8326 -asmlinkage void schedule_tail(struct task_struct *prev)
8327 +inline int idle_cpu(int cpu)
8329 - __schedule_tail(prev);
8330 + return cpu_curr(cpu) == cpu_rq(cpu)->idle;
8335 - * 'schedule()' is the scheduler function. It's a very simple and nice
8336 - * scheduler: it's not perfect, but certainly works for most things.
8338 - * The goto is "interesting".
8340 - * NOTE!! Task 0 is the 'idle' task, which gets called when no other
8341 - * tasks can run. It can not be killed, and it cannot sleep. The 'state'
8342 - * information in task[0] is never used.
8343 + * Lock the busiest runqueue as well, this_rq is locked already.
8344 + * Recalculate nr_running if we have to drop the runqueue lock.
8346 -asmlinkage void schedule(void)
8347 +static inline unsigned int double_lock_balance(runqueue_t *this_rq,
8348 + runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
8350 - struct schedule_data * sched_data;
8351 - struct task_struct *prev, *next, *p;
8352 - struct list_head *tmp;
8354 + if (unlikely(!spin_trylock(&busiest->lock))) {
8355 + if (busiest < this_rq) {
8356 + spin_unlock(&this_rq->lock);
8357 + spin_lock(&busiest->lock);
8358 + spin_lock(&this_rq->lock);
8359 + /* Need to recalculate nr_running */
8360 + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
8361 + nr_running = this_rq->nr_running;
8363 + nr_running = this_rq->prev_nr_running[this_cpu];
8365 + spin_lock(&busiest->lock);
8367 + return nr_running;
8371 + * Move a task from a remote runqueue to the local runqueue.
8372 + * Both runqueues must be locked.
8374 +static inline int pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
8378 - spin_lock_prefetch(&runqueue_lock);
8379 + dequeue_task(p, src_array);
8380 + src_rq->nr_running--;
8381 + p->cpu = this_cpu;
8382 + this_rq->nr_running++;
8383 + enqueue_task(p, this_rq->active);
8385 + * Note that idle threads have a prio of MAX_PRIO, for this test
8386 + * to be always true for them.
8388 + if (p->prio < this_rq->curr->prio)
8391 - BUG_ON(!current->active_mm);
8394 - this_cpu = prev->processor;
8398 - if (unlikely(in_interrupt())) {
8399 - printk("Scheduling in interrupt\n");
8401 +static inline int idle_cpu_reschedule(task_t * p, int cpu)
8403 + if (unlikely(!(p->cpus_allowed & (1UL << cpu))))
8405 + return idle_cpu(cpu);
8408 +#include <linux/smp_balance.h>
8410 +static int reschedule_idle(task_t * p)
8412 + int p_cpu = p->cpu, i;
8414 + if (idle_cpu(p_cpu))
8417 + p_cpu = cpu_number_map(p_cpu);
8419 + for (i = (p_cpu + 1) % smp_num_cpus;
8421 + i = (i + 1) % smp_num_cpus) {
8422 + int physical = cpu_logical_map(i);
8424 + if (idle_cpu_reschedule(p, physical)) {
8425 + physical = arch_reschedule_idle_override(p, physical);
8426 + p->cpu = physical;
8431 - release_kernel_lock(prev, this_cpu);
8436 + * Current runqueue is empty, or rebalance tick: if there is an
8437 + * inbalance (current runqueue is too short) then pull from
8438 + * busiest runqueue(s).
8440 + * We call this with the current runqueue locked,
8443 +static void load_balance(runqueue_t *this_rq, int idle)
8445 + int imbalance, nr_running, load, max_load,
8446 + idx, i, this_cpu = this_rq - runqueues;
8448 + runqueue_t *busiest, *rq_src;
8449 + prio_array_t *array;
8450 + struct list_head *head, *curr;
8454 - * 'sched_data' is protected by the fact that we can run
8455 - * only one process per CPU.
8456 + * Handle architecture-specific balancing, such as hyperthreading.
8458 - sched_data = & aligned_data[this_cpu].schedule_data;
8459 + if (arch_load_balance(this_cpu, idle))
8462 - spin_lock_irq(&runqueue_lock);
8465 + * We search all runqueues to find the most busy one.
8466 + * We do this lockless to reduce cache-bouncing overhead,
8467 + * we re-check the 'best' source CPU later on again, with
8470 + * We fend off statistical fluctuations in runqueue lengths by
8471 + * saving the runqueue length during the previous load-balancing
8472 + * operation and using the smaller one the current and saved lengths.
8473 + * If a runqueue is long enough for a longer amount of time then
8474 + * we recognize it and pull tasks from it.
8476 + * The 'current runqueue length' is a statistical maximum variable,
8477 + * for that one we take the longer one - to avoid fluctuations in
8478 + * the other direction. So for a load-balance to happen it needs
8479 + * stable long runqueue on the target CPU and stable short runqueue
8480 + * on the local runqueue.
8482 + * We make an exception if this CPU is about to become idle - in
8483 + * that case we are less picky about moving a task across CPUs and
8484 + * take what can be taken.
8486 + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
8487 + nr_running = this_rq->nr_running;
8489 + nr_running = this_rq->prev_nr_running[this_cpu];
8491 - /* move an exhausted RR process to be last.. */
8492 - if (unlikely(prev->policy == SCHED_RR))
8493 - if (!prev->counter) {
8494 - prev->counter = NICE_TO_TICKS(prev->nice);
8495 - move_last_runqueue(prev);
8499 + for (i = 0; i < smp_num_cpus; i++) {
8500 + int logical = cpu_logical_map(i);
8502 - switch (prev->state) {
8503 - case TASK_INTERRUPTIBLE:
8504 - if (signal_pending(prev)) {
8505 - prev->state = TASK_RUNNING;
8509 - del_from_runqueue(prev);
8510 - case TASK_RUNNING:;
8511 + rq_src = cpu_rq(logical);
8512 + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[logical]))
8513 + load = rq_src->nr_running;
8515 + load = this_rq->prev_nr_running[logical];
8516 + this_rq->prev_nr_running[logical] = rq_src->nr_running;
8518 + if ((load > max_load) && (rq_src != this_rq)) {
8523 - prev->need_resched = 0;
8525 + if (likely(!busiest))
8528 + imbalance = (max_load - nr_running) / 2;
8530 + /* It needs an at least ~25% imbalance to trigger balancing. */
8531 + if (!idle && (imbalance < (max_load + 3)/4))
8535 - * this is the scheduler proper:
8536 + * Make sure nothing significant changed since we checked the
8537 + * runqueue length.
8539 + if (double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running) > nr_running ||
8540 + busiest->nr_running < max_load)
8541 + goto out_unlock_retry;
8545 - * Default process to select..
8546 + * We first consider expired tasks. Those will likely not be
8547 + * executed in the near future, and they are most likely to
8548 + * be cache-cold, thus switching CPUs has the least effect
8551 - next = idle_task(this_cpu);
8553 - list_for_each(tmp, &runqueue_head) {
8554 - p = list_entry(tmp, struct task_struct, run_list);
8555 - if (can_schedule(p, this_cpu)) {
8556 - int weight = goodness(p, this_cpu, prev->active_mm);
8558 - c = weight, next = p;
8559 + if (busiest->expired->nr_active)
8560 + array = busiest->expired;
8562 + array = busiest->active;
8566 + /* Start searching at priority 0: */
8570 + idx = sched_find_first_bit(array->bitmap);
8572 + idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
8573 + if (idx == MAX_PRIO) {
8574 + if (array == busiest->expired) {
8575 + array = busiest->active;
8581 - /* Do we need to re-calculate counters? */
8582 - if (unlikely(!c)) {
8583 - struct task_struct *p;
8585 - spin_unlock_irq(&runqueue_lock);
8586 - read_lock(&tasklist_lock);
8588 - p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
8589 - read_unlock(&tasklist_lock);
8590 - spin_lock_irq(&runqueue_lock);
8591 - goto repeat_schedule;
8592 + head = array->queue + idx;
8593 + curr = head->prev;
8595 + tmp = list_entry(curr, task_t, run_list);
8598 + * We do not migrate tasks that are:
8599 + * 1) running (obviously), or
8600 + * 2) cannot be migrated to this CPU due to cpus_allowed, or
8601 + * 3) are cache-hot on their current CPU.
8604 +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
8605 + ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
8606 + ((p) != (rq)->curr) && \
8607 + ((p)->cpus_allowed & (1UL << (this_cpu))))
8609 + curr = curr->prev;
8611 + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
8617 + resched |= pull_task(busiest, array, tmp, this_rq, this_cpu);
8618 + if (--imbalance > 0) {
8625 + spin_unlock(&busiest->lock);
8627 + resched_task(this_rq->curr);
8630 + spin_unlock(&busiest->lock);
8635 - * from this point on nothing can prevent us from
8636 - * switching to the next task, save this fact in
8639 - sched_data->curr = next;
8640 - task_set_cpu(next, this_cpu);
8641 - spin_unlock_irq(&runqueue_lock);
8643 - if (unlikely(prev == next)) {
8644 - /* We won't go through the normal tail, so do this by hand */
8645 - prev->policy &= ~SCHED_YIELD;
8646 - goto same_process;
8648 + * One of the idle_cpu_tick() or the busy_cpu_tick() function will
8649 + * gets called every timer tick, on every CPU. Our balancing action
8650 + * frequency and balancing agressivity depends on whether the CPU is
8653 + * busy-rebalance every 250 msecs. idle-rebalance every 100 msec.
8655 +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
8656 +#define IDLE_REBALANCE_TICK (HZ/10 ?: 1)
8658 +static inline void idle_tick(void)
8660 + if (unlikely(time_before_eq(this_rq()->last_jiffy + IDLE_REBALANCE_TICK, jiffies))) {
8661 + spin_lock(&this_rq()->lock);
8662 + load_balance(this_rq(), 1);
8663 + spin_unlock(&this_rq()->lock);
8664 + this_rq()->last_jiffy = jiffies;
8670 - * maintain the per-process 'last schedule' value.
8671 - * (this has to be recalculated even if we reschedule to
8672 - * the same process) Currently this is only used on SMP,
8673 - * and it's approximate, so we do not have to maintain
8674 - * it while holding the runqueue spinlock.
8676 - sched_data->last_schedule = get_cycles();
8680 - * We drop the scheduler lock early (it's a global spinlock),
8681 - * thus we have to lock the previous process from getting
8682 - * rescheduled during switch_to().
8685 + * We place interactive tasks back into the active array, if possible.
8687 + * To guarantee that this does not starve expired tasks we ignore the
8688 + * interactivity of a task if the first expired task had to wait more
8689 + * than a 'reasonable' amount of time. This deadline timeout is
8690 + * load-dependent, as the frequency of array switched decreases with
8691 + * increasing number of running tasks:
8693 +#define EXPIRED_STARVING(rq) \
8694 + ((rq)->expired_timestamp && \
8695 + (jiffies - (rq)->expired_timestamp >= \
8696 + STARVATION_LIMIT * ((rq)->nr_running) + 1))
8698 -#endif /* CONFIG_SMP */
8700 + * This function gets called by the timer code, with HZ frequency.
8701 + * We call it with interrupts disabled.
8703 +void scheduler_tick(int user_tick, int system)
8705 + int cpu = smp_processor_id();
8706 + runqueue_t *rq = this_rq();
8707 + task_t *p = current;
8709 - kstat.context_swtch++;
8711 - * there are 3 processes which are affected by a context switch:
8713 - * prev == .... ==> (last => next)
8715 - * It's the 'much more previous' 'prev' that is on next's stack,
8716 - * but prev is set to (the just run) 'last' process by switch_to().
8717 - * This might sound slightly confusing but makes tons of sense.
8719 - prepare_to_switch();
8721 - struct mm_struct *mm = next->mm;
8722 - struct mm_struct *oldmm = prev->active_mm;
8724 - BUG_ON(next->active_mm);
8725 - next->active_mm = oldmm;
8726 - atomic_inc(&oldmm->mm_count);
8727 - enter_lazy_tlb(oldmm, next, this_cpu);
8729 - BUG_ON(next->active_mm != mm);
8730 - switch_mm(oldmm, mm, next, this_cpu);
8731 + if (p == rq->idle) {
8732 + if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
8733 + kstat.per_cpu_system[cpu] += system;
8739 + if (TASK_NICE(p) > 0)
8740 + kstat.per_cpu_nice[cpu] += user_tick;
8742 + kstat.per_cpu_user[cpu] += user_tick;
8743 + kstat.per_cpu_system[cpu] += system;
8745 + /* Task might have expired already, but not scheduled off yet */
8746 + if (p->array != rq->active) {
8747 + set_tsk_need_resched(p);
8750 + spin_lock(&rq->lock);
8751 + if (unlikely(rt_task(p))) {
8753 + * RR tasks need a special form of timeslice management.
8754 + * FIFO tasks have no timeslices.
8756 + if ((p->policy == SCHED_RR) && !--p->time_slice) {
8757 + p->time_slice = TASK_TIMESLICE(p);
8758 + p->first_time_slice = 0;
8759 + set_tsk_need_resched(p);
8761 + /* put it at the end of the queue: */
8762 + dequeue_task(p, rq->active);
8763 + enqueue_task(p, rq->active);
8768 + * The task was running during this tick - update the
8769 + * time slice counter and the sleep average. Note: we
8770 + * do not update a process's priority until it either
8771 + * goes to sleep or uses up its timeslice. This makes
8772 + * it possible for interactive tasks to use up their
8773 + * timeslices at their highest priority levels.
8777 + if (!--p->time_slice) {
8778 + dequeue_task(p, rq->active);
8779 + set_tsk_need_resched(p);
8780 + p->prio = effective_prio(p);
8781 + p->time_slice = TASK_TIMESLICE(p);
8782 + p->first_time_slice = 0;
8784 + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
8785 + if (!rq->expired_timestamp)
8786 + rq->expired_timestamp = jiffies;
8787 + enqueue_task(p, rq->expired);
8789 + enqueue_task(p, rq->active);
8793 + if (unlikely(time_before_eq(this_rq()->last_jiffy + BUSY_REBALANCE_TICK, jiffies))) {
8794 + load_balance(rq, 0);
8795 + rq->last_jiffy = jiffies;
8798 + spin_unlock(&rq->lock);
8801 +void scheduling_functions_start_here(void) { }
8804 + * 'schedule()' is the main scheduler function.
8806 +asmlinkage void schedule(void)
8808 + task_t *prev, *next;
8810 + prio_array_t *array;
8811 + struct list_head *queue;
8814 + if (unlikely(in_interrupt()))
8818 - prev->active_mm = NULL;
8824 + release_kernel_lock(prev, smp_processor_id());
8825 + prev->sleep_timestamp = jiffies;
8826 + spin_lock_irq(&rq->lock);
8828 + switch (prev->state) {
8829 + case TASK_INTERRUPTIBLE:
8830 + if (unlikely(signal_pending(prev))) {
8831 + prev->state = TASK_RUNNING;
8835 + deactivate_task(prev, rq);
8836 + case TASK_RUNNING:
8842 + if (unlikely(!rq->nr_running)) {
8844 + load_balance(rq, 2);
8845 + rq->last_jiffy = jiffies;
8846 + if (rq->nr_running)
8847 + goto pick_next_task;
8850 + rq->expired_timestamp = 0;
8851 + goto switch_tasks;
8855 - * This just switches the register state and the
8858 - switch_to(prev, next, prev);
8859 - __schedule_tail(prev);
8860 + array = rq->active;
8861 + if (unlikely(!array->nr_active)) {
8863 + * Switch the active and expired arrays.
8865 + rq->active = rq->expired;
8866 + rq->expired = array;
8867 + array = rq->active;
8868 + rq->expired_timestamp = 0;
8871 + idx = sched_find_first_bit(array->bitmap);
8872 + queue = array->queue + idx;
8873 + next = list_entry(queue->next, task_t, run_list);
8877 + clear_tsk_need_resched(prev);
8879 + if (likely(prev != next)) {
8880 + rq->nr_switches++;
8883 + prepare_arch_switch(rq, next);
8884 + prev = context_switch(prev, next);
8887 + finish_arch_switch(rq, prev);
8889 + spin_unlock_irq(&rq->lock);
8892 reacquire_kernel_lock(current);
8893 - if (current->need_resched)
8894 - goto need_resched_back;
8896 + if (need_resched())
8897 + goto need_resched;
8901 - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything
8902 - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
8903 - * non-exclusive tasks and one exclusive task.
8904 + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
8905 + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
8906 + * number) then we wake all the non-exclusive tasks and one exclusive task.
8908 * There are circumstances in which we can try to wake a task which has already
8909 - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
8910 - * in this (rare) case, and we handle it by contonuing to scan the queue.
8911 + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
8912 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
8914 -static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
8915 - int nr_exclusive, const int sync)
8916 +static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync)
8918 struct list_head *tmp;
8919 - struct task_struct *p;
8921 - CHECK_MAGIC_WQHEAD(q);
8922 - WQ_CHECK_LIST_HEAD(&q->task_list);
8924 - list_for_each(tmp,&q->task_list) {
8925 - unsigned int state;
8926 - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
8927 + unsigned int state;
8928 + wait_queue_t *curr;
8931 - CHECK_MAGIC(curr->__magic);
8932 + list_for_each(tmp, &q->task_list) {
8933 + curr = list_entry(tmp, wait_queue_t, task_list);
8936 - if (state & mode) {
8937 - WQ_NOTE_WAKER(curr);
8938 - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
8939 + if ((state & mode) && try_to_wake_up(p, sync) &&
8940 + ((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive))
8946 -void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr)
8947 +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
8950 - unsigned long flags;
8951 - wq_read_lock_irqsave(&q->lock, flags);
8952 - __wake_up_common(q, mode, nr, 0);
8953 - wq_read_unlock_irqrestore(&q->lock, flags);
8955 + unsigned long flags;
8960 + wq_read_lock_irqsave(&q->lock, flags);
8961 + __wake_up_common(q, mode, nr_exclusive, 0);
8962 + wq_read_unlock_irqrestore(&q->lock, flags);
8965 -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)
8968 +void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
8971 - unsigned long flags;
8972 - wq_read_lock_irqsave(&q->lock, flags);
8973 - __wake_up_common(q, mode, nr, 1);
8974 - wq_read_unlock_irqrestore(&q->lock, flags);
8976 + unsigned long flags;
8981 + wq_read_lock_irqsave(&q->lock, flags);
8982 + if (likely(nr_exclusive))
8983 + __wake_up_common(q, mode, nr_exclusive, 1);
8985 + __wake_up_common(q, mode, nr_exclusive, 0);
8986 + wq_read_unlock_irqrestore(&q->lock, flags);
8991 void fastcall complete(struct completion *x)
8993 unsigned long flags;
8995 - spin_lock_irqsave(&x->wait.lock, flags);
8996 + wq_write_lock_irqsave(&x->wait.lock, flags);
8998 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0);
8999 - spin_unlock_irqrestore(&x->wait.lock, flags);
9000 + wq_write_unlock_irqrestore(&x->wait.lock, flags);
9003 void fastcall wait_for_completion(struct completion *x)
9005 - spin_lock_irq(&x->wait.lock);
9006 + wq_write_lock_irq(&x->wait.lock);
9008 DECLARE_WAITQUEUE(wait, current);
9010 @@ -775,14 +1060,14 @@
9011 __add_wait_queue_tail(&x->wait, &wait);
9013 __set_current_state(TASK_UNINTERRUPTIBLE);
9014 - spin_unlock_irq(&x->wait.lock);
9015 + wq_write_unlock_irq(&x->wait.lock);
9017 - spin_lock_irq(&x->wait.lock);
9018 + wq_write_lock_irq(&x->wait.lock);
9020 __remove_wait_queue(&x->wait, &wait);
9023 - spin_unlock_irq(&x->wait.lock);
9024 + wq_write_unlock_irq(&x->wait.lock);
9027 #define SLEEP_ON_VAR \
9028 @@ -850,43 +1135,40 @@
9030 void scheduling_functions_end_here(void) { }
9034 - * set_cpus_allowed() - change a given task's processor affinity
9035 - * @p: task to bind
9036 - * @new_mask: bitmask of allowed processors
9038 - * Upon return, the task is running on a legal processor. Note the caller
9039 - * must have a valid reference to the task: it must not exit() prematurely.
9040 - * This call can sleep; do not hold locks on call.
9042 -void set_cpus_allowed(struct task_struct *p, unsigned long new_mask)
9043 +void set_user_nice(task_t *p, long nice)
9045 - new_mask &= cpu_online_map;
9046 - BUG_ON(!new_mask);
9048 - p->cpus_allowed = new_mask;
9049 + unsigned long flags;
9050 + prio_array_t *array;
9053 + if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
9056 - * If the task is on a no-longer-allowed processor, we need to move
9057 - * it. If the task is not current, then set need_resched and send
9058 - * its processor an IPI to reschedule.
9059 + * We have to be careful, if called from sys_setpriority(),
9060 + * the task might be in the middle of scheduling on another CPU.
9062 - if (!(p->cpus_runnable & p->cpus_allowed)) {
9063 - if (p != current) {
9064 - p->need_resched = 1;
9065 - smp_send_reschedule(p->processor);
9067 + rq = task_rq_lock(p, &flags);
9069 + p->static_prio = NICE_TO_PRIO(nice);
9074 + dequeue_task(p, array);
9075 + p->static_prio = NICE_TO_PRIO(nice);
9076 + p->prio = NICE_TO_PRIO(nice);
9078 + enqueue_task(p, array);
9080 - * Wait until we are on a legal processor. If the task is
9081 - * current, then we should be on a legal processor the next
9082 - * time we reschedule. Otherwise, we need to wait for the IPI.
9083 + * If the task is running and lowered its priority,
9084 + * or increased its priority then reschedule its CPU:
9086 - while (!(p->cpus_runnable & p->cpus_allowed))
9088 + if (p == rq->curr)
9089 + resched_task(rq->curr);
9092 + task_rq_unlock(rq, &flags);
9094 -#endif /* CONFIG_SMP */
9098 @@ -898,7 +1180,7 @@
9100 asmlinkage long sys_nice(int increment)
9106 * Setpriority might change our priority at the same moment.
9107 @@ -914,32 +1196,46 @@
9111 - newprio = current->nice + increment;
9112 - if (newprio < -20)
9116 - current->nice = newprio;
9117 + nice = PRIO_TO_NICE(current->static_prio) + increment;
9122 + set_user_nice(current, nice);
9128 -static inline struct task_struct *find_process_by_pid(pid_t pid)
9130 + * This is the priority value as seen by users in /proc
9132 + * RT tasks are offset by -200. Normal tasks are centered
9133 + * around 0, value goes from -16 to +15.
9135 +int task_prio(task_t *p)
9137 - struct task_struct *tsk = current;
9138 + return p->prio - MAX_USER_RT_PRIO;
9142 - tsk = find_task_by_pid(pid);
9144 +int task_nice(task_t *p)
9146 + return TASK_NICE(p);
9149 +static inline task_t *find_process_by_pid(pid_t pid)
9151 + return pid ? find_task_by_pid(pid) : current;
9154 -static int setscheduler(pid_t pid, int policy,
9155 - struct sched_param *param)
9156 +static int setscheduler(pid_t pid, int policy, struct sched_param *param)
9158 struct sched_param lp;
9159 - struct task_struct *p;
9160 + prio_array_t *array;
9161 + unsigned long flags;
9167 if (!param || pid < 0)
9168 @@ -953,14 +1249,19 @@
9169 * We play safe to avoid deadlocks.
9171 read_lock_irq(&tasklist_lock);
9172 - spin_lock(&runqueue_lock);
9174 p = find_process_by_pid(pid);
9180 + goto out_unlock_tasklist;
9183 + * To be able to change p->policy safely, the apropriate
9184 + * runqueue lock must be held.
9186 + rq = task_rq_lock(p, &flags);
9191 @@ -969,40 +1270,48 @@
9192 policy != SCHED_OTHER)
9198 - * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
9199 - * priority for SCHED_OTHER is 0.
9200 + * Valid priorities for SCHED_FIFO and SCHED_RR are
9201 + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0.
9204 - if (lp.sched_priority < 0 || lp.sched_priority > 99)
9205 + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
9207 if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
9211 - if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
9212 + if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
9213 !capable(CAP_SYS_NICE))
9215 if ((current->euid != p->euid) && (current->euid != p->uid) &&
9216 !capable(CAP_SYS_NICE))
9221 + deactivate_task(p, task_rq(p));
9224 p->rt_priority = lp.sched_priority;
9226 - current->need_resched = 1;
9227 + if (policy != SCHED_OTHER)
9228 + p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
9230 + p->prio = p->static_prio;
9232 + activate_task(p, task_rq(p));
9235 - spin_unlock(&runqueue_lock);
9236 + task_rq_unlock(rq, &flags);
9237 +out_unlock_tasklist:
9238 read_unlock_irq(&tasklist_lock);
9244 -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
9245 +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
9246 struct sched_param *param)
9248 return setscheduler(pid, policy, param);
9249 @@ -1015,7 +1324,7 @@
9251 asmlinkage long sys_sched_getscheduler(pid_t pid)
9253 - struct task_struct *p;
9258 @@ -1026,7 +1335,7 @@
9259 read_lock(&tasklist_lock);
9260 p = find_process_by_pid(pid);
9262 - retval = p->policy & ~SCHED_YIELD;
9263 + retval = p->policy;
9264 read_unlock(&tasklist_lock);
9267 @@ -1035,7 +1344,7 @@
9269 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
9271 - struct task_struct *p;
9273 struct sched_param lp;
9276 @@ -1066,42 +1375,64 @@
9278 asmlinkage long sys_sched_yield(void)
9281 - * Trick. sched_yield() first counts the number of truly
9282 - * 'pending' runnable processes, then returns if it's
9283 - * only the current processes. (This test does not have
9284 - * to be atomic.) In threaded applications this optimization
9285 - * gets triggered quite often.
9287 + runqueue_t *rq = this_rq();
9288 + prio_array_t *array;
9291 - int nr_pending = nr_running;
9292 + spin_lock_irq(&rq->lock);
9294 + if (unlikely(rq->nr_running == 1)) {
9295 + spin_unlock_irq(&rq->lock);
9301 + array = current->array;
9302 + if (unlikely(rt_task(current))) {
9303 + list_del(&current->run_list);
9304 + list_add_tail(&current->run_list, array->queue + current->prio);
9308 - // Subtract non-idle processes running on other CPUs.
9309 - for (i = 0; i < smp_num_cpus; i++) {
9310 - int cpu = cpu_logical_map(i);
9311 - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
9313 + if (unlikely(array == rq->expired) && rq->active->nr_active)
9316 + list_del(&current->run_list);
9317 + if (!list_empty(array->queue + current->prio)) {
9318 + list_add(&current->run_list, array->queue[current->prio].next);
9322 - // on UP this process is on the runqueue as well
9327 + __clear_bit(current->prio, array->bitmap);
9328 + if (likely(array == rq->active) && array->nr_active == 1) {
9330 - * This process can only be rescheduled by us,
9331 - * so this is safe without any locking.
9332 + * We're the last task in the active queue so
9333 + * we must move ourself to the expired array
9334 + * to avoid running again immediatly.
9336 - if (current->policy == SCHED_OTHER)
9337 - current->policy |= SCHED_YIELD;
9338 - current->need_resched = 1;
9340 - spin_lock_irq(&runqueue_lock);
9341 - move_last_runqueue(current);
9342 - spin_unlock_irq(&runqueue_lock);
9343 + array->nr_active--;
9344 + array = rq->expired;
9345 + array->nr_active++;
9348 + i = sched_find_first_bit(array->bitmap);
9350 + BUG_ON(i == MAX_PRIO);
9351 + BUG_ON(i == current->prio && array == current->array);
9353 + if (array == current->array && i < current->prio)
9354 + i = current->prio;
9356 + current->array = array;
9357 + current->prio = i;
9359 + list_add(&current->run_list, array->queue[i].next);
9360 + __set_bit(i, array->bitmap);
9363 + spin_unlock_irq(&rq->lock);
9370 @@ -1113,14 +1444,13 @@
9374 - set_current_state(TASK_RUNNING);
9375 + __set_current_state(TASK_RUNNING);
9380 void __cond_resched(void)
9382 - set_current_state(TASK_RUNNING);
9383 + __set_current_state(TASK_RUNNING);
9387 @@ -1131,7 +1461,7 @@
9392 + ret = MAX_USER_RT_PRIO-1;
9396 @@ -1158,7 +1488,7 @@
9397 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
9400 - struct task_struct *p;
9402 int retval = -EINVAL;
9405 @@ -1168,8 +1498,8 @@
9406 read_lock(&tasklist_lock);
9407 p = find_process_by_pid(pid);
9409 - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
9411 + jiffies_to_timespec(p->policy & SCHED_FIFO ?
9412 + 0 : TASK_TIMESLICE(p), &t);
9413 read_unlock(&tasklist_lock);
9415 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
9416 @@ -1177,14 +1507,14 @@
9420 -static void show_task(struct task_struct * p)
9421 +static void show_task(task_t * p)
9423 unsigned long free = 0;
9425 static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
9427 printk("%-13.13s ", p->comm);
9428 - state = p->state ? ffz(~p->state) + 1 : 0;
9429 + state = p->state ? __ffs(p->state) + 1 : 0;
9430 if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
9431 printk(stat_nam[state]);
9433 @@ -1225,7 +1555,7 @@
9434 printk(" (NOTLB)\n");
9437 - extern void show_trace_task(struct task_struct *tsk);
9438 + extern void show_trace_task(task_t *tsk);
9442 @@ -1247,7 +1577,7 @@
9444 void show_state(void)
9446 - struct task_struct *p;
9449 #if (BITS_PER_LONG == 32)
9451 @@ -1270,128 +1600,280 @@
9452 read_unlock(&tasklist_lock);
9456 - * reparent_to_init() - Reparent the calling kernel thread to the init task.
9458 - * If a kernel thread is launched as a result of a system call, or if
9459 - * it ever exits, it should generally reparent itself to init so that
9460 - * it is correctly cleaned up on exit.
9462 + * double_rq_lock - safely lock two runqueues
9464 - * The various task state such as scheduling policy and priority may have
9465 - * been inherited fro a user process, so we reset them to sane values here.
9466 + * Note this does not disable interrupts like task_rq_lock,
9467 + * you need to do so manually before calling.
9469 +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
9472 + spin_lock(&rq1->lock);
9475 + spin_lock(&rq1->lock);
9476 + spin_lock(&rq2->lock);
9478 + spin_lock(&rq2->lock);
9479 + spin_lock(&rq1->lock);
9485 + * double_rq_unlock - safely unlock two runqueues
9487 - * NOTE that reparent_to_init() gives the caller full capabilities.
9488 + * Note this does not restore interrupts like task_rq_unlock,
9489 + * you need to do so manually after calling.
9491 -void reparent_to_init(void)
9492 +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
9494 - struct task_struct *this_task = current;
9495 + spin_unlock(&rq1->lock);
9497 + spin_unlock(&rq2->lock);
9500 - write_lock_irq(&tasklist_lock);
9501 +void __init init_idle(task_t *idle, int cpu)
9503 + runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(idle->cpu);
9504 + unsigned long flags;
9506 - /* Reparent to init */
9507 - REMOVE_LINKS(this_task);
9508 - this_task->p_pptr = child_reaper;
9509 - this_task->p_opptr = child_reaper;
9510 - SET_LINKS(this_task);
9511 + __save_flags(flags);
9513 + double_rq_lock(idle_rq, rq);
9515 + idle_rq->curr = idle_rq->idle = idle;
9516 + deactivate_task(idle, rq);
9517 + idle->array = NULL;
9518 + idle->prio = MAX_PRIO;
9519 + idle->state = TASK_RUNNING;
9521 + double_rq_unlock(idle_rq, rq);
9522 + set_tsk_need_resched(idle);
9523 + __restore_flags(flags);
9526 +extern void init_timervecs(void);
9527 +extern void timer_bh(void);
9528 +extern void tqueue_bh(void);
9529 +extern void immediate_bh(void);
9531 +void __init sched_init(void)
9536 + for (i = 0; i < NR_CPUS; i++) {
9537 + prio_array_t *array;
9539 - /* Set the exit signal to SIGCHLD so we signal init on exit */
9540 - this_task->exit_signal = SIGCHLD;
9542 + rq->active = rq->arrays;
9543 + rq->expired = rq->arrays + 1;
9544 + spin_lock_init(&rq->lock);
9546 + INIT_LIST_HEAD(&rq->migration_queue);
9549 - /* We also take the runqueue_lock while altering task fields
9550 - * which affect scheduling decisions */
9551 - spin_lock(&runqueue_lock);
9552 + for (j = 0; j < 2; j++) {
9553 + array = rq->arrays + j;
9554 + for (k = 0; k < MAX_PRIO; k++) {
9555 + INIT_LIST_HEAD(array->queue + k);
9556 + __clear_bit(k, array->bitmap);
9558 + // delimiter for bitsearch
9559 + __set_bit(MAX_PRIO, array->bitmap);
9563 + * We have to do a little magic to get the first
9564 + * process right in SMP mode.
9567 + rq->curr = current;
9568 + rq->idle = current;
9569 + current->cpu = smp_processor_id();
9570 + wake_up_process(current);
9572 - this_task->ptrace = 0;
9573 - this_task->nice = DEF_NICE;
9574 - this_task->policy = SCHED_OTHER;
9575 - /* cpus_allowed? */
9576 - /* rt_priority? */
9578 - this_task->cap_effective = CAP_INIT_EFF_SET;
9579 - this_task->cap_inheritable = CAP_INIT_INH_SET;
9580 - this_task->cap_permitted = CAP_FULL_SET;
9581 - this_task->keep_capabilities = 0;
9582 - memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
9583 - switch_uid(INIT_USER);
9585 + init_bh(TIMER_BH, timer_bh);
9586 + init_bh(TQUEUE_BH, tqueue_bh);
9587 + init_bh(IMMEDIATE_BH, immediate_bh);
9589 - spin_unlock(&runqueue_lock);
9590 - write_unlock_irq(&tasklist_lock);
9592 + * The boot idle thread does lazy MMU switching as well:
9594 + atomic_inc(&init_mm.mm_count);
9595 + enter_lazy_tlb(&init_mm, current, smp_processor_id());
9601 - * Put all the gunge required to become a kernel thread without
9602 - * attached user resources in one place where it belongs.
9604 + * This is how migration works:
9606 + * 1) we queue a migration_req_t structure in the source CPU's
9607 + * runqueue and wake up that CPU's migration thread.
9608 + * 2) we down() the locked semaphore => thread blocks.
9609 + * 3) migration thread wakes up (implicitly it forces the migrated
9610 + * thread off the CPU)
9611 + * 4) it gets the migration request and checks whether the migrated
9612 + * task is still in the wrong runqueue.
9613 + * 5) if it's in the wrong runqueue then the migration thread removes
9614 + * it and puts it into the right queue.
9615 + * 6) migration thread up()s the semaphore.
9616 + * 7) we wake up and the migration is done.
9620 + struct list_head list;
9622 + struct completion done;
9625 -void daemonize(void)
9627 + * Change a given task's CPU affinity. Migrate the process to a
9628 + * proper CPU and schedule it away if the CPU it's executing on
9629 + * is removed from the allowed bitmask.
9631 + * NOTE: the caller must have a valid reference to the task, the
9632 + * task must not exit() & deallocate itself prematurely. The
9633 + * call is not atomic; no spinlocks may be held.
9635 +void set_cpus_allowed(task_t *p, unsigned long new_mask)
9637 - struct fs_struct *fs;
9638 + unsigned long flags;
9639 + migration_req_t req;
9642 + new_mask &= cpu_online_map;
9646 + rq = task_rq_lock(p, &flags);
9647 + p->cpus_allowed = new_mask;
9649 - * If we were started as result of loading a module, close all of the
9650 - * user space pages. We don't need them, and if we didn't close them
9651 - * they would be locked into memory.
9652 + * Can the task run on the task's current CPU? If not then
9653 + * migrate the process off to a proper CPU.
9656 + if (new_mask & (1UL << p->cpu)) {
9657 + task_rq_unlock(rq, &flags);
9661 - current->session = 1;
9662 - current->pgrp = 1;
9663 - current->tty = NULL;
9665 + * If the task is not on a runqueue, then it is safe to
9666 + * simply update the task's cpu field.
9668 + if (!p->array && (p != rq->curr)) {
9669 + p->cpu = __ffs(p->cpus_allowed);
9670 + task_rq_unlock(rq, &flags);
9674 - /* Become as one with the init task */
9675 + init_completion(&req.done);
9677 + list_add(&req.list, &rq->migration_queue);
9678 + task_rq_unlock(rq, &flags);
9679 + wake_up_process(rq->migration_thread);
9681 - exit_fs(current); /* current->fs->count--; */
9682 - fs = init_task.fs;
9684 - atomic_inc(&fs->count);
9685 - exit_files(current);
9686 - current->files = init_task.files;
9687 - atomic_inc(&current->files->count);
9688 + wait_for_completion(&req.done);
9691 -extern unsigned long wait_init_idle;
9692 +static __initdata int master_migration_thread;
9694 -void __init init_idle(void)
9695 +static int migration_thread(void * bind_cpu)
9697 - struct schedule_data * sched_data;
9698 - sched_data = &aligned_data[smp_processor_id()].schedule_data;
9699 + int cpu = cpu_logical_map((int) (long) bind_cpu);
9700 + struct sched_param param = { sched_priority: MAX_RT_PRIO-1 };
9704 - if (current != &init_task && task_on_runqueue(current)) {
9705 - printk("UGH! (%d:%d) was on the runqueue, removing.\n",
9706 - smp_processor_id(), current->pid);
9707 - del_from_runqueue(current);
9709 + sigfillset(&current->blocked);
9710 + set_fs(KERNEL_DS);
9712 + * The first migration thread is started on the boot CPU, it
9713 + * migrates the other migration threads to their destination CPUs.
9715 + if (cpu != master_migration_thread) {
9716 + while (!cpu_rq(master_migration_thread)->migration_thread)
9718 + set_cpus_allowed(current, 1UL << cpu);
9720 - sched_data->curr = current;
9721 - sched_data->last_schedule = get_cycles();
9722 - clear_bit(current->processor, &wait_init_idle);
9724 + printk("migration_task %d on cpu=%d\n", cpu, smp_processor_id());
9725 + ret = setscheduler(0, SCHED_FIFO, &param);
9727 -extern void init_timervecs (void);
9729 + rq->migration_thread = current;
9731 -void __init sched_init(void)
9734 - * We have to do a little magic to get the first
9735 - * process right in SMP mode.
9737 - int cpu = smp_processor_id();
9739 + sprintf(current->comm, "migration_CPU%d", smp_processor_id());
9741 - init_task.processor = cpu;
9743 + runqueue_t *rq_src, *rq_dest;
9744 + struct list_head *head;
9745 + int cpu_src, cpu_dest;
9746 + migration_req_t *req;
9747 + unsigned long flags;
9750 - for(nr = 0; nr < PIDHASH_SZ; nr++)
9751 - pidhash[nr] = NULL;
9752 + spin_lock_irqsave(&rq->lock, flags);
9753 + head = &rq->migration_queue;
9754 + current->state = TASK_INTERRUPTIBLE;
9755 + if (list_empty(head)) {
9756 + spin_unlock_irqrestore(&rq->lock, flags);
9760 + req = list_entry(head->next, migration_req_t, list);
9761 + list_del_init(head->next);
9762 + spin_unlock_irqrestore(&rq->lock, flags);
9765 + cpu_dest = __ffs(p->cpus_allowed);
9766 + rq_dest = cpu_rq(cpu_dest);
9769 + rq_src = cpu_rq(cpu_src);
9771 + local_irq_save(flags);
9772 + double_rq_lock(rq_src, rq_dest);
9773 + if (p->cpu != cpu_src) {
9774 + double_rq_unlock(rq_src, rq_dest);
9775 + local_irq_restore(flags);
9778 + if (rq_src == rq) {
9779 + p->cpu = cpu_dest;
9781 + deactivate_task(p, rq_src);
9782 + activate_task(p, rq_dest);
9785 + double_rq_unlock(rq_src, rq_dest);
9786 + local_irq_restore(flags);
9789 + complete(&req->done);
9793 - init_bh(TIMER_BH, timer_bh);
9794 - init_bh(TQUEUE_BH, tqueue_bh);
9795 - init_bh(IMMEDIATE_BH, immediate_bh);
9796 +void __init migration_init(void)
9801 - * The boot idle thread does lazy MMU switching as well:
9803 - atomic_inc(&init_mm.mm_count);
9804 - enter_lazy_tlb(&init_mm, current, cpu);
9805 + master_migration_thread = smp_processor_id();
9806 + current->cpus_allowed = 1UL << master_migration_thread;
9808 + for (cpu = 0; cpu < smp_num_cpus; cpu++) {
9809 + if (kernel_thread(migration_thread, (void *) (long) cpu,
9810 + CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
9813 + current->cpus_allowed = -1L;
9815 + for (cpu = 0; cpu < smp_num_cpus; cpu++)
9816 + while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
9817 + schedule_timeout(2);
9820 +#endif /* CONFIG_SMP */
9821 diff -urN linux-2.4.24.org/kernel/signal.c linux-2.4.24/kernel/signal.c
9822 --- linux-2.4.24.org/kernel/signal.c 2004-02-04 20:47:26.821945338 +0100
9823 +++ linux-2.4.24/kernel/signal.c 2004-02-04 20:52:55.082667907 +0100
9824 @@ -507,12 +507,9 @@
9825 * process of changing - but no harm is done by that
9826 * other than doing an extra (lightweight) IPI interrupt.
9828 - spin_lock(&runqueue_lock);
9829 - if (task_has_cpu(t) && t->processor != smp_processor_id())
9830 - smp_send_reschedule(t->processor);
9831 - spin_unlock(&runqueue_lock);
9832 -#endif /* CONFIG_SMP */
9834 + if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
9835 + kick_if_running(t);
9837 if (t->state & TASK_INTERRUPTIBLE) {
9840 diff -urN linux-2.4.24.org/kernel/softirq.c linux-2.4.24/kernel/softirq.c
9841 --- linux-2.4.24.org/kernel/softirq.c 2004-02-04 20:47:27.211864234 +0100
9842 +++ linux-2.4.24/kernel/softirq.c 2004-02-04 20:52:55.110662084 +0100
9843 @@ -364,13 +364,13 @@
9844 int cpu = cpu_logical_map(bind_cpu);
9847 - current->nice = 19;
9848 + set_user_nice(current, 19);
9849 sigfillset(&current->blocked);
9851 /* Migrate to the right CPU */
9852 - current->cpus_allowed = 1UL << cpu;
9853 - while (smp_processor_id() != cpu)
9855 + set_cpus_allowed(current, 1UL << cpu);
9859 sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
9865 -static __init int spawn_ksoftirqd(void)
9866 +__init int spawn_ksoftirqd(void)
9870 diff -urN linux-2.4.24.org/kernel/sys.c linux-2.4.24/kernel/sys.c
9871 --- linux-2.4.24.org/kernel/sys.c 2004-02-04 20:47:26.739962391 +0100
9872 +++ linux-2.4.24/kernel/sys.c 2004-02-04 20:52:55.139656054 +0100
9873 @@ -239,10 +239,10 @@
9875 if (error == -ESRCH)
9877 - if (niceval < p->nice && !capable(CAP_SYS_NICE))
9878 + if (niceval < task_nice(p) && !capable(CAP_SYS_NICE))
9881 - p->nice = niceval;
9882 + set_user_nice(p, niceval);
9884 read_unlock(&tasklist_lock);
9888 if (!proc_sel(p, which, who))
9890 - niceval = 20 - p->nice;
9891 + niceval = 20 - task_nice(p);
9892 if (niceval > retval)
9895 diff -urN linux-2.4.24.org/kernel/timer.c linux-2.4.24/kernel/timer.c
9896 --- linux-2.4.24.org/kernel/timer.c 2004-02-04 20:47:27.115884198 +0100
9897 +++ linux-2.4.24/kernel/timer.c 2004-02-04 20:52:55.155652727 +0100
9900 #include <asm/uaccess.h>
9902 +struct kernel_stat kstat;
9905 * Timekeeping variables
9907 @@ -598,25 +600,7 @@
9908 int cpu = smp_processor_id(), system = user_tick ^ 1;
9910 update_one_process(p, user_tick, system, cpu);
9912 - if (--p->counter <= 0) {
9915 - * SCHED_FIFO is priority preemption, so this is
9916 - * not the place to decide whether to reschedule a
9917 - * SCHED_FIFO task or not - Bhavesh Davda
9919 - if (p->policy != SCHED_FIFO) {
9920 - p->need_resched = 1;
9924 - kstat.per_cpu_nice[cpu] += user_tick;
9926 - kstat.per_cpu_user[cpu] += user_tick;
9927 - kstat.per_cpu_system[cpu] += system;
9928 - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
9929 - kstat.per_cpu_system[cpu] += system;
9930 + scheduler_tick(user_tick, system);
9934 @@ -624,17 +608,7 @@
9936 static unsigned long count_active_tasks(void)
9938 - struct task_struct *p;
9939 - unsigned long nr = 0;
9941 - read_lock(&tasklist_lock);
9942 - for_each_task(p) {
9943 - if ((p->state == TASK_RUNNING ||
9944 - (p->state & TASK_UNINTERRUPTIBLE)))
9947 - read_unlock(&tasklist_lock);
9949 + return (nr_running() + nr_uninterruptible()) * FIXED_1;
9953 @@ -827,6 +801,89 @@
9957 +static void process_timeout(unsigned long __data)
9959 + wake_up_process((task_t *)__data);
9963 + * schedule_timeout - sleep until timeout
9964 + * @timeout: timeout value in jiffies
9966 + * Make the current task sleep until @timeout jiffies have
9967 + * elapsed. The routine will return immediately unless
9968 + * the current task state has been set (see set_current_state()).
9970 + * You can set the task state as follows -
9972 + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
9973 + * pass before the routine returns. The routine will return 0
9975 + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
9976 + * delivered to the current task. In this case the remaining time
9977 + * in jiffies will be returned, or 0 if the timer expired in time
9979 + * The current task state is guaranteed to be TASK_RUNNING when this
9980 + * routine returns.
9982 + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
9983 + * the CPU away without a bound on the timeout. In this case the return
9984 + * value will be %MAX_SCHEDULE_TIMEOUT.
9986 + * In all cases the return value is guaranteed to be non-negative.
9988 +signed long schedule_timeout(signed long timeout)
9990 + struct timer_list timer;
9991 + unsigned long expire;
9995 + case MAX_SCHEDULE_TIMEOUT:
9997 + * These two special cases are useful to be comfortable
9998 + * in the caller. Nothing more. We could take
9999 + * MAX_SCHEDULE_TIMEOUT from one of the negative value
10000 + * but I' d like to return a valid offset (>=0) to allow
10001 + * the caller to do everything it want with the retval.
10007 + * Another bit of PARANOID. Note that the retval will be
10008 + * 0 since no piece of kernel is supposed to do a check
10009 + * for a negative retval of schedule_timeout() (since it
10010 + * should never happens anyway). You just have the printk()
10011 + * that will tell you if something is gone wrong and where.
10015 + printk(KERN_ERR "schedule_timeout: wrong timeout "
10016 + "value %lx from %p\n", timeout,
10017 + __builtin_return_address(0));
10018 + current->state = TASK_RUNNING;
10023 + expire = timeout + jiffies;
10025 + init_timer(&timer);
10026 + timer.expires = expire;
10027 + timer.data = (unsigned long) current;
10028 + timer.function = process_timeout;
10030 + add_timer(&timer);
10032 + del_timer_sync(&timer);
10034 + timeout = expire - jiffies;
10037 + return timeout < 0 ? 0 : timeout;
10040 /* Thread ID - the internal kernel "pid" */
10041 asmlinkage long sys_gettid(void)
10043 @@ -873,4 +930,3 @@
10048 diff -urN linux-2.4.24.org/mm/oom_kill.c linux-2.4.24/mm/oom_kill.c
10049 --- linux-2.4.24.org/mm/oom_kill.c 2004-02-04 20:47:28.626569974 +0100
10050 +++ linux-2.4.24/mm/oom_kill.c 2004-02-04 20:57:30.567369583 +0100
10052 * Niced processes are most likely less important, so double
10053 * their badness points.
10056 + if (task_nice(p) > 0)
10060 @@ -150,7 +150,7 @@
10061 * all the memory it needs. That way it should be able to
10062 * exit() and clear out its resources quickly...
10064 - p->counter = 5 * HZ;
10065 + p->time_slice = HZ;
10066 p->flags |= PF_MEMALLOC | PF_MEMDIE;
10068 /* This process has hardware access, be more careful. */
10069 diff -urN linux-2.4.24.org/net/bluetooth/bnep/core.c linux-2.4.24/net/bluetooth/bnep/core.c
10070 --- linux-2.4.24.org/net/bluetooth/bnep/core.c 2004-02-04 20:48:41.535404904 +0100
10071 +++ linux-2.4.24/net/bluetooth/bnep/core.c 2004-02-04 20:52:55.199643577 +0100
10072 @@ -460,7 +460,7 @@
10073 sigfillset(&current->blocked);
10074 flush_signals(current);
10076 - current->nice = -15;
10077 + set_user_nice(current, -15);
10081 diff -urN linux-2.4.24.org/net/bluetooth/cmtp/core.c linux-2.4.24/net/bluetooth/cmtp/core.c
10082 --- linux-2.4.24.org/net/bluetooth/cmtp/core.c 2004-02-04 20:48:41.311451486 +0100
10083 +++ linux-2.4.24/net/bluetooth/cmtp/core.c 2004-02-04 20:52:55.224638378 +0100
10084 @@ -298,7 +298,7 @@
10085 sigfillset(&current->blocked);
10086 flush_signals(current);
10088 - current->nice = -15;
10089 + set_user_nice(current, -15);