1 diff -urN linux-2.4.22.org/arch/alpha/kernel/entry.S linux-2.4.22/arch/alpha/kernel/entry.S
2 --- linux-2.4.22.org/arch/alpha/kernel/entry.S 2003-11-24 18:29:46.000000000 +0100
3 +++ linux-2.4.22/arch/alpha/kernel/entry.S 2003-11-24 18:39:02.000000000 +0100
6 lda $26,ret_from_sys_call
14 diff -urN linux-2.4.22.org/arch/alpha/kernel/process.c linux-2.4.22/arch/alpha/kernel/process.c
15 --- linux-2.4.22.org/arch/alpha/kernel/process.c 2003-11-24 18:29:46.000000000 +0100
16 +++ linux-2.4.22/arch/alpha/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
20 /* An endless idle loop with no priority at all. */
22 - current->counter = -100;
25 /* FIXME -- EV6 and LCA45 know how to power down
27 diff -urN linux-2.4.22.org/arch/alpha/kernel/smp.c linux-2.4.22/arch/alpha/kernel/smp.c
28 --- linux-2.4.22.org/arch/alpha/kernel/smp.c 2003-11-24 18:29:46.000000000 +0100
29 +++ linux-2.4.22/arch/alpha/kernel/smp.c 2003-11-24 18:39:02.000000000 +0100
31 int smp_num_probed; /* Internal processor count */
32 int smp_num_cpus = 1; /* Number that came online. */
33 int smp_threads_ready; /* True once the per process idle is forked. */
34 +cycles_t cacheflush_time;
35 +unsigned long cache_decay_ticks;
37 int __cpu_number_map[NR_CPUS];
38 int __cpu_logical_map[NR_CPUS];
41 int cpuid = hard_smp_processor_id();
43 - if (current != init_tasks[cpu_number_map(cpuid)]) {
44 - printk("BUG: smp_calling: cpu %d current %p init_tasks[cpu_number_map(cpuid)] %p\n",
45 - cpuid, current, init_tasks[cpu_number_map(cpuid)]);
48 DBGS(("CALLIN %d state 0x%lx\n", cpuid, current->state));
50 /* Turn on machine checks. */
52 DBGS(("smp_callin: commencing CPU %d current %p\n",
55 - /* Setup the scheduler for this processor. */
58 /* ??? This should be in init_idle. */
59 atomic_inc(&init_mm.mm_count);
60 current->active_mm = &init_mm;
67 + * Rough estimation for SMP scheduling; this is the number of cycles it
68 + * takes for a fully memory-limited process to flush the SMP-local cache.
70 + * We are not told how much cache there is, so we have to guess.
73 +smp_tune_scheduling (int cpuid)
75 + struct percpu_struct *cpu;
76 + unsigned long on_chip_cache; /* kB */
77 + unsigned long freq; /* Hz */
78 + unsigned long bandwidth = 350; /* MB/s */
80 + cpu = (struct percpu_struct*)((char*)hwrpb + hwrpb->processor_offset
81 + + cpuid * hwrpb->processor_size);
85 + on_chip_cache = 16 + 16;
90 + on_chip_cache = 8 + 8 + 96;
94 + on_chip_cache = 16 + 8;
100 + on_chip_cache = 64 + 64;
104 + freq = hwrpb->cycle_freq ? : est_cycle_freq;
106 + cacheflush_time = (freq / 1000000) * (on_chip_cache << 10) / bandwidth;
107 + cache_decay_ticks = cacheflush_time / (freq / 1000) * HZ / 1000;
109 + printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
110 + cacheflush_time/(freq/1000000),
111 + (cacheflush_time*100/(freq/1000000)) % 100);
112 + printk("task migration cache decay timeout: %ld msecs.\n",
113 + (cache_decay_ticks + 1) * 1000 / HZ);
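+	/*
+	 * Worked example of the estimate above (hypothetical numbers,
+	 * not taken from a real machine): a 500 MHz CPU with 16+16 KB
+	 * of on-chip cache and the assumed 350 MB/s bandwidth gives
+	 *   cacheflush_time = 500 * (32 << 10) / 350 ~= 46811 cycles,
+	 * i.e. roughly 94 usecs, from which cache_decay_ticks follows
+	 * by the same integer arithmetic.
+	 */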
117 * Send a message to a secondary's console. "START" is one such
118 * interesting message. ;-)
119 @@ -449,14 +494,11 @@
120 if (idle == &init_task)
121 panic("idle process is init_task for CPU %d", cpuid);
123 - idle->processor = cpuid;
124 - idle->cpus_runnable = 1 << cpuid; /* we schedule the first task manually */
125 + init_idle(idle, cpuid);
126 + unhash_process(idle);
128 __cpu_logical_map[cpunum] = cpuid;
129 __cpu_number_map[cpuid] = cpunum;
131 - del_from_runqueue(idle);
132 - unhash_process(idle);
133 - init_tasks[cpunum] = idle;
135 DBGS(("smp_boot_one_cpu: CPU %d state 0x%lx flags 0x%lx\n",
136 cpuid, idle->state, idle->flags));
137 @@ -563,13 +605,11 @@
139 __cpu_number_map[boot_cpuid] = 0;
140 __cpu_logical_map[0] = boot_cpuid;
141 - current->processor = boot_cpuid;
143 smp_store_cpu_info(boot_cpuid);
144 + smp_tune_scheduling(boot_cpuid);
145 smp_setup_percpu_timer(boot_cpuid);
149 /* ??? This should be in init_idle. */
150 atomic_inc(&init_mm.mm_count);
151 current->active_mm = &init_mm;
152 diff -urN linux-2.4.22.org/arch/arm/kernel/process.c linux-2.4.22/arch/arm/kernel/process.c
153 --- linux-2.4.22.org/arch/arm/kernel/process.c 2003-11-24 18:30:05.000000000 +0100
154 +++ linux-2.4.22/arch/arm/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
157 /* endless idle loop with no priority at all */
159 - current->nice = 20;
160 - current->counter = -100;
163 void (*idle)(void) = pm_idle;
164 diff -urN linux-2.4.22.org/arch/i386/kernel/entry.S linux-2.4.22/arch/i386/kernel/entry.S
165 --- linux-2.4.22.org/arch/i386/kernel/entry.S 2003-11-24 18:29:45.000000000 +0100
166 +++ linux-2.4.22/arch/i386/kernel/entry.S 2003-11-24 18:39:02.000000000 +0100
182 call SYMBOL_NAME(schedule_tail)
186 testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
188 diff -urN linux-2.4.22.org/arch/i386/kernel/process.c linux-2.4.22/arch/i386/kernel/process.c
189 --- linux-2.4.22.org/arch/i386/kernel/process.c 2003-11-24 18:29:45.000000000 +0100
190 +++ linux-2.4.22/arch/i386/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
193 if (current_cpu_data.hlt_works_ok && !hlt_counter) {
195 - if (!current->need_resched)
196 + if (!need_resched())
203 /* endless idle loop with no priority at all */
205 - current->nice = 20;
206 - current->counter = -100;
209 void (*idle)(void) = pm_idle;
210 @@ -708,15 +705,17 @@
211 asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
214 - * Restore %fs and %gs.
215 + * Restore %fs and %gs if needed.
217 - loadsegment(fs, next->fs);
218 - loadsegment(gs, next->gs);
219 + if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
220 + loadsegment(fs, next->fs);
221 + loadsegment(gs, next->gs);
225 * Now maybe reload the debug registers
227 - if (next->debugreg[7]){
228 + if (unlikely(next->debugreg[7])) {
236 - if (prev->ioperm || next->ioperm) {
237 + if (unlikely(prev->ioperm || next->ioperm)) {
240 * 4 cachelines copy ... not good, but not that
241 diff -urN linux-2.4.22.org/arch/i386/kernel/setup.c linux-2.4.22/arch/i386/kernel/setup.c
242 --- linux-2.4.22.org/arch/i386/kernel/setup.c 2003-11-24 18:29:45.000000000 +0100
243 +++ linux-2.4.22/arch/i386/kernel/setup.c 2003-11-24 18:39:02.000000000 +0100
244 @@ -3190,9 +3190,10 @@
249 - * Clear all 6 debug registers:
251 + /* Clear %fs and %gs. */
252 + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
254 + /* Clear all 6 debug registers: */
256 #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
258 diff -urN linux-2.4.22.org/arch/i386/kernel/smpboot.c linux-2.4.22/arch/i386/kernel/smpboot.c
259 --- linux-2.4.22.org/arch/i386/kernel/smpboot.c 2003-11-24 18:29:45.000000000 +0100
260 +++ linux-2.4.22/arch/i386/kernel/smpboot.c 2003-11-24 18:39:02.000000000 +0100
261 @@ -308,14 +308,14 @@
262 if (tsc_values[i] < avg)
263 realdelta = -realdelta;
265 - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
267 + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
277 static void __init synchronize_tsc_ap (void)
279 * (This works even if the APIC is not enabled.)
281 phys_id = GET_APIC_ID(apic_read(APIC_ID));
282 - cpuid = current->processor;
284 if (test_and_set_bit(cpuid, &cpu_online_map)) {
285 printk("huh, phys CPU#%d, CPU#%d already present??\n",
289 smp_store_cpu_info(cpuid);
291 + disable_APIC_timer();
293 * Allow the master to continue.
297 while (!atomic_read(&smp_commenced))
299 + enable_APIC_timer();
301 * low-memory mappings have been cleared, flush them from
302 * the local TLBs too.
303 @@ -803,16 +805,13 @@
305 panic("No idle process for CPU %d", cpu);
307 - idle->processor = cpu;
308 - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
309 + init_idle(idle, cpu);
311 map_cpu_to_boot_apicid(cpu, apicid);
313 idle->thread.eip = (unsigned long) start_secondary;
315 - del_from_runqueue(idle);
316 unhash_process(idle);
317 - init_tasks[cpu] = idle;
319 /* start_eip had better be page-aligned! */
320 start_eip = setup_trampoline();
324 cycles_t cacheflush_time;
325 +unsigned long cache_decay_ticks;
327 static void smp_tune_scheduling (void)
330 cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
333 + cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
335 printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
336 (long)cacheflush_time/(cpu_khz/1000),
337 ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
338 + printk("task migration cache decay timeout: %ld msecs.\n",
339 + (cache_decay_ticks + 1) * 1000 / HZ);
343 @@ -1026,8 +1030,7 @@
344 map_cpu_to_boot_apicid(0, boot_cpu_apicid);
346 global_irq_holder = 0;
347 - current->processor = 0;
350 smp_tune_scheduling();
353 diff -urN linux-2.4.22.org/arch/i386/kernel/smp.c linux-2.4.22/arch/i386/kernel/smp.c
354 --- linux-2.4.22.org/arch/i386/kernel/smp.c 2003-11-24 18:29:45.000000000 +0100
355 +++ linux-2.4.22/arch/i386/kernel/smp.c 2003-11-24 18:39:02.000000000 +0100
356 @@ -496,13 +496,23 @@
357 * it goes straight through and wastes no time serializing
358 * anything. Worst case is that we lose a reschedule ...
361 void smp_send_reschedule(int cpu)
363 send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
367 + * this function sends a reschedule IPI to all (other) CPUs.
368 + * This should only be used if some 'global' task became runnable,
369 + * such as a RT task, that must be handled now. The first CPU
370 + * that manages to grab the task will run it.
372 +void smp_send_reschedule_all(void)
374 + send_IPI_allbutself(RESCHEDULE_VECTOR);
378 * Structure and data for smp_call_function(). This is designed to minimise
379 * static memory requirements. It also looks cleaner.
381 diff -urN linux-2.4.22.org/arch/mips64/kernel/process.c linux-2.4.22/arch/mips64/kernel/process.c
382 --- linux-2.4.22.org/arch/mips64/kernel/process.c 2003-11-24 18:30:12.000000000 +0100
383 +++ linux-2.4.22/arch/mips64/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
386 /* endless idle loop with no priority at all */
388 - current->nice = 20;
389 - current->counter = -100;
392 while (!current->need_resched)
394 diff -urN linux-2.4.22.org/arch/parisc/kernel/process.c linux-2.4.22/arch/parisc/kernel/process.c
395 --- linux-2.4.22.org/arch/parisc/kernel/process.c 2003-11-24 18:30:13.000000000 +0100
396 +++ linux-2.4.22/arch/parisc/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
399 /* endless idle loop with no priority at all */
401 - current->nice = 20;
402 - current->counter = -100;
405 while (!current->need_resched) {
406 diff -urN linux-2.4.22.org/arch/ppc/8260_io/uart.c linux-2.4.22/arch/ppc/8260_io/uart.c
407 --- linux-2.4.22.org/arch/ppc/8260_io/uart.c 2003-11-24 18:30:02.000000000 +0100
408 +++ linux-2.4.22/arch/ppc/8260_io/uart.c 2003-11-24 18:39:02.000000000 +0100
409 @@ -1801,7 +1801,6 @@
410 printk("lsr = %d (jiff=%lu)...", lsr, jiffies);
412 current->state = TASK_INTERRUPTIBLE;
413 -/* current->counter = 0; make us low-priority */
414 schedule_timeout(char_time);
415 if (signal_pending(current))
417 diff -urN linux-2.4.22.org/arch/ppc/8xx_io/uart.c linux-2.4.22/arch/ppc/8xx_io/uart.c
418 --- linux-2.4.22.org/arch/ppc/8xx_io/uart.c 2003-11-24 18:30:01.000000000 +0100
419 +++ linux-2.4.22/arch/ppc/8xx_io/uart.c 2003-11-24 18:39:02.000000000 +0100
420 @@ -1856,7 +1856,6 @@
421 printk("lsr = %d (jiff=%lu)...", lsr, jiffies);
423 current->state = TASK_INTERRUPTIBLE;
424 -/* current->counter = 0; make us low-priority */
425 schedule_timeout(char_time);
426 if (signal_pending(current))
428 diff -urN linux-2.4.22.org/arch/ppc/kernel/entry.S linux-2.4.22/arch/ppc/kernel/entry.S
429 --- linux-2.4.22.org/arch/ppc/kernel/entry.S 2003-11-24 18:29:55.000000000 +0100
430 +++ linux-2.4.22/arch/ppc/kernel/entry.S 2003-11-24 18:39:02.000000000 +0100
438 lwz r0,TASK_PTRACE(r2)
439 andi. r0,r0,PT_TRACESYS
441 diff -urN linux-2.4.22.org/arch/ppc/kernel/idle.c linux-2.4.22/arch/ppc/kernel/idle.c
442 --- linux-2.4.22.org/arch/ppc/kernel/idle.c 2003-11-24 18:29:55.000000000 +0100
443 +++ linux-2.4.22/arch/ppc/kernel/idle.c 2003-11-24 18:39:02.000000000 +0100
447 /* endless loop with no priority at all */
448 - current->nice = 20;
449 - current->counter = -100;
454 if (!do_power_save) {
455 diff -urN linux-2.4.22.org/arch/ppc/kernel/mk_defs.c linux-2.4.22/arch/ppc/kernel/mk_defs.c
456 --- linux-2.4.22.org/arch/ppc/kernel/mk_defs.c 2003-11-24 18:29:55.000000000 +0100
457 +++ linux-2.4.22/arch/ppc/kernel/mk_defs.c 2003-11-24 18:39:02.000000000 +0100
459 /*DEFINE(KERNELBASE, KERNELBASE);*/
460 DEFINE(STATE, offsetof(struct task_struct, state));
461 DEFINE(NEXT_TASK, offsetof(struct task_struct, next_task));
462 - DEFINE(COUNTER, offsetof(struct task_struct, counter));
463 - DEFINE(PROCESSOR, offsetof(struct task_struct, processor));
464 + DEFINE(COUNTER, offsetof(struct task_struct, time_slice));
465 + DEFINE(PROCESSOR, offsetof(struct task_struct, cpu));
466 DEFINE(SIGPENDING, offsetof(struct task_struct, sigpending));
467 DEFINE(THREAD, offsetof(struct task_struct, thread));
468 DEFINE(MM, offsetof(struct task_struct, mm));
469 diff -urN linux-2.4.22.org/arch/ppc/kernel/process.c linux-2.4.22/arch/ppc/kernel/process.c
470 --- linux-2.4.22.org/arch/ppc/kernel/process.c 2003-11-24 18:29:54.000000000 +0100
471 +++ linux-2.4.22/arch/ppc/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
476 - printk(" CPU: %d", current->processor);
477 + printk(" CPU: %d", current->cpu);
478 #endif /* CONFIG_SMP */
481 diff -urN linux-2.4.22.org/arch/ppc/kernel/smp.c linux-2.4.22/arch/ppc/kernel/smp.c
482 --- linux-2.4.22.org/arch/ppc/kernel/smp.c 2003-11-24 18:29:54.000000000 +0100
483 +++ linux-2.4.22/arch/ppc/kernel/smp.c 2003-11-24 18:39:02.000000000 +0100
485 unsigned long cpu_online_map;
486 int smp_hw_index[NR_CPUS];
487 static struct smp_ops_t *smp_ops;
488 +unsigned long cache_decay_ticks = HZ/100;
490 /* all cpu mappings are 1-1 -- Cort */
491 volatile unsigned long cpu_callin_map[NR_CPUS];
493 * cpu 0, the master -- Cort
495 cpu_callin_map[0] = 1;
496 - current->processor = 0;
501 for (i = 0; i < NR_CPUS; i++) {
504 p = init_task.prev_task;
506 panic("No idle task for CPU %d", i);
507 - del_from_runqueue(p);
513 - p->cpus_runnable = 1 << i; /* we schedule the first task manually */
519 void __init smp_callin(void)
521 - int cpu = current->processor;
522 + int cpu = current->cpu;
524 smp_store_cpu_info(cpu);
525 smp_ops->setup_cpu(cpu);
526 diff -urN linux-2.4.22.org/arch/ppc/lib/dec_and_lock.c linux-2.4.22/arch/ppc/lib/dec_and_lock.c
527 --- linux-2.4.22.org/arch/ppc/lib/dec_and_lock.c 2003-11-24 18:30:01.000000000 +0100
528 +++ linux-2.4.22/arch/ppc/lib/dec_and_lock.c 2003-11-24 18:39:02.000000000 +0100
530 #include <linux/module.h>
531 +#include <linux/sched.h>
532 #include <linux/spinlock.h>
533 #include <asm/atomic.h>
534 #include <asm/system.h>
535 diff -urN linux-2.4.22.org/arch/ppc/mm/init.c linux-2.4.22/arch/ppc/mm/init.c
536 --- linux-2.4.22.org/arch/ppc/mm/init.c 2003-11-24 18:29:54.000000000 +0100
537 +++ linux-2.4.22/arch/ppc/mm/init.c 2003-11-24 18:39:02.000000000 +0100
542 - printk("%3d ", p->processor);
543 - if ( (p->processor != NO_PROC_ID) &&
544 - (p == current_set[p->processor]) )
545 + printk("%3d ", p->cpu);
546 + if ( (p->cpu != NO_PROC_ID) &&
547 + (p == current_set[p->cpu]) )
551 diff -urN linux-2.4.22.org/arch/ppc64/kernel/entry.S linux-2.4.22/arch/ppc64/kernel/entry.S
552 --- linux-2.4.22.org/arch/ppc64/kernel/entry.S 2003-11-24 18:29:44.000000000 +0100
553 +++ linux-2.4.22/arch/ppc64/kernel/entry.S 2003-11-24 18:39:02.000000000 +0100
557 _GLOBAL(ret_from_fork)
561 ld r0,TASK_PTRACE(r13)
562 andi. r0,r0,PT_TRACESYS
563 beq+ .ret_from_except
564 diff -urN linux-2.4.22.org/arch/ppc64/kernel/idle.c linux-2.4.22/arch/ppc64/kernel/idle.c
565 --- linux-2.4.22.org/arch/ppc64/kernel/idle.c 2003-11-24 18:29:44.000000000 +0100
566 +++ linux-2.4.22/arch/ppc64/kernel/idle.c 2003-11-24 18:39:02.000000000 +0100
571 - /* endless loop with no priority at all */
572 - current->nice = 20;
573 - current->counter = -100;
574 #ifdef CONFIG_PPC_ISERIES
575 /* ensure iSeries run light will be out when idle */
576 current->thread.flags &= ~PPC_FLAG_RUN_LIGHT;
582 + /* endless loop with no priority at all */
586 diff -urN linux-2.4.22.org/arch/ppc64/kernel/process.c linux-2.4.22/arch/ppc64/kernel/process.c
587 --- linux-2.4.22.org/arch/ppc64/kernel/process.c 2003-11-24 18:29:43.000000000 +0100
588 +++ linux-2.4.22/arch/ppc64/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
590 #ifdef SHOW_TASK_SWITCHES
591 printk("%s/%d -> %s/%d NIP %08lx cpu %d root %x/%x\n",
592 prev->comm,prev->pid,
593 - new->comm,new->pid,new->thread.regs->nip,new->processor,
594 + new->comm,new->pid,new->thread.regs->nip,new->cpu,
595 new->fs->root,prev->fs->root);
598 diff -urN linux-2.4.22.org/arch/ppc64/kernel/smp.c linux-2.4.22/arch/ppc64/kernel/smp.c
599 --- linux-2.4.22.org/arch/ppc64/kernel/smp.c 2003-11-24 18:29:44.000000000 +0100
600 +++ linux-2.4.22/arch/ppc64/kernel/smp.c 2003-11-24 18:39:02.000000000 +0100
602 extern atomic_t ipi_sent;
603 spinlock_t kernel_flag __cacheline_aligned = SPIN_LOCK_UNLOCKED;
604 cycles_t cacheflush_time;
605 +unsigned long cache_decay_ticks = HZ/100;
606 static int max_cpus __initdata = NR_CPUS;
608 unsigned long cpu_online_map;
610 * cpu 0, the master -- Cort
612 cpu_callin_map[0] = 1;
613 - current->processor = 0;
618 for (i = 0; i < NR_CPUS; i++) {
619 paca[i].prof_counter = 1;
622 PPCDBG(PPCDBG_SMP,"\tProcessor %d, task = 0x%lx\n", i, p);
624 - del_from_runqueue(p);
630 - p->cpus_runnable = 1 << i; /* we schedule the first task manually */
631 current_set[i].task = p;
632 sp = ((unsigned long)p) + sizeof(union task_union)
633 - STACK_FRAME_OVERHEAD;
636 void __init smp_callin(void)
638 - int cpu = current->processor;
639 + int cpu = current->cpu;
641 smp_store_cpu_info(cpu);
642 set_dec(paca[cpu].default_decr);
645 ppc_md.smp_setup_cpu(cpu);
649 set_bit(smp_processor_id(), &cpu_online_map);
651 while(!smp_commenced) {
656 - cpu = current->processor;
657 + cpu = current->cpu;
658 atomic_inc(&init_mm.mm_count);
659 current->active_mm = &init_mm;
661 diff -urN linux-2.4.22.org/arch/s390/kernel/process.c linux-2.4.22/arch/s390/kernel/process.c
662 --- linux-2.4.22.org/arch/s390/kernel/process.c 2003-11-24 18:30:13.000000000 +0100
663 +++ linux-2.4.22/arch/s390/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
666 /* endless idle loop with no priority at all */
668 - current->nice = 20;
669 - current->counter = -100;
672 if (current->need_resched) {
674 diff -urN linux-2.4.22.org/arch/s390x/kernel/process.c linux-2.4.22/arch/s390x/kernel/process.c
675 --- linux-2.4.22.org/arch/s390x/kernel/process.c 2003-11-24 18:30:19.000000000 +0100
676 +++ linux-2.4.22/arch/s390x/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
679 /* endless idle loop with no priority at all */
681 - current->nice = 20;
682 - current->counter = -100;
685 if (current->need_resched) {
687 diff -urN linux-2.4.22.org/arch/sh/kernel/process.c linux-2.4.22/arch/sh/kernel/process.c
688 --- linux-2.4.22.org/arch/sh/kernel/process.c 2003-11-24 18:30:10.000000000 +0100
689 +++ linux-2.4.22/arch/sh/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
692 /* endless idle loop with no priority at all */
694 - current->nice = 20;
695 - current->counter = -100;
699 diff -urN linux-2.4.22.org/arch/sparc/kernel/entry.S linux-2.4.22/arch/sparc/kernel/entry.S
700 --- linux-2.4.22.org/arch/sparc/kernel/entry.S 2003-11-24 18:29:50.000000000 +0100
701 +++ linux-2.4.22/arch/sparc/kernel/entry.S 2003-11-24 18:39:02.000000000 +0100
702 @@ -1471,7 +1471,9 @@
704 .globl C_LABEL(ret_from_fork)
705 C_LABEL(ret_from_fork):
710 b C_LABEL(ret_sys_call)
711 ld [%sp + STACKFRAME_SZ + PT_I0], %o0
712 diff -urN linux-2.4.22.org/arch/sparc/kernel/process.c linux-2.4.22/arch/sparc/kernel/process.c
713 --- linux-2.4.22.org/arch/sparc/kernel/process.c 2003-11-24 18:29:50.000000000 +0100
714 +++ linux-2.4.22/arch/sparc/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
718 /* endless idle loop with no priority at all */
719 - current->nice = 20;
720 - current->counter = -100;
724 if (ARCH_SUN4C_SUN4) {
728 /* endless idle loop with no priority at all */
729 - current->nice = 20;
730 - current->counter = -100;
734 if(current->need_resched) {
735 diff -urN linux-2.4.22.org/arch/sparc/kernel/smp.c linux-2.4.22/arch/sparc/kernel/smp.c
736 --- linux-2.4.22.org/arch/sparc/kernel/smp.c 2003-11-24 18:29:50.000000000 +0100
737 +++ linux-2.4.22/arch/sparc/kernel/smp.c 2003-11-24 18:39:02.000000000 +0100
739 volatile int __cpu_number_map[NR_CPUS];
740 volatile int __cpu_logical_map[NR_CPUS];
741 cycles_t cacheflush_time = 0; /* XXX */
742 +unsigned long cache_decay_ticks = HZ/100; /* XXX */
744 /* The only guaranteed locking primitive available on all Sparc
745 * processors is 'ldstub [%reg + immediate], %dest_reg' which atomically
746 diff -urN linux-2.4.22.org/arch/sparc/kernel/sun4d_smp.c linux-2.4.22/arch/sparc/kernel/sun4d_smp.c
747 --- linux-2.4.22.org/arch/sparc/kernel/sun4d_smp.c 2003-11-24 18:29:50.000000000 +0100
748 +++ linux-2.4.22/arch/sparc/kernel/sun4d_smp.c 2003-11-24 18:39:02.000000000 +0100
750 * the SMP initialization the master will be just allowed
751 * to call the scheduler code.
755 /* Get our local ticker going. */
756 smp_setup_percpu_timer();
758 while((unsigned long)current_set[cpuid] < PAGE_OFFSET)
761 - while(current_set[cpuid]->processor != cpuid)
762 + while(current_set[cpuid]->cpu != cpuid)
765 /* Fix idle thread fields. */
768 __cpu_number_map[boot_cpu_id] = 0;
769 __cpu_logical_map[0] = boot_cpu_id;
770 - current->processor = boot_cpu_id;
771 smp_store_cpu_info(boot_cpu_id);
772 smp_setup_percpu_timer();
774 local_flush_cache_all();
775 if(linux_num_cpus == 1)
776 return; /* Not an MP box. */
777 @@ -222,14 +219,10 @@
780 p = init_task.prev_task;
784 - p->cpus_runnable = 1 << i; /* we schedule the first task manually */
788 - del_from_runqueue(p);
792 for (no = 0; no < linux_num_cpus; no++)
793 diff -urN linux-2.4.22.org/arch/sparc/kernel/sun4m_smp.c linux-2.4.22/arch/sparc/kernel/sun4m_smp.c
794 --- linux-2.4.22.org/arch/sparc/kernel/sun4m_smp.c 2003-11-24 18:29:50.000000000 +0100
795 +++ linux-2.4.22/arch/sparc/kernel/sun4m_smp.c 2003-11-24 18:39:02.000000000 +0100
797 * the SMP initialization the master will be just allowed
798 * to call the scheduler code.
802 /* Allow master to continue. */
803 swap((unsigned long *)&cpu_callin_map[cpuid], 1);
804 @@ -170,12 +169,10 @@
805 mid_xlate[boot_cpu_id] = (linux_cpus[boot_cpu_id].mid & ~8);
806 __cpu_number_map[boot_cpu_id] = 0;
807 __cpu_logical_map[0] = boot_cpu_id;
808 - current->processor = boot_cpu_id;
810 smp_store_cpu_info(boot_cpu_id);
811 set_irq_udt(mid_xlate[boot_cpu_id]);
812 smp_setup_percpu_timer();
814 local_flush_cache_all();
815 if(linux_num_cpus == 1)
816 return; /* Not an MP box. */
817 @@ -195,14 +192,10 @@
820 p = init_task.prev_task;
824 - p->cpus_runnable = 1 << i; /* we schedule the first task manually */
828 - del_from_runqueue(p);
832 /* See trampoline.S for details... */
833 diff -urN linux-2.4.22.org/arch/sparc64/kernel/entry.S linux-2.4.22/arch/sparc64/kernel/entry.S
834 --- linux-2.4.22.org/arch/sparc64/kernel/entry.S 2003-11-24 18:30:04.000000000 +0100
835 +++ linux-2.4.22/arch/sparc64/kernel/entry.S 2003-11-24 18:39:02.000000000 +0100
836 @@ -1627,7 +1627,9 @@
838 andn %o7, SPARC_FLAG_NEWCHILD, %l0
839 mov %g5, %o0 /* 'prev' */
843 stb %l0, [%g6 + AOFF_task_thread + AOFF_thread_flags]
844 andcc %l0, SPARC_FLAG_PERFCTR, %g0
846 diff -urN linux-2.4.22.org/arch/sparc64/kernel/irq.c linux-2.4.22/arch/sparc64/kernel/irq.c
847 --- linux-2.4.22.org/arch/sparc64/kernel/irq.c 2003-11-24 18:30:04.000000000 +0100
848 +++ linux-2.4.22/arch/sparc64/kernel/irq.c 2003-11-24 18:39:02.000000000 +0100
850 tid = ((tid & UPA_CONFIG_MID) << 9);
853 - tid = (starfire_translate(imap, current->processor) << 26);
854 + tid = (starfire_translate(imap, current->cpu) << 26);
858 diff -urN linux-2.4.22.org/arch/sparc64/kernel/process.c linux-2.4.22/arch/sparc64/kernel/process.c
859 --- linux-2.4.22.org/arch/sparc64/kernel/process.c 2003-11-24 18:30:04.000000000 +0100
860 +++ linux-2.4.22/arch/sparc64/kernel/process.c 2003-11-24 18:39:02.000000000 +0100
864 /* endless idle loop with no priority at all */
865 - current->nice = 20;
866 - current->counter = -100;
870 /* If current->need_resched is zero we should really
873 * the idle loop on a UltraMultiPenguin...
875 -#define idle_me_harder() (cpu_data[current->processor].idle_volume += 1)
876 -#define unidle_me() (cpu_data[current->processor].idle_volume = 0)
877 +#define idle_me_harder() (cpu_data[current->cpu].idle_volume += 1)
878 +#define unidle_me() (cpu_data[current->cpu].idle_volume = 0)
881 - current->nice = 20;
882 - current->counter = -100;
886 if (current->need_resched != 0) {
888 diff -urN linux-2.4.22.org/arch/sparc64/kernel/rtrap.S linux-2.4.22/arch/sparc64/kernel/rtrap.S
889 --- linux-2.4.22.org/arch/sparc64/kernel/rtrap.S 2003-11-24 18:30:04.000000000 +0100
890 +++ linux-2.4.22/arch/sparc64/kernel/rtrap.S 2003-11-24 18:39:02.000000000 +0100
893 .globl rtrap_clr_l6, rtrap, irqsz_patchme, rtrap_xcall
894 rtrap_clr_l6: clr %l6
895 -rtrap: lduw [%g6 + AOFF_task_processor], %l0
896 +rtrap: lduw [%g6 + AOFF_task_cpu], %l0
897 sethi %hi(irq_stat), %l2 ! &softirq_active
898 or %l2, %lo(irq_stat), %l2 ! &softirq_active
899 irqsz_patchme: sllx %l0, 0, %l0
900 diff -urN linux-2.4.22.org/arch/sparc64/kernel/smp.c linux-2.4.22/arch/sparc64/kernel/smp.c
901 --- linux-2.4.22.org/arch/sparc64/kernel/smp.c 2003-11-24 18:30:04.000000000 +0100
902 +++ linux-2.4.22/arch/sparc64/kernel/smp.c 2003-11-24 18:39:02.000000000 +0100
904 printk("Entering UltraSMPenguin Mode...\n");
906 smp_store_cpu_info(boot_cpu_id);
908 + smp_tune_scheduling();
910 if (linux_num_cpus == 1)
915 p = init_task.prev_task;
916 - init_tasks[cpucount] = p;
919 - p->cpus_runnable = 1UL << i; /* we schedule the first task manually */
921 - del_from_runqueue(p);
926 @@ -1214,10 +1210,96 @@
927 __cpu_number_map[boot_cpu_id] = 0;
928 prom_cpu_nodes[boot_cpu_id] = linux_cpus[0].prom_node;
929 __cpu_logical_map[0] = boot_cpu_id;
930 - current->processor = boot_cpu_id;
931 prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
934 +cycles_t cacheflush_time;
935 +unsigned long cache_decay_ticks;
937 +extern unsigned long cheetah_tune_scheduling(void);
939 +static void __init smp_tune_scheduling(void)
941 + unsigned long orig_flush_base, flush_base, flags, *p;
942 + unsigned int ecache_size, order;
943 + cycles_t tick1, tick2, raw;
945 + /* Approximate heuristic for SMP scheduling. It is an
946 + * estimation of the time it takes to flush the L2 cache
947 + * on the local processor.
949 + * The ia32 chooses to use the L1 cache flush time instead,
950 + * and I consider this complete nonsense. The Ultra can service
951 + * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
952 + * L2 misses are what create extra bus traffic (ie. the "cost"
953 + * of moving a process from one cpu to another).
955 + printk("SMP: Calibrating ecache flush... ");
956 + if (tlb_type == cheetah || tlb_type == cheetah_plus) {
957 + cacheflush_time = cheetah_tune_scheduling();
961 + ecache_size = prom_getintdefault(linux_cpus[0].prom_node,
962 + "ecache-size", (512 * 1024));
963 + if (ecache_size > (4 * 1024 * 1024))
964 + ecache_size = (4 * 1024 * 1024);
965 + orig_flush_base = flush_base =
966 + __get_free_pages(GFP_KERNEL, order = get_order(ecache_size));
968 + if (flush_base != 0UL) {
969 + local_irq_save(flags);
971 + /* Scan twice the size once just to get the TLB entries
972 + * loaded and make sure the second scan measures pure misses.
974 + for (p = (unsigned long *)flush_base;
975 + ((unsigned long)p) < (flush_base + (ecache_size<<1));
976 + p += (64 / sizeof(unsigned long)))
977 + *((volatile unsigned long *)p);
979 + tick1 = tick_ops->get_tick();
981 + __asm__ __volatile__("1:\n\t"
982 + "ldx [%0 + 0x000], %%g1\n\t"
983 + "ldx [%0 + 0x040], %%g2\n\t"
984 + "ldx [%0 + 0x080], %%g3\n\t"
985 + "ldx [%0 + 0x0c0], %%g5\n\t"
986 + "add %0, 0x100, %0\n\t"
988 + "bne,pt %%xcc, 1b\n\t"
990 + : "=&r" (flush_base)
991 + : "0" (flush_base),
992 + "r" (flush_base + ecache_size)
993 + : "g1", "g2", "g3", "g5");
995 + tick2 = tick_ops->get_tick();
997 + local_irq_restore(flags);
999 + raw = (tick2 - tick1);
1001 + /* Dampen it a little, considering two processes
1002 + * sharing the cache and fitting.
1004 + cacheflush_time = (raw - (raw >> 2));
1006 + free_pages(orig_flush_base, order);
1008 + cacheflush_time = ((ecache_size << 2) +
1009 + (ecache_size << 1));
1012 + /* Convert ticks/sticks to jiffies. */
1013 + cache_decay_ticks = cacheflush_time / timer_tick_offset;
1014 + if (cache_decay_ticks < 1)
1015 + cache_decay_ticks = 1;
1017 + printk("Using heuristic of %ld cycles, %ld ticks.\n",
1018 + cacheflush_time, cache_decay_ticks);
1021 static inline unsigned long find_flush_base(unsigned long size)
1023 struct page *p = mem_map;
1024 diff -urN linux-2.4.22.org/arch/sparc64/kernel/trampoline.S linux-2.4.22/arch/sparc64/kernel/trampoline.S
1025 --- linux-2.4.22.org/arch/sparc64/kernel/trampoline.S 2003-11-24 18:30:04.000000000 +0100
1026 +++ linux-2.4.22/arch/sparc64/kernel/trampoline.S 2003-11-24 18:39:02.000000000 +0100
1028 wrpr %o1, PSTATE_IG, %pstate
1030 /* Get our UPA MID. */
1031 - lduw [%o2 + AOFF_task_processor], %g1
1032 + lduw [%o2 + AOFF_task_cpu], %g1
1033 sethi %hi(cpu_data), %g5
1034 or %g5, %lo(cpu_data), %g5
1036 diff -urN linux-2.4.22.org/arch/sparc64/kernel/traps.c linux-2.4.22/arch/sparc64/kernel/traps.c
1037 --- linux-2.4.22.org/arch/sparc64/kernel/traps.c 2003-11-24 18:30:04.000000000 +0100
1038 +++ linux-2.4.22/arch/sparc64/kernel/traps.c 2003-11-24 18:39:02.000000000 +0100
1040 #include <linux/smp.h>
1041 #include <linux/smp_lock.h>
1042 #include <linux/mm.h>
1043 +#include <linux/init.h>
1045 #include <asm/delay.h>
1046 #include <asm/system.h>
1047 @@ -755,6 +756,48 @@
1048 "i" (ASI_PHYS_USE_EC));
1052 +unsigned long __init cheetah_tune_scheduling(void)
1054 + unsigned long tick1, tick2, raw;
1055 + unsigned long flush_base = ecache_flush_physbase;
1056 + unsigned long flush_linesize = ecache_flush_linesize;
1057 + unsigned long flush_size = ecache_flush_size;
1059 +	/* Run through the whole cache to guarantee the timed loop
1060 + * is really displacing cache lines.
1062 + __asm__ __volatile__("1: subcc %0, %4, %0\n\t"
1063 + " bne,pt %%xcc, 1b\n\t"
1064 + " ldxa [%2 + %0] %3, %%g0\n\t"
1065 + : "=&r" (flush_size)
1066 + : "0" (flush_size), "r" (flush_base),
1067 + "i" (ASI_PHYS_USE_EC), "r" (flush_linesize));
1069 + /* The flush area is 2 X Ecache-size, so cut this in half for
1072 + flush_base = ecache_flush_physbase;
1073 + flush_linesize = ecache_flush_linesize;
1074 + flush_size = ecache_flush_size >> 1;
1076 + __asm__ __volatile__("rd %%tick, %0" : "=r" (tick1));
1078 + __asm__ __volatile__("1: subcc %0, %4, %0\n\t"
1079 + " bne,pt %%xcc, 1b\n\t"
1080 + " ldxa [%2 + %0] %3, %%g0\n\t"
1081 + : "=&r" (flush_size)
1082 + : "0" (flush_size), "r" (flush_base),
1083 + "i" (ASI_PHYS_USE_EC), "r" (flush_linesize));
1085 + __asm__ __volatile__("rd %%tick, %0" : "=r" (tick2));
1087 + raw = (tick2 - tick1);
1089 + return (raw - (raw >> 2));
1093 /* Unfortunately, the diagnostic access to the I-cache tags we need to
1094 * use to clear the thing interferes with I-cache coherency transactions.
1096 diff -urN linux-2.4.22.org/Documentation/sched-coding.txt linux-2.4.22/Documentation/sched-coding.txt
1097 --- linux-2.4.22.org/Documentation/sched-coding.txt 1970-01-01 01:00:00.000000000 +0100
1098 +++ linux-2.4.22/Documentation/sched-coding.txt 2003-11-24 18:39:02.000000000 +0100
1100 + Reference for various scheduler-related methods in the O(1) scheduler
1101 + Robert Love <rml@tech9.net>, MontaVista Software
1104 +Note most of these methods are local to kernel/sched.c - this is by design.
1105 +The scheduler is meant to be self-contained and abstracted away. This document
1106 +is primarily for understanding the scheduler, not interfacing to it. Some of
1107 +the discussed interfaces, however, are general process/scheduling methods.
1108 +They are typically defined in include/linux/sched.h.
1111 +Main Scheduling Methods
1112 +-----------------------
1114 +void load_balance(runqueue_t *this_rq, int idle)
1115 + Attempts to pull tasks from one cpu to another to balance cpu usage,
1116 + if needed. This method is called explicitly if the runqueues are
1117 + inbalanced or periodically by the timer tick. Prior to calling,
1118 + the current runqueue must be locked and interrupts disabled.
1121 + The main scheduling function. Upon return, the highest priority
1122 + process will be active.
1128 +Each runqueue has its own lock, rq->lock. When multiple runqueues need
1129 +to be locked, lock acquires must be ordered by ascending &runqueue value.
1131 +A specific runqueue is locked via
1133 + task_rq_lock(task_t pid, unsigned long *flags)
1135 +which disables preemption, disables interrupts, and locks the runqueue pid is
1136 +running on. Likewise,
1138 + task_rq_unlock(task_t pid, unsigned long *flags)
1140 +unlocks the runqueue pid is running on, restores interrupts to their previous
1141 +state, and reenables preemption.
1145 + double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1149 +	double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1151 +safely lock and unlock, respectively, the two specified runqueues. They do
1152 +not, however, disable and restore interrupts. Users are required to do so
1153 +manually before and after calls.
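+
+A minimal usage sketch (assuming, as in kernel/sched.c, that
+task_rq_lock() returns the locked runqueue; p is a task_t *):
+
+	unsigned long flags;
+	runqueue_t *rq;
+
+	rq = task_rq_lock(p, &flags);
+	/* p's runqueue is locked, interrupts are off, preemption is off */
+	task_rq_unlock(rq, &flags);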
1160 + The maximum priority of the system, stored in the task as task->prio.
1161 +	Lower values mean higher priority.  Normal (non-RT) priorities range from
1162 + MAX_RT_PRIO to (MAX_PRIO - 1).
1164 + The maximum real-time priority of the system. Valid RT priorities
1165 + range from 0 to (MAX_RT_PRIO - 1).
1167 + The maximum real-time priority that is exported to user-space. Should
1168 + always be equal to or less than MAX_RT_PRIO. Setting it less allows
1169 + kernel threads to have higher priorities than any user-space task.
1172 + Respectively, the minimum and maximum timeslices (quanta) of a process.
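+
+For illustration only (the real values live in the scheduler headers,
+not in this document): with 100 real-time levels and 40 nice levels,
+
+	MAX_RT_PRIO = 100	/* prio 0..99: real-time */
+	MAX_PRIO    = 140	/* prio 100..139: nice -20..+19 */
+
+which is consistent with the 140-bit priority bitmap searched by
+sched_find_first_bit() elsewhere in this patch.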
1178 + The main per-CPU runqueue data structure.
1180 + The main per-process data structure.
1187 + Returns the runqueue of the specified cpu.
1189 + Returns the runqueue of the current cpu.
1191 + Returns the runqueue which holds the specified pid.
1193 + Returns the task currently running on the given cpu.
1195 + Returns true if pid is real-time, false if not.
1198 +Process Control Methods
1199 +-----------------------
1201 +void set_user_nice(task_t *p, long nice)
1202 + Sets the "nice" value of task p to the given value.
1203 +int setscheduler(pid_t pid, int policy, struct sched_param *param)
1204 + Sets the scheduling policy and parameters for the given pid.
1205 +void set_cpus_allowed(task_t *p, unsigned long new_mask)
1206 + Sets a given task's CPU affinity and migrates it to a proper cpu.
1207 +	Callers must have a valid reference to the task and ensure the
1208 +	task does not exit prematurely.  No locks can be held during the call.
1209 +set_task_state(tsk, state_value)
1210 + Sets the given task's state to the given value.
1211 +set_current_state(state_value)
1212 + Sets the current task's state to the given value.
1213 +void set_tsk_need_resched(struct task_struct *tsk)
1214 + Sets need_resched in the given task.
1215 +void clear_tsk_need_resched(struct task_struct *tsk)
1216 + Clears need_resched in the given task.
1217 +void set_need_resched()
1218 + Sets need_resched in the current task.
1219 +void clear_need_resched()
1220 + Clears need_resched in the current task.
1222 + Returns true if need_resched is set in the current task, false
1225 + Place the current process at the end of the runqueue and call schedule.
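+
+A short usage sketch of the methods above (p is assumed to be a valid,
+properly referenced task_t *; the values are illustrative only):
+
+	set_user_nice(p, 10);			/* de-prioritize p */
+	set_cpus_allowed(p, 1UL << 2);		/* bind and migrate p to cpu 2 */
+	set_tsk_need_resched(p);		/* ask for a reschedule */
+
+	set_current_state(TASK_INTERRUPTIBLE);	/* and put ourselves to sleep */
+	schedule();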
1226 diff -urN linux-2.4.22.org/Documentation/sched-design.txt linux-2.4.22/Documentation/sched-design.txt
1227 --- linux-2.4.22.org/Documentation/sched-design.txt 1970-01-01 01:00:00.000000000 +0100
1228 +++ linux-2.4.22/Documentation/sched-design.txt 2003-11-24 18:39:02.000000000 +0100
1230 + Goals, Design and Implementation of the
1231 + new ultra-scalable O(1) scheduler
1234 + This is an edited version of an email Ingo Molnar sent to
1235 + lkml on 4 Jan 2002. It describes the goals, design, and
1236 + implementation of Ingo's new ultra-scalable O(1) scheduler.
1237 + Last Updated: 18 April 2002.
1243 +The main goal of the new scheduler is to keep all the good things we know
1244 +and love about the current Linux scheduler:
1246 + - good interactive performance even during high load: if the user
1247 + types or clicks then the system must react instantly and must execute
1248 + the user tasks smoothly, even during considerable background load.
1250 + - good scheduling/wakeup performance with 1-2 runnable processes.
1252 + - fairness: no process should stay without any timeslice for any
1253 + unreasonable amount of time. No process should get an unjustly high
1254 + amount of CPU time.
1256 + - priorities: less important tasks can be started with lower priority,
1257 + more important tasks with higher priority.
1259 + - SMP efficiency: no CPU should stay idle if there is work to do.
1261 + - SMP affinity: processes which run on one CPU should stay affine to
1262 + that CPU. Processes should not bounce between CPUs too frequently.
1264 + - plus additional scheduler features: RT scheduling, CPU binding.
1266 +and the goal is also to add a few new things:
1268 + - fully O(1) scheduling. Are you tired of the recalculation loop
1269 + blowing the L1 cache away every now and then? Do you think the goodness
1270 + loop is taking a bit too long to finish if there are lots of runnable
1271 + processes? This new scheduler takes no prisoners: wakeup(), schedule(),
1272 + the timer interrupt are all O(1) algorithms. There is no recalculation
1273 + loop. There is no goodness loop either.
1275 + - 'perfect' SMP scalability. With the new scheduler there is no 'big'
1276 + runqueue_lock anymore - it's all per-CPU runqueues and locks - two
1277 + tasks on two separate CPUs can wake up, schedule and context-switch
1278 + completely in parallel, without any interlocking. All
1279 + scheduling-relevant data is structured for maximum scalability.
1281 + - better SMP affinity. The old scheduler has a particular weakness that
1282 +  causes the random bouncing of tasks between CPUs if/when higher
1283 +  priority/interactive tasks are present; this was observed and reported
1284 +  by many people. The reason is that the timeslice recalculation loop first needs
1285 + every currently running task to consume its timeslice. But when this
1286 +  happens on e.g. an 8-way system, then this property starves an
1287 + increasing number of CPUs from executing any process. Once the last
1288 + task that has a timeslice left has finished using up that timeslice,
1289 + the recalculation loop is triggered and other CPUs can start executing
1290 + tasks again - after having idled around for a number of timer ticks.
1291 + The more CPUs, the worse this effect.
1293 + Furthermore, this same effect causes the bouncing effect as well:
1294 + whenever there is such a 'timeslice squeeze' of the global runqueue,
1295 + idle processors start executing tasks which are not affine to that CPU.
1296 + (because the affine tasks have finished off their timeslices already.)
1298 + The new scheduler solves this problem by distributing timeslices on a
1299 + per-CPU basis, without having any global synchronization or
1302 + - batch scheduling. A significant proportion of computing-intensive tasks
1303 + benefit from batch-scheduling, where timeslices are long and processes
1304 + are roundrobin scheduled. The new scheduler does such batch-scheduling
1305 + of the lowest priority tasks - so nice +19 jobs will get
1306 + 'batch-scheduled' automatically. With this scheduler, nice +19 jobs are
1307 + in essence SCHED_IDLE, from an interactiveness point of view.
1309 + - handle extreme loads more smoothly, without breakdown and scheduling
1312 + - O(1) RT scheduling. For those RT folks who are paranoid about the
1313 + O(nr_running) property of the goodness loop and the recalculation loop.
1315 +  - run fork()ed children before the parent. Andrea pointed out the
1316 + advantages of this a few months ago, but patches for this feature
1317 + do not work with the old scheduler as well as they should,
1318 + because idle processes often steal the new child before the fork()ing
1319 + CPU gets to execute it.
1325 +the core of the new scheduler is built on the following mechanisms:
1327 + - *two*, priority-ordered 'priority arrays' per CPU. There is an 'active'
1328 + array and an 'expired' array. The active array contains all tasks that
1329 + are affine to this CPU and have timeslices left. The expired array
1330 + contains all tasks which have used up their timeslices - but this array
1331 +    is kept sorted as well. The active and expired arrays are not accessed
1332 +    directly; they are accessed through two pointers in the per-CPU runqueue
1333 + structure. If all active tasks are used up then we 'switch' the two
1334 + pointers and from now on the ready-to-go (former-) expired array is the
1335 + active array - and the empty active array serves as the new collector
1336 + for expired tasks.
1338 + - there is a 64-bit bitmap cache for array indices. Finding the highest
1339 + priority task is thus a matter of two x86 BSFL bit-search instructions.
1341 +the split-array solution enables us to have an arbitrary number of active
1342 +and expired tasks, and the recalculation of timeslices can be done
1343 +immediately when the timeslice expires. Because the arrays are always
1344 +accessed through the pointers in the runqueue, switching the two arrays can
1345 +be done very quickly.
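+
+as a sketch (field names are assumed for illustration, not quoted from
+the patch itself), the array switch is nothing but a pointer exchange:
+
+	struct prio_array *array = rq->active;
+
+	if (!array->nr_active) {
+		/* every active task has expired: swap active and expired */
+		rq->active = rq->expired;
+		rq->expired = array;
+	}
+	idx = sched_find_first_bit(rq->active->bitmap);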
1347 +this is a hybrid priority-list approach coupled with roundrobin
1348 +scheduling and the array-switch method of distributing timeslices.
1350 + - there is a per-task 'load estimator'.
1352 +one of the toughest things to get right is good interactive feel during
1353 +heavy system load. While playing with various scheduler variants i found
1354 +that the best interactive feel is achieved not by 'boosting' interactive
1355 +tasks, but by 'punishing' tasks that want to use more CPU time than there
1356 +is available. This method is also much easier to do in an O(1) fashion.
1358 +to establish the actual 'load' the task contributes to the system, a
1359 +complex-looking but pretty accurate method is used: there is a 4-entry
1360 +'history' ringbuffer of the task's activities during the last 4 seconds.
1361 +This ringbuffer is operated without much overhead. The entries tell the
1362 +scheduler a pretty accurate load-history of the task: has it used up more
1363 +CPU time or less during the past N seconds. [the size '4' and the interval
1364 +of 4x 1 seconds was found by lots of experimentation - this part is
1365 +flexible and can be changed in both directions.]
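+
+a minimal sketch of such a history ringbuffer (names and layout are
+hypothetical; the estimator in the patch itself is more involved):
+
+	#define HISTORY_SLOTS	4	/* 4 entries, one per second */
+
+	struct load_history {
+		unsigned long cpu_time[HISTORY_SLOTS];
+		int idx;
+	};
+
+	/* called once a second: advance to (and clear) the next slot */
+	static void history_tick(struct load_history *h)
+	{
+		h->idx = (h->idx + 1) % HISTORY_SLOTS;
+		h->cpu_time[h->idx] = 0;
+	}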
1367 +the penalty a task gets for generating more load than the CPU can handle
1368 +is a priority decrease - there is a maximum amount to this penalty
1369 +relative to their static priority, so even fully CPU-bound tasks will
1370 +observe each other's priorities, and will share the CPU accordingly.
1372 +the SMP load-balancer can be extended/switched with additional parallel
1373 +computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs
1374 +can be supported easily by changing the load-balancer. Right now it's
1375 +tuned for my SMP systems.
1377 +i skipped the prev->mm == next->mm advantage - no workload i know of shows
1378 +any sensitivity to this. It can be added back by sacrificing O(1)
1379 +schedule() [the current and one-lower priority list can be searched for a
1380 +that->mm == current->mm condition], but costs a fair number of cycles
1381 +during a number of important workloads, so i wanted to avoid this as much
1384 +- the SMP idle-task startup code was still racy and the new scheduler
1385 +triggered this. So i streamlined the idle-setup code a bit. We do not call
1386 +into schedule() before all processors have started up fully and all idle
1387 +threads are in place.
1389 +- the patch also cleans up a number of aspects of sched.c - moves code
1390 +into other areas of the kernel where it's appropriate, and simplifies
1391 +certain code paths and data constructs. As a result, the new scheduler's
1392 +code is smaller than the old one.
1395 diff -urN linux-2.4.22.org/drivers/char/drm-4.0/tdfx_drv.c linux-2.4.22/drivers/char/drm-4.0/tdfx_drv.c
1396 --- linux-2.4.22.org/drivers/char/drm-4.0/tdfx_drv.c 2003-11-24 18:29:04.000000000 +0100
1397 +++ linux-2.4.22/drivers/char/drm-4.0/tdfx_drv.c 2003-11-24 18:39:02.000000000 +0100
1399 lock.context, current->pid, j,
1400 dev->lock.lock_time, jiffies);
1401 current->state = TASK_INTERRUPTIBLE;
1402 - current->policy |= SCHED_YIELD;
1403 schedule_timeout(DRM_LOCK_SLICE-j);
1404 DRM_DEBUG("jiffies=%d\n", jiffies);
1406 diff -urN linux-2.4.22.org/drivers/char/mwave/mwavedd.c linux-2.4.22/drivers/char/mwave/mwavedd.c
1407 --- linux-2.4.22.org/drivers/char/mwave/mwavedd.c 2003-11-24 18:29:03.000000000 +0100
1408 +++ linux-2.4.22/drivers/char/mwave/mwavedd.c 2003-11-24 18:39:02.000000000 +0100
1410 pDrvData->IPCs[ipcnum].bIsHere = FALSE;
1411 pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
1412 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
1413 - current->nice = -20; /* boost to provide priority timing */
1415 current->priority = 0x28; /* boost to provide priority timing */
1417 diff -urN linux-2.4.22.org/drivers/char/serial_txx927.c linux-2.4.22/drivers/char/serial_txx927.c
1418 --- linux-2.4.22.org/drivers/char/serial_txx927.c 2003-11-24 18:29:01.000000000 +0100
1419 +++ linux-2.4.22/drivers/char/serial_txx927.c 2003-11-24 18:39:02.000000000 +0100
1420 @@ -1533,7 +1533,6 @@
1421 printk("cisr = %d (jiff=%lu)...", cisr, jiffies);
1423 current->state = TASK_INTERRUPTIBLE;
1424 - current->counter = 0; /* make us low-priority */
1425 schedule_timeout(char_time);
1426 if (signal_pending(current))
1428 diff -urN linux-2.4.22.org/drivers/md/md.c linux-2.4.22/drivers/md/md.c
1429 --- linux-2.4.22.org/drivers/md/md.c 2003-11-24 18:29:41.000000000 +0100
1430 +++ linux-2.4.22/drivers/md/md.c 2003-11-24 18:39:02.000000000 +0100
1431 @@ -2939,8 +2939,6 @@
1432 * bdflush, otherwise bdflush will deadlock if there are too
1433 * many dirty RAID5 blocks.
1435 - current->policy = SCHED_OTHER;
1436 - current->nice = -20;
1439 complete(thread->event);
1440 @@ -3464,11 +3462,6 @@
1441 "(but not more than %d KB/sec) for reconstruction.\n",
1442 sysctl_speed_limit_max);
1445 - * Resync has low priority.
1447 - current->nice = 19;
1449 is_mddev_idle(mddev); /* this also initializes IO event counters */
1450 for (m = 0; m < SYNC_MARKS; m++) {
1452 @@ -3546,16 +3539,13 @@
1453 currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
1455 if (currspeed > sysctl_speed_limit_min) {
1456 - current->nice = 19;
1458 if ((currspeed > sysctl_speed_limit_max) ||
1459 !is_mddev_idle(mddev)) {
1460 current->state = TASK_INTERRUPTIBLE;
1461 md_schedule_timeout(HZ/4);
1465 - current->nice = -20;
1468 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
1470 diff -urN linux-2.4.22.org/fs/binfmt_elf.c linux-2.4.22/fs/binfmt_elf.c
1471 --- linux-2.4.22.org/fs/binfmt_elf.c 2003-11-24 18:28:10.000000000 +0100
1472 +++ linux-2.4.22/fs/binfmt_elf.c 2003-11-24 18:39:02.000000000 +0100
1473 @@ -1173,7 +1173,7 @@
1474 psinfo.pr_state = i;
1475 psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
1476 psinfo.pr_zomb = psinfo.pr_sname == 'Z';
1477 - psinfo.pr_nice = current->nice;
1478 + psinfo.pr_nice = task_nice(current);
1479 psinfo.pr_flag = current->flags;
1480 psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
1481 psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
1482 diff -urN linux-2.4.22.org/fs/jffs2/background.c linux-2.4.22/fs/jffs2/background.c
1483 --- linux-2.4.22.org/fs/jffs2/background.c 2003-11-24 18:28:15.000000000 +0100
1484 +++ linux-2.4.22/fs/jffs2/background.c 2003-11-24 18:39:02.000000000 +0100
1487 sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
1489 - /* FIXME in the 2.2 backport */
1490 - current->nice = 10;
1493 spin_lock_irq(¤t->sigmask_lock);
1494 siginitsetinv (¤t->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
1495 diff -urN linux-2.4.22.org/fs/proc/array.c linux-2.4.22/fs/proc/array.c
1496 --- linux-2.4.22.org/fs/proc/array.c 2003-11-24 18:28:11.000000000 +0100
1497 +++ linux-2.4.22/fs/proc/array.c 2003-11-24 18:39:02.000000000 +0100
1500 /* scale priority and nice values from timeslices to -20..20 */
1501 /* to make it look like a "normal" Unix priority/nice value */
1502 - priority = task->counter;
1503 - priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
1504 - nice = task->nice;
1505 + priority = task_prio(task);
1506 + nice = task_nice(task);
1508 read_lock(&tasklist_lock);
1509 ppid = task->pid ? task->p_opptr->pid : 0;
1519 diff -urN linux-2.4.22.org/fs/proc/proc_misc.c linux-2.4.22/fs/proc/proc_misc.c
1520 --- linux-2.4.22.org/fs/proc/proc_misc.c 2003-11-24 18:28:11.000000000 +0100
1521 +++ linux-2.4.22/fs/proc/proc_misc.c 2003-11-24 18:39:02.000000000 +0100
1522 @@ -109,11 +109,11 @@
1523 a = avenrun[0] + (FIXED_1/200);
1524 b = avenrun[1] + (FIXED_1/200);
1525 c = avenrun[2] + (FIXED_1/200);
1526 - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
1527 + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
1528 LOAD_INT(a), LOAD_FRAC(a),
1529 LOAD_INT(b), LOAD_FRAC(b),
1530 LOAD_INT(c), LOAD_FRAC(c),
1531 - nr_running, nr_threads, last_pid);
1532 + nr_running(), nr_threads, last_pid);
1533 return proc_calc_metrics(page, start, off, count, eof, len);
1540 - idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
1541 + idle = init_task.times.tms_utime + init_task.times.tms_stime;
1543 /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
1544 that would overflow about every five days at HZ == 100.
1545 @@ -374,10 +374,10 @@
1548 proc_sprintf(page, &off, &len,
1553 - kstat.context_swtch,
1554 + nr_context_switches(),
1555 xtime.tv_sec - jif / HZ,
1558 diff -urN linux-2.4.22.org/fs/reiserfs/buffer2.c linux-2.4.22/fs/reiserfs/buffer2.c
1559 --- linux-2.4.22.org/fs/reiserfs/buffer2.c 2003-11-24 18:28:15.000000000 +0100
1560 +++ linux-2.4.22/fs/reiserfs/buffer2.c 2003-11-24 18:39:02.000000000 +0100
1562 struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size)
1564 struct buffer_head *result;
1565 - PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
1566 + PROC_EXP( unsigned int ctx_switches = nr_context_switches(); );
1568 result = bread (super -> s_dev, n_block, n_size);
1569 PROC_INFO_INC( super, breads );
1570 - PROC_EXP( if( kstat.context_swtch != ctx_switches )
1571 + PROC_EXP( if( nr_context_switches() != ctx_switches )
1572 PROC_INFO_INC( super, bread_miss ) );
1575 diff -urN linux-2.4.22.org/include/asm-alpha/bitops.h linux-2.4.22/include/asm-alpha/bitops.h
1576 --- linux-2.4.22.org/include/asm-alpha/bitops.h 2003-11-24 18:28:26.000000000 +0100
1577 +++ linux-2.4.22/include/asm-alpha/bitops.h 2003-11-24 18:39:02.000000000 +0100
1580 #include <linux/config.h>
1581 #include <linux/kernel.h>
1582 +#include <asm/compiler.h>
1585 * Copyright 1994, Linus Torvalds.
1588 __asm__ __volatile__(
1597 :"=&r" (temp), "=m" (*m)
1598 - :"Ir" (~(1UL << (nr & 31))), "m" (*m));
1599 + :"Ir" (1UL << (nr & 31)), "m" (*m));
1603 * WARNING: non atomic version.
1605 static __inline__ void
1606 -__change_bit(unsigned long nr, volatile void * addr)
1607 +__clear_bit(unsigned long nr, volatile void * addr)
1609 int *m = ((int *) addr) + (nr >> 5);
1611 - *m ^= 1 << (nr & 31);
1612 + *m &= ~(1 << (nr & 31));
1617 :"Ir" (1UL << (nr & 31)), "m" (*m));
1621 + * WARNING: non atomic version.
1623 +static __inline__ void
1624 +__change_bit(unsigned long nr, volatile void * addr)
1626 + int *m = ((int *) addr) + (nr >> 5);
1628 + *m ^= 1 << (nr & 31);
1632 test_and_set_bit(unsigned long nr, volatile void *addr)
1634 @@ -181,20 +193,6 @@
1635 return (old & mask) != 0;
1639 - * WARNING: non atomic version.
1641 -static __inline__ int
1642 -__test_and_change_bit(unsigned long nr, volatile void * addr)
1644 - unsigned long mask = 1 << (nr & 0x1f);
1645 - int *m = ((int *) addr) + (nr >> 5);
1649 - return (old & mask) != 0;
1653 test_and_change_bit(unsigned long nr, volatile void * addr)
1655 @@ -220,6 +218,20 @@
1660 + * WARNING: non atomic version.
1662 +static __inline__ int
1663 +__test_and_change_bit(unsigned long nr, volatile void * addr)
1665 + unsigned long mask = 1 << (nr & 0x1f);
1666 + int *m = ((int *) addr) + (nr >> 5);
1670 + return (old & mask) != 0;
1674 test_bit(int nr, volatile void * addr)
1676 @@ -235,12 +247,15 @@
1678 static inline unsigned long ffz_b(unsigned long x)
1680 - unsigned long sum = 0;
1681 + unsigned long sum, x1, x2, x4;
1683 x = ~x & -~x; /* set first 0 bit, clear others */
1684 - if (x & 0xF0) sum += 4;
1685 - if (x & 0xCC) sum += 2;
1686 - if (x & 0xAA) sum += 1;
1691 + sum += (x4 != 0) * 4;
1696 @@ -257,24 +272,46 @@
1698 __asm__("cmpbge %1,%2,%0" : "=r"(bits) : "r"(word), "r"(~0UL));
1700 - __asm__("extbl %1,%2,%0" : "=r"(bits) : "r"(word), "r"(qofs));
1701 + bits = __kernel_extbl(word, qofs);
1704 return qofs*8 + bofs;
1709 + * __ffs = Find First set bit in word. Undefined if no set bit exists.
1711 +static inline unsigned long __ffs(unsigned long word)
1713 +#if defined(__alpha_cix__) && defined(__alpha_fix__)
1714 + /* Whee. EV67 can calculate it directly. */
1715 + unsigned long result;
1716 + __asm__("cttz %1,%0" : "=r"(result) : "r"(word));
1719 + unsigned long bits, qofs, bofs;
1721 + __asm__("cmpbge $31,%1,%0" : "=r"(bits) : "r"(word));
1722 + qofs = ffz_b(bits);
1723 + bits = __kernel_extbl(word, qofs);
1724 + bofs = ffz_b(~bits);
1726 + return qofs*8 + bofs;
1733 * ffs: find first bit set. This is defined the same way as
1734 * the libc and compiler builtin ffs routines, therefore
1735 - * differs in spirit from the above ffz (man ffs).
1736 + * differs in spirit from the above __ffs.
1739 static inline int ffs(int word)
1741 - int result = ffz(~word);
1742 + int result = __ffs(word);
1743 return word ? result+1 : 0;
1746 @@ -316,6 +353,14 @@
1747 #define hweight16(x) hweight64((x) & 0xfffful)
1748 #define hweight8(x) hweight64((x) & 0xfful)
1750 +static inline unsigned long hweight64(unsigned long w)
1752 + unsigned long result;
1753 + for (result = 0; w ; w >>= 1)
1754 + result += (w & 1);
1758 #define hweight32(x) generic_hweight32(x)
1759 #define hweight16(x) generic_hweight16(x)
1760 #define hweight8(x) generic_hweight8(x)
1761 @@ -365,13 +410,77 @@
1765 - * The optimizer actually does good code for this case..
1766 + * Find next one bit in a bitmap reasonably efficiently.
1768 +static inline unsigned long
1769 +find_next_bit(void * addr, unsigned long size, unsigned long offset)
1771 + unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
1772 + unsigned long result = offset & ~63UL;
1773 + unsigned long tmp;
1775 + if (offset >= size)
1781 + tmp &= ~0UL << offset;
1785 + goto found_middle;
1789 + while (size & ~63UL) {
1790 + if ((tmp = *(p++)))
1791 + goto found_middle;
1799 + tmp &= ~0UL >> (64 - size);
1801 + return result + size;
1803 + return result + __ffs(tmp);
1807 + * The optimizer actually does good code for this case.
1809 #define find_first_zero_bit(addr, size) \
1810 find_next_zero_bit((addr), (size), 0)
1811 +#define find_first_bit(addr, size) \
1812 + find_next_bit((addr), (size), 0)
1817 + * Every architecture must define this function. It's the fastest
1818 + * way of searching a 140-bit bitmap where the first 100 bits are
1819 + * unlikely to be set. It's guaranteed that at least one of the 140
1822 +static inline unsigned long
1823 +sched_find_first_bit(unsigned long b[3])
1825 + unsigned long b0 = b[0], b1 = b[1], b2 = b[2];
1826 + unsigned long ofs;
1828 + ofs = (b1 ? 64 : 128);
1829 + b1 = (b1 ? b1 : b2);
1830 + ofs = (b0 ? 0 : ofs);
1831 + b0 = (b0 ? b0 : b1);
1833 + return __ffs(b0) + ofs;
1837 #define ext2_set_bit __test_and_set_bit
1838 #define ext2_clear_bit __test_and_clear_bit
1839 #define ext2_test_bit test_bit
1840 diff -urN linux-2.4.22.org/include/asm-alpha/smp.h linux-2.4.22/include/asm-alpha/smp.h
1841 --- linux-2.4.22.org/include/asm-alpha/smp.h 2003-11-24 18:28:26.000000000 +0100
1842 +++ linux-2.4.22/include/asm-alpha/smp.h 2003-11-24 18:39:02.000000000 +0100
1844 #define cpu_logical_map(cpu) __cpu_logical_map[cpu]
1846 #define hard_smp_processor_id() __hard_smp_processor_id()
1847 -#define smp_processor_id() (current->processor)
1848 +#define smp_processor_id() (current->cpu)
1850 extern unsigned long cpu_present_mask;
1851 #define cpu_online_map cpu_present_mask
1852 diff -urN linux-2.4.22.org/include/asm-alpha/system.h linux-2.4.22/include/asm-alpha/system.h
1853 --- linux-2.4.22.org/include/asm-alpha/system.h 2003-11-24 18:28:26.000000000 +0100
1854 +++ linux-2.4.22/include/asm-alpha/system.h 2003-11-24 18:39:02.000000000 +0100
1856 extern void halt(void) __attribute__((noreturn));
1857 #define __halt() __asm__ __volatile__ ("call_pal %0 #halt" : : "i" (PAL_halt))
1859 -#define prepare_to_switch() do { } while(0)
1860 #define switch_to(prev,next,last) \
1862 unsigned long pcbb; \
1863 diff -urN linux-2.4.22.org/include/asm-arm/bitops.h linux-2.4.22/include/asm-arm/bitops.h
1864 --- linux-2.4.22.org/include/asm-arm/bitops.h 2003-11-24 18:28:30.000000000 +0100
1865 +++ linux-2.4.22/include/asm-arm/bitops.h 2003-11-24 18:39:02.000000000 +0100
1867 * Copyright 1995, Russell King.
1868 * Various bits and pieces copyrights include:
1869 * Linus Torvalds (test_bit).
1870 + * Big endian support: Copyright 2001, Nicolas Pitre
1871 + * reworked by rmk.
1873 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
1875 @@ -17,81 +19,271 @@
1879 +#include <asm/system.h>
1881 #define smp_mb__before_clear_bit() do { } while (0)
1882 #define smp_mb__after_clear_bit() do { } while (0)
1885 - * Function prototypes to keep gcc -Wall happy.
1886 + * These functions are the basis of our bit ops.
1887 + * First, the atomic bitops.
1889 + * The endian issue for these functions is handled by the macros below.
1891 -extern void set_bit(int nr, volatile void * addr);
1893 +____atomic_set_bit_mask(unsigned int mask, volatile unsigned char *p)
1895 + unsigned long flags;
1897 + local_irq_save(flags);
1899 + local_irq_restore(flags);
1903 +____atomic_clear_bit_mask(unsigned int mask, volatile unsigned char *p)
1905 + unsigned long flags;
1907 + local_irq_save(flags);
1909 + local_irq_restore(flags);
1913 +____atomic_change_bit_mask(unsigned int mask, volatile unsigned char *p)
1915 + unsigned long flags;
1917 + local_irq_save(flags);
1919 + local_irq_restore(flags);
1922 -static inline void __set_bit(int nr, volatile void *addr)
1924 +____atomic_test_and_set_bit_mask(unsigned int mask, volatile unsigned char *p)
1926 - ((unsigned char *) addr)[nr >> 3] |= (1U << (nr & 7));
1927 + unsigned long flags;
1930 + local_irq_save(flags);
1933 + local_irq_restore(flags);
1935 + return res & mask;
1938 -extern void clear_bit(int nr, volatile void * addr);
1940 +____atomic_test_and_clear_bit_mask(unsigned int mask, volatile unsigned char *p)
1942 + unsigned long flags;
1945 + local_irq_save(flags);
1948 + local_irq_restore(flags);
1950 + return res & mask;
1953 -static inline void __clear_bit(int nr, volatile void *addr)
1955 +____atomic_test_and_change_bit_mask(unsigned int mask, volatile unsigned char *p)
1957 - ((unsigned char *) addr)[nr >> 3] &= ~(1U << (nr & 7));
1958 + unsigned long flags;
1961 + local_irq_save(flags);
1964 + local_irq_restore(flags);
1966 + return res & mask;
1969 -extern void change_bit(int nr, volatile void * addr);
1971 + * Now the non-atomic variants. We let the compiler handle all optimisations for these.
1974 +static inline void ____nonatomic_set_bit(int nr, volatile void *p)
1976 + ((unsigned char *) p)[nr >> 3] |= (1U << (nr & 7));
1979 -static inline void __change_bit(int nr, volatile void *addr)
1980 +static inline void ____nonatomic_clear_bit(int nr, volatile void *p)
1982 - ((unsigned char *) addr)[nr >> 3] ^= (1U << (nr & 7));
1983 + ((unsigned char *) p)[nr >> 3] &= ~(1U << (nr & 7));
1986 -extern int test_and_set_bit(int nr, volatile void * addr);
1987 +static inline void ____nonatomic_change_bit(int nr, volatile void *p)
1989 + ((unsigned char *) p)[nr >> 3] ^= (1U << (nr & 7));
1992 -static inline int __test_and_set_bit(int nr, volatile void *addr)
1993 +static inline int ____nonatomic_test_and_set_bit(int nr, volatile void *p)
1995 unsigned int mask = 1 << (nr & 7);
1996 unsigned int oldval;
1998 - oldval = ((unsigned char *) addr)[nr >> 3];
1999 - ((unsigned char *) addr)[nr >> 3] = oldval | mask;
2000 + oldval = ((unsigned char *) p)[nr >> 3];
2001 + ((unsigned char *) p)[nr >> 3] = oldval | mask;
2002 return oldval & mask;
2005 -extern int test_and_clear_bit(int nr, volatile void * addr);
2007 -static inline int __test_and_clear_bit(int nr, volatile void *addr)
2008 +static inline int ____nonatomic_test_and_clear_bit(int nr, volatile void *p)
2010 unsigned int mask = 1 << (nr & 7);
2011 unsigned int oldval;
2013 - oldval = ((unsigned char *) addr)[nr >> 3];
2014 - ((unsigned char *) addr)[nr >> 3] = oldval & ~mask;
2015 + oldval = ((unsigned char *) p)[nr >> 3];
2016 + ((unsigned char *) p)[nr >> 3] = oldval & ~mask;
2017 return oldval & mask;
2020 -extern int test_and_change_bit(int nr, volatile void * addr);
2022 -static inline int __test_and_change_bit(int nr, volatile void *addr)
2023 +static inline int ____nonatomic_test_and_change_bit(int nr, volatile void *p)
2025 unsigned int mask = 1 << (nr & 7);
2026 unsigned int oldval;
2028 - oldval = ((unsigned char *) addr)[nr >> 3];
2029 - ((unsigned char *) addr)[nr >> 3] = oldval ^ mask;
2030 + oldval = ((unsigned char *) p)[nr >> 3];
2031 + ((unsigned char *) p)[nr >> 3] = oldval ^ mask;
2032 return oldval & mask;
2035 -extern int find_first_zero_bit(void * addr, unsigned size);
2036 -extern int find_next_zero_bit(void * addr, int size, int offset);
2039 * This routine doesn't need to be atomic.
2041 -static inline int test_bit(int nr, const void * addr)
2042 +static inline int ____test_bit(int nr, const void * p)
2044 - return (((unsigned char *) addr)[nr >> 3] >> (nr & 7)) & 1;
2045 + return (((volatile unsigned char *) p)[nr >> 3] >> (nr & 7)) & 1;
2049 + * A note about Endian-ness.
2050 + * -------------------------
2052 + * When the ARM is put into big endian mode via CR15, the processor
2053 + * merely swaps the order of bytes within words, thus:
2055 + * ------------ physical data bus bits -----------
2056 + * D31 ... D24 D23 ... D16 D15 ... D8 D7 ... D0
2057 + * little byte 3 byte 2 byte 1 byte 0
2058 + * big byte 0 byte 1 byte 2 byte 3
2060 + * This means that reading a 32-bit word at address 0 returns the same
2061 + * value irrespective of the endian mode bit.
2063 + * Peripheral devices should be connected with the data bus reversed in
2064 + * "Big Endian" mode. ARM Application Note 61 is applicable, and is
2065 + * available from http://www.arm.com/.
2067 + * The following assumes that the data bus connectivity for big endian
2068 + * mode has been followed.
2070 + * Note that bit 0 is defined to be 32-bit word bit 0, not byte 0 bit 0.
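
A small self-contained check of the index arithmetic this note implies; the (nr >> 3) ^ 3 byte transform and the nr ^ 0x18 bit-number transform are the ones used by the big-endian macros below (the test program itself is illustrative):

#include <assert.h>

int main(void)
{
	unsigned int nr = 5;			/* LE view: byte 0, bit 5  */
	unsigned int be_byte = (nr >> 3) ^ 3;	/* BE view: byte 3 of word */
	unsigned int be_nr = nr ^ 0x18;		/* BE view: bit 29 of word */

	/* Same bit within the byte, mirrored byte within the word. */
	assert(be_byte == 3);
	assert((be_nr >> 3) == be_byte);
	assert((be_nr & 7) == (nr & 7));
	return 0;
}
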
2074 + * Little endian assembly bitops. nr = 0 -> byte 0 bit 0.
2076 +extern void _set_bit_le(int nr, volatile void * p);
2077 +extern void _clear_bit_le(int nr, volatile void * p);
2078 +extern void _change_bit_le(int nr, volatile void * p);
2079 +extern int _test_and_set_bit_le(int nr, volatile void * p);
2080 +extern int _test_and_clear_bit_le(int nr, volatile void * p);
2081 +extern int _test_and_change_bit_le(int nr, volatile void * p);
2082 +extern int _find_first_zero_bit_le(void * p, unsigned size);
2083 +extern int _find_next_zero_bit_le(void * p, int size, int offset);
2086 + * Big endian assembly bitops. nr = 0 -> byte 3 bit 0.
2088 +extern void _set_bit_be(int nr, volatile void * p);
2089 +extern void _clear_bit_be(int nr, volatile void * p);
2090 +extern void _change_bit_be(int nr, volatile void * p);
2091 +extern int _test_and_set_bit_be(int nr, volatile void * p);
2092 +extern int _test_and_clear_bit_be(int nr, volatile void * p);
2093 +extern int _test_and_change_bit_be(int nr, volatile void * p);
2094 +extern int _find_first_zero_bit_be(void * p, unsigned size);
2095 +extern int _find_next_zero_bit_be(void * p, int size, int offset);
2099 + * The __* forms of bitops are non-atomic and may be reordered.
2101 +#define ATOMIC_BITOP_LE(name,nr,p) \
2102 + (__builtin_constant_p(nr) ? \
2103 + ____atomic_##name##_mask(1 << ((nr) & 7), \
2104 + ((unsigned char *)(p)) + ((nr) >> 3)) : \
2105 + _##name##_le(nr,p))
2107 +#define ATOMIC_BITOP_BE(name,nr,p) \
2108 + (__builtin_constant_p(nr) ? \
2109 + ____atomic_##name##_mask(1 << ((nr) & 7), \
2110 + ((unsigned char *)(p)) + (((nr) >> 3) ^ 3)) : \
2111 + _##name##_be(nr,p))
2113 +#define NONATOMIC_BITOP_LE(name,nr,p) \
2114 + (____nonatomic_##name(nr, p))
2116 +#define NONATOMIC_BITOP_BE(name,nr,p) \
2117 + (____nonatomic_##name(nr ^ 0x18, p))
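
The constant/non-constant dispatch in the ATOMIC_BITOP_* macros relies on __builtin_constant_p folding at compile time: constant bit numbers get the inlined mask operation, runtime bit numbers call the out-of-line assembly helpers. A minimal sketch of that pattern, with placeholder paths rather than real bit operations:

#include <stdio.h>

static int inline_path(int nr)      { return nr + 1000; }  /* stands in for ____atomic_*_mask */
static int out_of_line_path(int nr) { return nr + 2000; }  /* stands in for _*_le/_*_be       */

#define BITOP(nr) \
	(__builtin_constant_p(nr) ? inline_path(nr) : out_of_line_path(nr))

int main(void)
{
	volatile int n = 7;	/* not a compile-time constant */

	/* With gcc: the literal takes the inline path, n the other. */
	printf("%d %d\n", BITOP(3), BITOP(n));	/* 1003 2007 */
	return 0;
}
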
2121 + * These are the little endian, atomic definitions.
2123 +#define set_bit(nr,p) ATOMIC_BITOP_LE(set_bit,nr,p)
2124 +#define clear_bit(nr,p) ATOMIC_BITOP_LE(clear_bit,nr,p)
2125 +#define change_bit(nr,p) ATOMIC_BITOP_LE(change_bit,nr,p)
2126 +#define test_and_set_bit(nr,p) ATOMIC_BITOP_LE(test_and_set_bit,nr,p)
2127 +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_LE(test_and_clear_bit,nr,p)
2128 +#define test_and_change_bit(nr,p) ATOMIC_BITOP_LE(test_and_change_bit,nr,p)
2129 +#define test_bit(nr,p) ____test_bit(nr,p)
2130 +#define find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz)
2131 +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off)
2134 + * These are the little endian, non-atomic definitions.
2136 +#define __set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p)
2137 +#define __clear_bit(nr,p) NONATOMIC_BITOP_LE(clear_bit,nr,p)
2138 +#define __change_bit(nr,p) NONATOMIC_BITOP_LE(change_bit,nr,p)
2139 +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p)
2140 +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p)
2141 +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_LE(test_and_change_bit,nr,p)
2142 +#define __test_bit(nr,p) ____test_bit(nr,p)
2147 + * These are the big endian, atomic definitions.
2149 +#define set_bit(nr,p) ATOMIC_BITOP_BE(set_bit,nr,p)
2150 +#define clear_bit(nr,p) ATOMIC_BITOP_BE(clear_bit,nr,p)
2151 +#define change_bit(nr,p) ATOMIC_BITOP_BE(change_bit,nr,p)
2152 +#define test_and_set_bit(nr,p) ATOMIC_BITOP_BE(test_and_set_bit,nr,p)
2153 +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_BE(test_and_clear_bit,nr,p)
2154 +#define test_and_change_bit(nr,p) ATOMIC_BITOP_BE(test_and_change_bit,nr,p)
2155 +#define test_bit(nr,p) ____test_bit((nr) ^ 0x18, p)
2156 +#define find_first_zero_bit(p,sz) _find_first_zero_bit_be(p,sz)
2157 +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_be(p,sz,off)
2160 + * These are the big endian, non-atomic definitions.
2162 +#define __set_bit(nr,p) NONATOMIC_BITOP_BE(set_bit,nr,p)
2163 +#define __clear_bit(nr,p) NONATOMIC_BITOP_BE(clear_bit,nr,p)
2164 +#define __change_bit(nr,p) NONATOMIC_BITOP_BE(change_bit,nr,p)
2165 +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_BE(test_and_set_bit,nr,p)
2166 +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_BE(test_and_clear_bit,nr,p)
2167 +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_BE(test_and_change_bit,nr,p)
2168 +#define __test_bit(nr,p) ____test_bit((nr) ^ 0x18, p)
2173 * ffz = Find First Zero in word. Undefined if no zero exists,
2174 * so code should check against ~0UL first..
2176 @@ -110,6 +302,29 @@
2180 + * __ffs = Find First Set bit in word. Undefined if no set bit exists,
2181 + * so code should check against 0UL first..
2183 +static inline unsigned long __ffs(unsigned long word)
2188 + if (word & 0x0000ffff) { k -= 16; word <<= 16; }
2189 + if (word & 0x00ff0000) { k -= 8; word <<= 8; }
2190 + if (word & 0x0f000000) { k -= 4; word <<= 4; }
2191 + if (word & 0x30000000) { k -= 2; word <<= 2; }
2192 + if (word & 0x40000000) { k -= 1; }
2197 + * fls: find last bit set.
2200 +#define fls(x) generic_fls(x)
2203 * ffs: find first bit set. This is defined the same way as
2204 * the libc and compiler builtin ffs routines, therefore
2205 * differs in spirit from the above ffz (man ffs).
2206 @@ -118,6 +333,22 @@
2207 #define ffs(x) generic_ffs(x)
2210 + * Find first bit set in a 168-bit bitmap, where the first
2211 + * 128 bits are unlikely to be set.
2213 +static inline int sched_find_first_bit(unsigned long *b)
2218 + for (off = 0; v = b[off], off < 4; off++) {
2222 + return __ffs(v) + off * 32;
2226 * hweightN: returns the hamming weight (i.e. the number
2227 * of bits set) of a N-bit word
2229 @@ -126,18 +357,25 @@
2230 #define hweight16(x) generic_hweight16(x)
2231 #define hweight8(x) generic_hweight8(x)
2233 -#define ext2_set_bit test_and_set_bit
2234 -#define ext2_clear_bit test_and_clear_bit
2235 -#define ext2_test_bit test_bit
2236 -#define ext2_find_first_zero_bit find_first_zero_bit
2237 -#define ext2_find_next_zero_bit find_next_zero_bit
2239 -/* Bitmap functions for the minix filesystem. */
2240 -#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr)
2241 -#define minix_set_bit(nr,addr) set_bit(nr,addr)
2242 -#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr)
2243 -#define minix_test_bit(nr,addr) test_bit(nr,addr)
2244 -#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
2246 + * Ext2 is defined to use little-endian byte ordering.
2247 + * These do not need to be atomic.
2249 +#define ext2_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p)
2250 +#define ext2_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p)
2251 +#define ext2_test_bit(nr,p) __test_bit(nr,p)
2252 +#define ext2_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz)
2253 +#define ext2_find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off)
2256 + * Minix is defined to use little-endian byte ordering.
2257 + * These do not need to be atomic.
2259 +#define minix_set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p)
2260 +#define minix_test_bit(nr,p) __test_bit(nr,p)
2261 +#define minix_test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p)
2262 +#define minix_test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p)
2263 +#define minix_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz)
2265 #endif /* __KERNEL__ */
2267 diff -urN linux-2.4.22.org/include/asm-cris/bitops.h linux-2.4.22/include/asm-cris/bitops.h
2268 --- linux-2.4.22.org/include/asm-cris/bitops.h 2003-11-24 18:28:36.000000000 +0100
2269 +++ linux-2.4.22/include/asm-cris/bitops.h 2003-11-24 18:39:02.000000000 +0100
2271 /* We use generic_ffs so get it; include guards resolve the possible
2272 mutual inclusion. */
2273 #include <linux/bitops.h>
2274 +#include <linux/compiler.h>
2277 * Some hacks to defeat gcc over-optimizations..
2279 #define set_bit(nr, addr) (void)test_and_set_bit(nr, addr)
2280 #define __set_bit(nr, addr) (void)__test_and_set_bit(nr, addr)
2282 +#define __set_bit(nr, addr) (void)__test_and_set_bit(nr, addr)
2285 * clear_bit - Clears a bit in memory
2288 #define clear_bit(nr, addr) (void)test_and_clear_bit(nr, addr)
2289 #define __clear_bit(nr, addr) (void)__test_and_clear_bit(nr, addr)
2291 +#define __clear_bit(nr, addr) (void)__test_and_clear_bit(nr, addr)
2294 * change_bit - Toggle a bit in memory
2297 * It also implies a memory barrier.
2300 -extern __inline__ int test_and_set_bit(int nr, void *addr)
2301 +extern inline int test_and_set_bit(int nr, void *addr)
2303 unsigned int mask, retval;
2304 unsigned long flags;
2305 @@ -119,6 +124,18 @@
2309 +extern inline int __test_and_set_bit(int nr, void *addr)
2311 + unsigned int mask, retval;
2312 + unsigned int *adr = (unsigned int *)addr;
2315 + mask = 1 << (nr & 0x1f);
2316 + retval = (mask & *adr) != 0;
2322 * clear_bit() doesn't provide any barrier for the compiler.
2325 * It also implies a memory barrier.
2328 -extern __inline__ int test_and_clear_bit(int nr, void *addr)
2329 +extern inline int test_and_clear_bit(int nr, void *addr)
2331 unsigned int mask, retval;
2332 unsigned long flags;
2334 * but actually fail. You must protect multiple accesses with a lock.
2337 -extern __inline__ int __test_and_clear_bit(int nr, void *addr)
2338 +extern inline int __test_and_clear_bit(int nr, void *addr)
2340 unsigned int mask, retval;
2341 unsigned int *adr = (unsigned int *)addr;
2343 * It also implies a memory barrier.
2346 -extern __inline__ int test_and_change_bit(int nr, void *addr)
2347 +extern inline int test_and_change_bit(int nr, void *addr)
2349 unsigned int mask, retval;
2350 unsigned long flags;
2353 /* WARNING: non atomic and it can be reordered! */
2355 -extern __inline__ int __test_and_change_bit(int nr, void *addr)
2356 +extern inline int __test_and_change_bit(int nr, void *addr)
2358 unsigned int mask, retval;
2359 unsigned int *adr = (unsigned int *)addr;
2361 * This routine doesn't need to be atomic.
2364 -extern __inline__ int test_bit(int nr, const void *addr)
2365 +extern inline int test_bit(int nr, const void *addr)
2368 unsigned int *adr = (unsigned int *)addr;
2370 * number. They differ in that the first function also inverts all bits
2373 -extern __inline__ unsigned long cris_swapnwbrlz(unsigned long w)
2374 +extern inline unsigned long cris_swapnwbrlz(unsigned long w)
2376 /* Let's just say we return the result in the same register as the
2377 input. Saying we clobber the input but can return the result
2382 -extern __inline__ unsigned long cris_swapwbrlz(unsigned long w)
2383 +extern inline unsigned long cris_swapwbrlz(unsigned long w)
2386 __asm__ ("swapwbr %0 \n\t"
2388 * ffz = Find First Zero in word. Undefined if no zero exists,
2389 * so code should check against ~0UL first..
2391 -extern __inline__ unsigned long ffz(unsigned long w)
2392 +extern inline unsigned long ffz(unsigned long w)
2394 /* The generic_ffs function is used to avoid the asm when the
2395 argument is a constant. */
2397 * Somewhat like ffz but the equivalent of generic_ffs: in contrast to
2398 * ffz we return the first one-bit *plus one*.
2400 -extern __inline__ unsigned long kernel_ffs(unsigned long w)
2401 +extern inline unsigned long kernel_ffs(unsigned long w)
2403 /* The generic_ffs function is used to avoid the asm when the
2404 argument is a constant. */
2406 * @offset: The bitnumber to start searching at
2407 * @size: The maximum size to search
2409 -extern __inline__ int find_next_zero_bit (void * addr, int size, int offset)
2410 +extern inline int find_next_zero_bit (void * addr, int size, int offset)
2412 unsigned long *p = ((unsigned long *) addr) + (offset >> 5);
2413 unsigned long result = offset & ~31UL;
2414 @@ -375,7 +392,45 @@
2415 #define minix_test_bit(nr,addr) test_bit(nr,addr)
2416 #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
2418 -#endif /* __KERNEL__ */
2420 +/* TODO: see below */
2421 +#define sched_find_first_zero_bit(addr) find_first_zero_bit(addr, 168)
2424 +/* TODO: left out pending where to put it.. (there are .h dependencies) */
2427 + * Every architecture must define this function. It's the fastest
2428 + * way of searching a 168-bit bitmap where the first 128 bits are
2429 + * unlikely to be set. It's guaranteed that at least one of the 168
2430 + * bits is cleared.
2433 +#if MAX_RT_PRIO != 128 || MAX_PRIO != 168
2434 +# error update this function.
2437 +#define MAX_RT_PRIO 128
2438 +#define MAX_PRIO 168
2441 +static inline int sched_find_first_zero_bit(char *bitmap)
2443 + unsigned int *b = (unsigned int *)bitmap;
2446 + rt = b[0] & b[1] & b[2] & b[3];
2447 + if (unlikely(rt != 0xffffffff))
2448 + return find_first_zero_bit(bitmap, MAX_RT_PRIO);
2451 + return ffz(b[4]) + MAX_RT_PRIO;
2452 + return ffz(b[5]) + 32 + MAX_RT_PRIO;
2458 +#endif /* __KERNEL__ */
2460 #endif /* _CRIS_BITOPS_H */
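
A stand-alone rendering of the 168-bit zero-bit search sketched in the TODO block above (illustrative: ffz is modelled with __builtin_ctz of the complement, and the RT-range find_first_zero_bit is expanded into a word loop):

#include <assert.h>

static int sched_first_zero(const unsigned int b[6])
{
	unsigned int rt = b[0] & b[1] & b[2] & b[3];
	int i;

	if (rt != 0xffffffffu) {	/* a zero in the RT priority range */
		for (i = 0; i < 4; i++)
			if (b[i] != 0xffffffffu)
				return i * 32 + __builtin_ctz(~b[i]);
	}
	if (b[4] != 0xffffffffu)
		return __builtin_ctz(~b[4]) + 128;
	return __builtin_ctz(~b[5]) + 32 + 128;
}

int main(void)
{
	unsigned int b[6] = { ~0u, ~0u, ~0u, ~0u, ~0u, ~0u ^ (1u << 3) };
	assert(sched_first_zero(b) == 163);	/* 160 + 3 */
	return 0;
}
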
2461 diff -urN linux-2.4.22.org/include/asm-generic/bitops.h linux-2.4.22/include/asm-generic/bitops.h
2462 --- linux-2.4.22.org/include/asm-generic/bitops.h 2003-11-24 18:28:24.000000000 +0100
2463 +++ linux-2.4.22/include/asm-generic/bitops.h 2003-11-24 18:39:02.000000000 +0100
2465 return ((mask & *addr) != 0);
2469 + * fls: find last bit set.
2472 +#define fls(x) generic_fls(x)
2477 diff -urN linux-2.4.22.org/include/asm-i386/bitops.h linux-2.4.22/include/asm-i386/bitops.h
2478 --- linux-2.4.22.org/include/asm-i386/bitops.h 2003-11-24 18:28:24.000000000 +0100
2479 +++ linux-2.4.22/include/asm-i386/bitops.h 2003-11-24 18:39:02.000000000 +0100
2483 #include <linux/config.h>
2484 +#include <linux/compiler.h>
2487 * These have to be done with inline assembly: that way the bit-setting
2493 +static __inline__ void __clear_bit(int nr, volatile void * addr)
2495 + __asm__ __volatile__(
2500 #define smp_mb__before_clear_bit() barrier()
2501 #define smp_mb__after_clear_bit() barrier()
2503 @@ -284,6 +293,34 @@
2507 + * find_first_bit - find the first set bit in a memory region
2508 + * @addr: The address to start the search at
2509 + * @size: The maximum size to search
2511 + * Returns the bit-number of the first set bit, not the number of the byte
2512 + * containing a bit.
2514 +static __inline__ int find_first_bit(void * addr, unsigned size)
2519 + /* This looks at memory. Mark it volatile to tell gcc not to move it around */
2520 + __asm__ __volatile__(
2521 + "xorl %%eax,%%eax\n\t"
2524 + "leal -4(%%edi),%%edi\n\t"
2525 + "bsfl (%%edi),%%eax\n"
2526 + "1:\tsubl %%ebx,%%edi\n\t"
2527 + "shll $3,%%edi\n\t"
2528 + "addl %%edi,%%eax"
2529 + :"=a" (res), "=&c" (d0), "=&D" (d1)
2530 + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr));
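
A portable C equivalent of the find_first_bit() assembly above (illustrative, not the patch code): scan 32-bit words until one is nonzero, then take its lowest set bit, returning the rounded-up bitmap size when nothing is set:

#include <stdio.h>

static int first_bit32(const unsigned int *addr, unsigned int size)
{
	unsigned int words = (size + 31) >> 5;	/* matches (size + 31) >> 5 above */
	unsigned int i;

	for (i = 0; i < words; i++)
		if (addr[i])
			return i * 32 + __builtin_ctz(addr[i]);
	return words * 32;	/* no set bit found */
}

int main(void)
{
	unsigned int map[3] = { 0, 0, 0x10 };
	printf("%d\n", first_bit32(map, 96));	/* prints 68 */
	return 0;
}
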
2535 * find_next_zero_bit - find the first zero bit in a memory region
2536 * @addr: The address to base the search on
2537 * @offset: The bitnumber to start searching at
2542 - * Look for zero in first byte
2543 + * Look for zero in the first 32 bits.
2545 __asm__("bsfl %1,%0\n\t"
2547 @@ -317,6 +354,39 @@
2551 + * find_next_bit - find the first set bit in a memory region
2552 + * @addr: The address to base the search on
2553 + * @offset: The bitnumber to start searching at
2554 + * @size: The maximum size to search
2556 +static __inline__ int find_next_bit (void * addr, int size, int offset)
2558 + unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
2559 + int set = 0, bit = offset & 31, res;
2563 + * Look for nonzero in the first 32 bits:
2565 + __asm__("bsfl %1,%0\n\t"
2570 + : "r" (*p >> bit));
2571 + if (set < (32 - bit))
2572 + return set + offset;
2577 + * No set bit yet, search remaining full words for a bit
2579 + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
2580 + return (offset + set + res);
2584 * ffz - find first zero in word.
2585 * @word: The word to search
2587 @@ -330,8 +400,41 @@
2592 + * __ffs - find first bit in word.
2593 + * @word: The word to search
2594 + * Undefined if no bit exists, so code should check against 0 first.
2596 +static __inline__ unsigned long __ffs(unsigned long word)
2598 + __asm__("bsfl %1,%0"
2603 +#define fls(x) generic_fls(x)
2608 + * Every architecture must define this function. It's the fastest
2609 + * way of searching a 140-bit bitmap where the first 100 bits are
2610 + * unlikely to be set. It's guaranteed that at least one of the 140
2611 + * bits is set.
2613 +static inline int sched_find_first_bit(unsigned long *b)
2615 + if (unlikely(b[0]))
2616 + return __ffs(b[0]);
2617 + if (unlikely(b[1]))
2618 + return __ffs(b[1]) + 32;
2619 + if (unlikely(b[2]))
2620 + return __ffs(b[2]) + 64;
2622 + return __ffs(b[3]) + 96;
2623 + return __ffs(b[4]) + 128;
2627 * ffs - find first bit set
2628 * @x: the word to search
2629 diff -urN linux-2.4.22.org/include/asm-i386/mmu_context.h linux-2.4.22/include/asm-i386/mmu_context.h
2630 --- linux-2.4.22.org/include/asm-i386/mmu_context.h 2003-11-24 18:28:24.000000000 +0100
2631 +++ linux-2.4.22/include/asm-i386/mmu_context.h 2003-11-24 18:39:02.000000000 +0100
2634 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
2636 - if (prev != next) {
2637 + if (likely(prev != next)) {
2638 /* stop flush ipis for the previous mm */
2639 clear_bit(cpu, &prev->cpu_vm_mask);
2641 * Re-load LDT if necessary
2643 - if (prev->context.segments != next->context.segments)
2644 + if (unlikely(prev->context.segments != next->context.segments))
2647 cpu_tlbstate[cpu].state = TLBSTATE_OK;
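
The likely()/unlikely() annotations used here are branch-prediction hints; the definitions below are the usual __builtin_expect wrappers from include/linux/compiler.h (assumed for illustration, not shown in this patch):

#include <stdio.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

int main(void)
{
	int prev_ctx = 1, next_ctx = 2;

	if (likely(prev_ctx != next_ctx))	/* hot path laid out first */
		printf("switch address space\n");
	if (unlikely(prev_ctx == next_ctx))	/* cold path moved aside   */
		printf("lazy TLB\n");
	return 0;
}
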
2648 diff -urN linux-2.4.22.org/include/asm-i386/processor.h linux-2.4.22/include/asm-i386/processor.h
2649 --- linux-2.4.22.org/include/asm-i386/processor.h 2003-11-24 18:28:24.000000000 +0100
2650 +++ linux-2.4.22/include/asm-i386/processor.h 2003-11-24 18:39:02.000000000 +0100
2653 #define cpu_relax() rep_nop()
2655 +#define ARCH_HAS_SMP_BALANCE
2657 /* Prefetch instructions for Pentium III and AMD Athlon */
2658 #if defined(CONFIG_MPENTIUMIII) || defined (CONFIG_MPENTIUM4)
2660 diff -urN linux-2.4.22.org/include/asm-i386/smp_balance.h linux-2.4.22/include/asm-i386/smp_balance.h
2661 --- linux-2.4.22.org/include/asm-i386/smp_balance.h 1970-01-01 01:00:00.000000000 +0100
2662 +++ linux-2.4.22/include/asm-i386/smp_balance.h 2003-11-24 18:39:02.000000000 +0100
2664 +#ifndef _ASM_SMP_BALANCE_H
2665 +#define _ASM_SMP_BALANCE_H
2668 + * We have an architecture-specific SMP load balancer to improve
2669 + * scheduling behavior on hyperthreaded CPUs. Since only P4s have
2670 + * HT, maybe this should be conditional on CONFIG_MPENTIUM4...
2675 + * Find any idle processor package (i.e. both virtual processors are idle)
2677 +static inline int find_idle_package(int this_cpu)
2681 + this_cpu = cpu_number_map(this_cpu);
2683 + for (i = (this_cpu + 1) % smp_num_cpus;
2685 + i = (i + 1) % smp_num_cpus) {
2686 + int physical = cpu_logical_map(i);
2687 + int sibling = cpu_sibling_map[physical];
2689 + if (idle_cpu(physical) && idle_cpu(sibling))
2692 + return -1; /* not found */
2695 +static inline int arch_reschedule_idle_override(task_t * p, int idle)
2697 + if (unlikely(smp_num_siblings > 1) && !idle_cpu(cpu_sibling_map[idle])) {
2698 + int true_idle = find_idle_package(idle);
2699 + if (true_idle >= 0) {
2700 + if (likely(p->cpus_allowed & (1UL << true_idle)))
2703 + true_idle = cpu_sibling_map[true_idle];
2704 + if (p->cpus_allowed & (1UL << true_idle))
2713 +static inline int arch_load_balance(int this_cpu, int idle)
2715 + /* Special hack for hyperthreading */
2716 + if (unlikely(smp_num_siblings > 1 && idle == 2 && !idle_cpu(cpu_sibling_map[this_cpu]))) {
2718 + struct runqueue *rq_target;
2720 + if ((found = find_idle_package(this_cpu)) >= 0) {
2721 + rq_target = cpu_rq(found);
2722 + resched_task(rq_target->idle);
2729 +#endif /* _ASM_SMP_BALANCE_H */
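
A toy model of the find_idle_package() walk above: two logical CPUs per package, a sibling map pairing them, and a package counting as idle only when both siblings are idle (all values illustrative; the real code also maps between logical and physical CPU numbers):

#include <stdio.h>

#define NCPUS 4
static const int sibling_map[NCPUS] = { 1, 0, 3, 2 };
static const int cpu_is_idle[NCPUS] = { 0, 0, 1, 1 };	/* CPUs 2 and 3 idle */

static int find_idle_package(int this_cpu)
{
	int i;

	for (i = (this_cpu + 1) % NCPUS; i != this_cpu; i = (i + 1) % NCPUS)
		if (cpu_is_idle[i] && cpu_is_idle[sibling_map[i]])
			return i;
	return -1;	/* no fully idle package */
}

int main(void)
{
	printf("%d\n", find_idle_package(0));	/* prints 2 */
	return 0;
}
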
2730 diff -urN linux-2.4.22.org/include/asm-i386/smp.h linux-2.4.22/include/asm-i386/smp.h
2731 --- linux-2.4.22.org/include/asm-i386/smp.h 2003-11-24 18:28:24.000000000 +0100
2732 +++ linux-2.4.22/include/asm-i386/smp.h 2003-11-24 18:39:02.000000000 +0100
2734 extern void smp_flush_tlb(void);
2735 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
2736 extern void smp_send_reschedule(int cpu);
2737 +extern void smp_send_reschedule_all(void);
2738 extern void smp_invalidate_rcv(void); /* Process an NMI */
2739 extern void (*mtrr_hook) (void);
2740 extern void zap_low_mappings (void);
2742 * so this is correct in the x86 case.
2745 -#define smp_processor_id() (current->processor)
2746 +#define smp_processor_id() (current->cpu)
2748 static __inline int hard_smp_processor_id(void)
2752 #define NO_PROC_ID 0xFF /* No processor magic marker */
2755 - * This magic constant controls our willingness to transfer
2756 - * a process across CPUs. Such a transfer incurs misses on the L1
2757 - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
2758 - * gut feeling is this will vary by board in value. For a board
2759 - * with separate L2 cache it probably depends also on the RSS, and
2760 - * for a board with shared L2 cache it ought to decay fast as other
2761 - * processes are run.
2764 -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
2768 diff -urN linux-2.4.22.org/include/asm-i386/system.h linux-2.4.22/include/asm-i386/system.h
2769 --- linux-2.4.22.org/include/asm-i386/system.h 2003-11-24 18:28:24.000000000 +0100
2770 +++ linux-2.4.22/include/asm-i386/system.h 2003-11-24 18:39:02.000000000 +0100
2772 struct task_struct; /* one of the stranger aspects of C forward declarations.. */
2773 extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
2775 -#define prepare_to_switch() do { } while(0)
2776 #define switch_to(prev,next,last) do { \
2777 asm volatile("pushl %%esi\n\t" \
2780 "movl %%esp,%0\n\t" /* save ESP */ \
2781 - "movl %3,%%esp\n\t" /* restore ESP */ \
2782 + "movl %2,%%esp\n\t" /* restore ESP */ \
2783 "movl $1f,%1\n\t" /* save EIP */ \
2784 - "pushl %4\n\t" /* restore EIP */ \
2785 + "pushl %3\n\t" /* restore EIP */ \
2786 "jmp __switch_to\n" \
2791 - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
2793 + :"=m" (prev->thread.esp),"=m" (prev->thread.eip) \
2794 :"m" (next->thread.esp),"m" (next->thread.eip), \
2795 - "a" (prev), "d" (next), \
2797 + "a" (prev), "d" (next)); \
2800 #define _set_base(addr,base) do { unsigned long __pr; \
2801 diff -urN linux-2.4.22.org/include/asm-ia64/bitops.h linux-2.4.22/include/asm-ia64/bitops.h
2802 --- linux-2.4.22.org/include/asm-ia64/bitops.h 2003-11-24 18:28:32.000000000 +0100
2803 +++ linux-2.4.22/include/asm-ia64/bitops.h 2003-11-24 18:39:02.000000000 +0100
2806 * Copyright (C) 1998-2003 Hewlett-Packard Co
2807 * David Mosberger-Tang <davidm@hpl.hp.com>
2809 + * 02/06/02 find_next_bit() and find_first_bit() added from Erich Focht's ia64 O(1) scheduler patch.
2813 #include <linux/types.h>
2818 + * __clear_bit - Clears a bit in memory (non-atomic version)
2820 +static __inline__ void
2821 +__clear_bit (int nr, volatile void *addr)
2823 + volatile __u32 *p = (__u32 *) addr + (nr >> 5);
2824 + __u32 m = 1 << (nr & 31);
2829 * change_bit - Toggle a bit in memory
2831 * @addr: Address to start counting from
2832 @@ -266,12 +280,11 @@
2836 - * ffz - find the first zero bit in a memory region
2837 - * @x: The address to start the search at
2838 + * ffz - find the first zero bit in a long word
2839 + * @x: The long word to find the bit in
2841 - * Returns the bit-number (0..63) of the first (least significant) zero bit, not
2842 - * the number of the byte containing a bit. Undefined if no zero exists, so
2843 - * code should check against ~0UL first...
2844 + * Returns the bit-number (0..63) of the first (least significant) zero bit. Undefined if
2845 + * no zero exists, so code should check against ~0UL first...
2847 static inline unsigned long
2848 ffz (unsigned long x)
2849 @@ -297,6 +310,21 @@
2854 + * __ffs - find first bit in word.
2855 + * @x: The word to search
2857 + * Undefined if no bit exists, so code should check against 0 first.
2859 +static __inline__ unsigned long
2860 +__ffs (unsigned long x)
2862 + unsigned long result;
2864 + __asm__ ("popcnt %0=%1" : "=r" (result) : "r" ((x - 1) & ~x));
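
The popcnt form of __ffs above works because (x - 1) & ~x turns the trailing zeros of x into a block of ones and clears everything else, so its population count equals the index of the lowest set bit. A portable check:

#include <assert.h>

static unsigned long ffs_via_popcount(unsigned long x)
{
	/* x = ...10000 -> (x - 1) & ~x = ...01111 -> popcount = 4 */
	return (unsigned long) __builtin_popcountl((x - 1) & ~x);
}

int main(void)
{
	assert(ffs_via_popcount(0x10) == 4);
	assert(ffs_via_popcount(1) == 0);
	return 0;
}
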
2871 @@ -313,6 +341,12 @@
2872 return exp - 0xffff;
2878 + return ia64_fls((unsigned int) x);
2882 * ffs: find first bit set. This is defined the same way as the libc and compiler builtin
2883 * ffs routines, therefore differs in spirit from the above ffz (man ffs): it operates on
2884 @@ -385,8 +419,53 @@
2886 #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
2889 + * Find the next set bit in a bitmap reasonably efficiently.
2892 +find_next_bit (void *addr, unsigned long size, unsigned long offset)
2894 + unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
2895 + unsigned long result = offset & ~63UL;
2896 + unsigned long tmp;
2898 + if (offset >= size)
2904 + tmp &= ~0UL << offset;
2908 + goto found_middle;
2912 + while (size & ~63UL) {
2913 + if ((tmp = *(p++)))
2914 + goto found_middle;
2922 + tmp &= ~0UL >> (64-size);
2923 + if (tmp == 0UL) /* Are any bits set? */
2924 + return result + size; /* Nope. */
2926 + return result + __ffs(tmp);
2929 +#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
2933 +#define __clear_bit(nr, addr) clear_bit(nr, addr)
2935 #define ext2_set_bit test_and_set_bit
2936 #define ext2_clear_bit test_and_clear_bit
2937 #define ext2_test_bit test_bit
2938 @@ -400,6 +479,16 @@
2939 #define minix_test_bit(nr,addr) test_bit(nr,addr)
2940 #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
2943 +sched_find_first_bit (unsigned long *b)
2945 + if (unlikely(b[0]))
2946 + return __ffs(b[0]);
2947 + if (unlikely(b[1]))
2948 + return 64 + __ffs(b[1]);
2949 + return __ffs(b[2]) + 128;
2952 #endif /* __KERNEL__ */
2954 #endif /* _ASM_IA64_BITOPS_H */
2955 diff -urN linux-2.4.22.org/include/asm-m68k/bitops.h linux-2.4.22/include/asm-m68k/bitops.h
2956 --- linux-2.4.22.org/include/asm-m68k/bitops.h 2003-11-24 18:28:27.000000000 +0100
2957 +++ linux-2.4.22/include/asm-m68k/bitops.h 2003-11-24 18:39:02.000000000 +0100
2959 (__builtin_constant_p(nr) ? \
2960 __constant_clear_bit(nr, vaddr) : \
2961 __generic_clear_bit(nr, vaddr))
2962 +#define __clear_bit(nr,vaddr) clear_bit(nr,vaddr)
2964 extern __inline__ void __constant_clear_bit(int nr, volatile void * vaddr)
2966 @@ -239,6 +240,28 @@
2970 +#define __ffs(x) (ffs(x) - 1)
2974 + * Every architecture must define this function. It's the fastest
2975 + * way of searching a 140-bit bitmap where the first 100 bits are
2976 + * unlikely to be set. It's guaranteed that at least one of the 140
2977 + * bits is set.
2979 +static inline int sched_find_first_bit(unsigned long *b)
2981 + if (unlikely(b[0]))
2982 + return __ffs(b[0]);
2983 + if (unlikely(b[1]))
2984 + return __ffs(b[1]) + 32;
2985 + if (unlikely(b[2]))
2986 + return __ffs(b[2]) + 64;
2988 + return __ffs(b[3]) + 96;
2989 + return __ffs(b[4]) + 128;
2994 * hweightN: returns the hamming weight (i.e. the number
2995 diff -urN linux-2.4.22.org/include/asm-mips/bitops.h linux-2.4.22/include/asm-mips/bitops.h
2996 --- linux-2.4.22.org/include/asm-mips/bitops.h 2003-11-24 18:28:25.000000000 +0100
2997 +++ linux-2.4.22/include/asm-mips/bitops.h 2003-11-24 18:39:02.000000000 +0100
3000 #ifdef CONFIG_CPU_HAS_LLSC
3002 +#include <asm/mipsregs.h>
3005 * These functions for MIPS ISA > 1 are interrupt and SMP proof and
3006 * interrupt friendly
3007 @@ -593,21 +595,30 @@
3009 * Undefined if no zero exists, so code should check against ~0UL first.
3011 -static __inline__ unsigned long ffz(unsigned long word)
3012 +extern __inline__ unsigned long ffz(unsigned long word)
3015 + unsigned int __res;
3016 + unsigned int mask = 1;
3019 - s = 16; if (word << 16 != 0) s = 0; b += s; word >>= s;
3020 - s = 8; if (word << 24 != 0) s = 0; b += s; word >>= s;
3021 - s = 4; if (word << 28 != 0) s = 0; b += s; word >>= s;
3022 - s = 2; if (word << 30 != 0) s = 0; b += s; word >>= s;
3023 - s = 1; if (word << 31 != 0) s = 0; b += s;
3025 + ".set\tnoreorder\n\t"
3028 + "1:\tand\t$1,%2,%1\n\t"
3036 + : "=&r" (__res), "=r" (mask)
3037 + : "r" (word), "1" (mask)
3048 diff -urN linux-2.4.22.org/include/asm-mips64/bitops.h linux-2.4.22/include/asm-mips64/bitops.h
3049 --- linux-2.4.22.org/include/asm-mips64/bitops.h 2003-11-24 18:28:33.000000000 +0100
3050 +++ linux-2.4.22/include/asm-mips64/bitops.h 2003-11-24 18:39:03.000000000 +0100
3053 #include <asm/system.h>
3054 #include <asm/sgidefs.h>
3055 +#include <asm/mipsregs.h>
3058 * set_bit - Atomically set a bit in memory
3060 * Note that @nr may be almost arbitrarily large; this function is not
3061 * restricted to acting on a single-word quantity.
3063 -static inline void set_bit(unsigned long nr, volatile void *addr)
3064 +extern __inline__ void
3065 +set_bit(unsigned long nr, volatile void *addr)
3067 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3070 * If it's called on the same region of memory simultaneously, the effect
3071 * may be that only one operation succeeds.
3073 -static inline void __set_bit(int nr, volatile void * addr)
3074 +extern __inline__ void __set_bit(int nr, volatile void * addr)
3076 unsigned long * m = ((unsigned long *) addr) + (nr >> 6);
3079 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
3080 * in order to ensure changes are visible on other processors.
3082 -static inline void clear_bit(unsigned long nr, volatile void *addr)
3083 +extern __inline__ void
3084 +clear_bit(unsigned long nr, volatile void *addr)
3086 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3089 * Note that @nr may be almost arbitrarily large; this function is not
3090 * restricted to acting on a single-word quantity.
3092 -static inline void change_bit(unsigned long nr, volatile void *addr)
3093 +extern __inline__ void
3094 +change_bit(unsigned long nr, volatile void *addr)
3096 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3099 * If it's called on the same region of memory simultaneously, the effect
3100 * may be that only one operation succeeds.
3102 -static inline void __change_bit(int nr, volatile void * addr)
3103 +extern __inline__ void __change_bit(int nr, volatile void * addr)
3105 unsigned long * m = ((unsigned long *) addr) + (nr >> 6);
3108 * This operation is atomic and cannot be reordered.
3109 * It also implies a memory barrier.
3111 -static inline unsigned long test_and_set_bit(unsigned long nr,
3112 - volatile void *addr)
3113 +extern __inline__ unsigned long
3114 +test_and_set_bit(unsigned long nr, volatile void *addr)
3116 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3117 unsigned long temp, res;
3119 * If two examples of this operation race, one can appear to succeed
3120 * but actually fail. You must protect multiple accesses with a lock.
3122 -static inline int __test_and_set_bit(int nr, volatile void *addr)
3123 +extern __inline__ int
3124 +__test_and_set_bit(int nr, volatile void * addr)
3126 unsigned long mask, retval;
3127 long *a = (unsigned long *) addr;
3129 * This operation is atomic and cannot be reordered.
3130 * It also implies a memory barrier.
3132 -static inline unsigned long test_and_clear_bit(unsigned long nr,
3133 - volatile void *addr)
3134 +extern __inline__ unsigned long
3135 +test_and_clear_bit(unsigned long nr, volatile void *addr)
3137 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3138 unsigned long temp, res;
3140 * If two examples of this operation race, one can appear to succeed
3141 * but actually fail. You must protect multiple accesses with a lock.
3143 -static inline int __test_and_clear_bit(int nr, volatile void * addr)
3144 +extern __inline__ int
3145 +__test_and_clear_bit(int nr, volatile void * addr)
3147 unsigned long mask, retval;
3148 unsigned long *a = (unsigned long *) addr;
3150 * This operation is atomic and cannot be reordered.
3151 * It also implies a memory barrier.
3153 -static inline unsigned long test_and_change_bit(unsigned long nr,
3154 - volatile void *addr)
3155 +extern __inline__ unsigned long
3156 +test_and_change_bit(unsigned long nr, volatile void *addr)
3158 unsigned long *m = ((unsigned long *) addr) + (nr >> 6);
3159 unsigned long temp, res;
3161 * If two examples of this operation race, one can appear to succeed
3162 * but actually fail. You must protect multiple accesses with a lock.
3164 -static inline int __test_and_change_bit(int nr, volatile void *addr)
3165 +extern __inline__ int
3166 +__test_and_change_bit(int nr, volatile void * addr)
3168 unsigned long mask, retval;
3169 unsigned long *a = (unsigned long *) addr;
3171 * @nr: bit number to test
3172 * @addr: Address to start counting from
3174 -static inline int test_bit(int nr, volatile void * addr)
3175 +extern __inline__ unsigned long
3176 +test_bit(int nr, volatile void * addr)
3178 return 1UL & (((const volatile unsigned long *) addr)[nr >> SZLONG_LOG] >> (nr & SZLONG_MASK));
3180 @@ -313,19 +321,20 @@
3182 * Undefined if no zero exists, so code should check against ~0UL first.
3184 -static __inline__ unsigned long ffz(unsigned long word)
3185 +extern __inline__ unsigned long ffz(unsigned long word)
3191 - s = 32; if (word << 32 != 0) s = 0; b += s; word >>= s;
3192 - s = 16; if (word << 48 != 0) s = 0; b += s; word >>= s;
3193 - s = 8; if (word << 56 != 0) s = 0; b += s; word >>= s;
3194 - s = 4; if (word << 60 != 0) s = 0; b += s; word >>= s;
3195 - s = 2; if (word << 62 != 0) s = 0; b += s; word >>= s;
3196 - s = 1; if (word << 63 != 0) s = 0; b += s;
3198 + if (word & 0x00000000ffffffffUL) { k -= 32; word <<= 32; }
3199 + if (word & 0x0000ffff00000000UL) { k -= 16; word <<= 16; }
3200 + if (word & 0x00ff000000000000UL) { k -= 8; word <<= 8; }
3201 + if (word & 0x0f00000000000000UL) { k -= 4; word <<= 4; }
3202 + if (word & 0x3000000000000000UL) { k -= 2; word <<= 2; }
3203 + if (word & 0x4000000000000000UL) { k -= 1; }
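
The hunk above elides the prelude of the rewritten ffz(); for this ladder to find the first zero, the word must be complemented first, as the matching 32-bit ladder earlier in the patch suggests (the complement below is that inferred step, not a line visible in this hunk):

#include <assert.h>

static unsigned long ffz64(unsigned long long word)
{
	unsigned long k = 63;

	word = ~word;	/* first zero of word == first set bit of ~word */
	if (word & 0x00000000ffffffffULL) { k -= 32; word <<= 32; }
	if (word & 0x0000ffff00000000ULL) { k -= 16; word <<= 16; }
	if (word & 0x00ff000000000000ULL) { k -= 8;  word <<= 8;  }
	if (word & 0x0f00000000000000ULL) { k -= 4;  word <<= 4;  }
	if (word & 0x3000000000000000ULL) { k -= 2;  word <<= 2;  }
	if (word & 0x4000000000000000ULL) { k -= 1; }
	return k;
}

int main(void)
{
	assert(ffz64(0x0ULL) == 0);
	assert(ffz64(0xffULL) == 8);
	return 0;
}
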
3211 * @offset: The bitnumber to start searching at
3212 * @size: The maximum size to search
3214 -static inline unsigned long find_next_zero_bit(void *addr, unsigned long size,
3215 - unsigned long offset)
3216 +extern __inline__ unsigned long
3217 +find_next_zero_bit(void *addr, unsigned long size, unsigned long offset)
3219 unsigned long *p = ((unsigned long *) addr) + (offset >> SZLONG_LOG);
3220 unsigned long result = offset & ~SZLONG_MASK;
3222 #define hweight16(x) generic_hweight16(x)
3223 #define hweight8(x) generic_hweight8(x)
3225 -static inline int __test_and_set_le_bit(unsigned long nr, void * addr)
3227 +__test_and_set_le_bit(unsigned long nr, void * addr)
3229 unsigned char *ADDR = (unsigned char *) addr;
3235 -static inline int __test_and_clear_le_bit(unsigned long nr, void * addr)
3237 +__test_and_clear_le_bit(unsigned long nr, void * addr)
3239 unsigned char *ADDR = (unsigned char *) addr;
3245 -static inline int test_le_bit(unsigned long nr, const void * addr)
3247 +test_le_bit(unsigned long nr, const void * addr)
3249 const unsigned char *ADDR = (const unsigned char *) addr;
3255 -static inline unsigned long find_next_zero_le_bit(void *addr,
3256 +extern inline unsigned long find_next_zero_le_bit(void *addr,
3257 unsigned long size, unsigned long offset)
3259 unsigned int *p = ((unsigned int *) addr) + (offset >> 5);
3260 diff -urN linux-2.4.22.org/include/asm-ppc/bitops.h linux-2.4.22/include/asm-ppc/bitops.h
3261 --- linux-2.4.22.org/include/asm-ppc/bitops.h 2003-11-24 18:28:28.000000000 +0100
3262 +++ linux-2.4.22/include/asm-ppc/bitops.h 2003-11-24 18:39:03.000000000 +0100
3264 #define _PPC_BITOPS_H
3266 #include <linux/config.h>
3267 +#include <linux/compiler.h>
3268 #include <asm/byteorder.h>
3269 #include <asm/atomic.h>
3272 * These used to be if'd out here because using : "cc" as a constraint
3273 * resulted in errors from egcs. Things appear to be OK with gcc-2.95.
3275 -static __inline__ void set_bit(int nr, volatile void * addr)
3276 +static __inline__ void set_bit(int nr, volatile unsigned long * addr)
3279 unsigned long mask = 1 << (nr & 0x1f);
3282 * non-atomic version
3284 -static __inline__ void __set_bit(int nr, volatile void *addr)
3285 +static __inline__ void __set_bit(int nr, volatile unsigned long *addr)
3287 unsigned long mask = 1 << (nr & 0x1f);
3288 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3290 #define smp_mb__before_clear_bit() smp_mb()
3291 #define smp_mb__after_clear_bit() smp_mb()
3293 -static __inline__ void clear_bit(int nr, volatile void *addr)
3294 +static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
3297 unsigned long mask = 1 << (nr & 0x1f);
3300 * non-atomic version
3302 -static __inline__ void __clear_bit(int nr, volatile void *addr)
3303 +static __inline__ void __clear_bit(int nr, volatile unsigned long *addr)
3305 unsigned long mask = 1 << (nr & 0x1f);
3306 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3311 -static __inline__ void change_bit(int nr, volatile void *addr)
3312 +static __inline__ void change_bit(int nr, volatile unsigned long *addr)
3315 unsigned long mask = 1 << (nr & 0x1f);
3318 * non-atomic version
3320 -static __inline__ void __change_bit(int nr, volatile void *addr)
3321 +static __inline__ void __change_bit(int nr, volatile unsigned long *addr)
3323 unsigned long mask = 1 << (nr & 0x1f);
3324 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3327 * test_and_*_bit do imply a memory barrier (?)
3329 -static __inline__ int test_and_set_bit(int nr, volatile void *addr)
3330 +static __inline__ int test_and_set_bit(int nr, volatile unsigned long *addr)
3332 unsigned int old, t;
3333 unsigned int mask = 1 << (nr & 0x1f);
3336 * non-atomic version
3338 -static __inline__ int __test_and_set_bit(int nr, volatile void *addr)
3339 +static __inline__ int __test_and_set_bit(int nr, volatile unsigned long *addr)
3341 unsigned long mask = 1 << (nr & 0x1f);
3342 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3344 return (old & mask) != 0;
3347 -static __inline__ int test_and_clear_bit(int nr, volatile void *addr)
3348 +static __inline__ int test_and_clear_bit(int nr, volatile unsigned long *addr)
3350 unsigned int old, t;
3351 unsigned int mask = 1 << (nr & 0x1f);
3354 * non-atomic version
3356 -static __inline__ int __test_and_clear_bit(int nr, volatile void *addr)
3357 +static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr)
3359 unsigned long mask = 1 << (nr & 0x1f);
3360 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3362 return (old & mask) != 0;
3365 -static __inline__ int test_and_change_bit(int nr, volatile void *addr)
3366 +static __inline__ int test_and_change_bit(int nr, volatile unsigned long *addr)
3368 unsigned int old, t;
3369 unsigned int mask = 1 << (nr & 0x1f);
3372 * non-atomic version
3374 -static __inline__ int __test_and_change_bit(int nr, volatile void *addr)
3375 +static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr)
3377 unsigned long mask = 1 << (nr & 0x1f);
3378 unsigned long *p = ((unsigned long *)addr) + (nr >> 5);
3380 return (old & mask) != 0;
3383 -static __inline__ int test_bit(int nr, __const__ volatile void *addr)
3384 +static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr)
3386 __const__ unsigned int *p = (__const__ unsigned int *) addr;
3391 /* Return the bit position of the most significant 1 bit in a word */
3392 -static __inline__ int __ilog2(unsigned int x)
3393 +static __inline__ int __ilog2(unsigned long x)
3397 @@ -234,13 +235,18 @@
3401 -static __inline__ int ffz(unsigned int x)
3402 +static __inline__ int ffz(unsigned long x)
3406 return __ilog2(x & -x);
3409 +static inline int __ffs(unsigned long x)
3411 + return __ilog2(x & -x);
3415 * ffs: find first bit set. This is defined the same way as
3416 * the libc and compiler builtin ffs routines, therefore
3417 @@ -252,6 +258,18 @@
3421 + * fls: find last (most-significant) bit set.
3422 + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
3424 +static __inline__ int fls(unsigned int x)
3428 + asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x));
3433 * hweightN: returns the hamming weight (i.e. the number
3434 * of bits set) of a N-bit word
3436 @@ -261,13 +279,86 @@
3437 #define hweight8(x) generic_hweight8(x)
3440 + * Find the first bit set in a 140-bit bitmap.
3441 + * The first 100 bits are unlikely to be set.
3443 +static inline int sched_find_first_bit(unsigned long *b)
3445 + if (unlikely(b[0]))
3446 + return __ffs(b[0]);
3447 + if (unlikely(b[1]))
3448 + return __ffs(b[1]) + 32;
3449 + if (unlikely(b[2]))
3450 + return __ffs(b[2]) + 64;
3452 + return __ffs(b[3]) + 96;
3453 + return __ffs(b[4]) + 128;
3457 + * find_next_bit - find the next set bit in a memory region
3458 + * @addr: The address to base the search on
3459 + * @offset: The bitnumber to start searching at
3460 + * @size: The maximum size to search
3462 +static __inline__ unsigned long find_next_bit(unsigned long *addr,
3463 + unsigned long size, unsigned long offset)
3465 + unsigned int *p = ((unsigned int *) addr) + (offset >> 5);
3466 + unsigned int result = offset & ~31UL;
3469 + if (offset >= size)
3475 + tmp &= ~0UL << offset;
3479 + goto found_middle;
3483 + while (size >= 32) {
3484 + if ((tmp = *p++) != 0)
3485 + goto found_middle;
3494 + tmp &= ~0UL >> (32 - size);
3495 + if (tmp == 0UL) /* Are any bits set? */
3496 + return result + size; /* Nope. */
3498 + return result + __ffs(tmp);
3502 + * find_first_bit - find the first set bit in a memory region
3503 + * @addr: The address to start the search at
3504 + * @size: The maximum size to search
3506 + * Returns the bit-number of the first set bit, not the number of the byte
3507 + * containing a bit.
3509 +#define find_first_bit(addr, size) \
3510 + find_next_bit((addr), (size), 0)
3513 * This implementation of find_{first,next}_zero_bit was stolen from
3514 * Linus' asm-alpha/bitops.h.
3516 #define find_first_zero_bit(addr, size) \
3517 find_next_zero_bit((addr), (size), 0)
3519 -static __inline__ unsigned long find_next_zero_bit(void * addr,
3520 +static __inline__ unsigned long find_next_zero_bit(unsigned long * addr,
3521 unsigned long size, unsigned long offset)
3523 unsigned int * p = ((unsigned int *) addr) + (offset >> 5);
3528 -#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, addr)
3529 -#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, addr)
3530 +#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
3531 +#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
3533 static __inline__ int ext2_test_bit(int nr, __const__ void * addr)
3535 diff -urN linux-2.4.22.org/include/asm-ppc/smp.h linux-2.4.22/include/asm-ppc/smp.h
3536 --- linux-2.4.22.org/include/asm-ppc/smp.h 2003-11-24 18:28:28.000000000 +0100
3537 +++ linux-2.4.22/include/asm-ppc/smp.h 2003-11-24 18:39:03.000000000 +0100
3539 #define cpu_logical_map(cpu) (cpu)
3540 #define cpu_number_map(x) (x)
3542 -#define smp_processor_id() (current->processor)
3543 +#define smp_processor_id() (current->cpu)
3545 extern int smp_hw_index[NR_CPUS];
3546 #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()])
3547 diff -urN linux-2.4.22.org/include/asm-ppc64/bitops.h linux-2.4.22/include/asm-ppc64/bitops.h
3548 --- linux-2.4.22.org/include/asm-ppc64/bitops.h 2003-11-24 18:28:17.000000000 +0100
3549 +++ linux-2.4.22/include/asm-ppc64/bitops.h 2003-11-24 18:39:03.000000000 +0100
3551 #define smp_mb__before_clear_bit() smp_mb()
3552 #define smp_mb__after_clear_bit() smp_mb()
3554 -static __inline__ int test_bit(unsigned long nr, __const__ volatile void *addr)
3555 +static __inline__ int test_bit(unsigned long nr, __const__ volatile unsigned long *addr)
3557 return (1UL & (((__const__ long *) addr)[nr >> 6] >> (nr & 63)));
3560 -static __inline__ void set_bit(unsigned long nr, volatile void *addr)
3561 +static __inline__ void set_bit(unsigned long nr, volatile unsigned long *addr)
3564 unsigned long mask = 1UL << (nr & 0x3f);
3569 -static __inline__ void clear_bit(unsigned long nr, volatile void *addr)
3570 +static __inline__ void clear_bit(unsigned long nr, volatile unsigned long *addr)
3573 unsigned long mask = 1UL << (nr & 0x3f);
3578 -static __inline__ void change_bit(unsigned long nr, volatile void *addr)
3579 +static __inline__ void change_bit(unsigned long nr, volatile unsigned long *addr)
3582 unsigned long mask = 1UL << (nr & 0x3f);
3587 -static __inline__ int test_and_set_bit(unsigned long nr, volatile void *addr)
3588 +static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
3590 unsigned long old, t;
3591 unsigned long mask = 1UL << (nr & 0x3f);
3593 return (old & mask) != 0;
3596 -static __inline__ int test_and_clear_bit(unsigned long nr, volatile void *addr)
3597 +static __inline__ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
3599 unsigned long old, t;
3600 unsigned long mask = 1UL << (nr & 0x3f);
3602 return (old & mask) != 0;
3605 -static __inline__ int test_and_change_bit(unsigned long nr, volatile void *addr)
3606 +static __inline__ int test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
3608 unsigned long old, t;
3609 unsigned long mask = 1UL << (nr & 0x3f);
3612 * non-atomic versions
3614 -static __inline__ void __set_bit(unsigned long nr, volatile void *addr)
3615 +static __inline__ void __set_bit(unsigned long nr, volatile unsigned long *addr)
3617 unsigned long mask = 1UL << (nr & 0x3f);
3618 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3623 -static __inline__ void __clear_bit(unsigned long nr, volatile void *addr)
3624 +static __inline__ void __clear_bit(unsigned long nr, volatile unsigned long *addr)
3626 unsigned long mask = 1UL << (nr & 0x3f);
3627 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3632 -static __inline__ void __change_bit(unsigned long nr, volatile void *addr)
3633 +static __inline__ void __change_bit(unsigned long nr, volatile unsigned long *addr)
3635 unsigned long mask = 1UL << (nr & 0x3f);
3636 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3641 -static __inline__ int __test_and_set_bit(unsigned long nr, volatile void *addr)
3642 +static __inline__ int __test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
3644 unsigned long mask = 1UL << (nr & 0x3f);
3645 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3647 return (old & mask) != 0;
3650 -static __inline__ int __test_and_clear_bit(unsigned long nr, volatile void *addr)
3651 +static __inline__ int __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
3653 unsigned long mask = 1UL << (nr & 0x3f);
3654 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3656 return (old & mask) != 0;
3659 -static __inline__ int __test_and_change_bit(unsigned long nr, volatile void *addr)
3660 +static __inline__ int __test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
3662 unsigned long mask = 1UL << (nr & 0x3f);
3663 unsigned long *p = ((unsigned long *)addr) + (nr >> 6);
3664 diff -urN linux-2.4.22.org/include/asm-s390/bitops.h linux-2.4.22/include/asm-s390/bitops.h
3665 --- linux-2.4.22.org/include/asm-s390/bitops.h 2003-11-24 18:28:35.000000000 +0100
3666 +++ linux-2.4.22/include/asm-s390/bitops.h 2003-11-24 18:39:03.000000000 +0100
3667 @@ -47,272 +47,217 @@
3668 extern const char _oi_bitmap[];
3669 extern const char _ni_bitmap[];
3670 extern const char _zb_findmap[];
3671 +extern const char _sb_findmap[];
3675 * SMP safe set_bit routine based on compare and swap (CS)
3677 -static __inline__ void set_bit_cs(int nr, volatile void * addr)
3678 +static inline void set_bit_cs(int nr, volatile void *ptr)
3680 - unsigned long bits, mask;
3681 - __asm__ __volatile__(
3682 + unsigned long addr, old, new, mask;
3684 + addr = (unsigned long) ptr;
3686 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3687 - " nr %2,%1\n" /* isolate last 2 bits of address */
3688 - " xr %1,%2\n" /* make addr % 4 == 0 */
3690 - " ar %0,%2\n" /* add alignement to bitnr */
3691 + addr ^= addr & 3; /* align address to 4 */
3692 + nr += (addr & 3) << 3; /* add alignment to bit number */
3695 - " nr %2,%0\n" /* make shift value */
3699 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3700 - " sll %3,0(%2)\n" /* make OR mask */
3702 - "0: lr %2,%0\n" /* CS loop starts here */
3703 - " or %2,%3\n" /* set bit */
3704 - " cs %0,%2,0(%1)\n"
3706 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
3707 - : "cc", "memory" );
3708 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3709 + mask = 1UL << (nr & 31); /* make OR mask */
3714 + " cs %0,%1,0(%4)\n"
3716 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3717 + : "d" (mask), "a" (addr)
3722 * SMP safe clear_bit routine based on compare and swap (CS)
3724 -static __inline__ void clear_bit_cs(int nr, volatile void * addr)
3725 +static inline void clear_bit_cs(int nr, volatile void *ptr)
3727 - static const int minusone = -1;
3728 - unsigned long bits, mask;
3729 - __asm__ __volatile__(
3730 + unsigned long addr, old, new, mask;
3732 + addr = (unsigned long) ptr;
3734 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3735 - " nr %2,%1\n" /* isolate last 2 bits of address */
3736 - " xr %1,%2\n" /* make addr % 4 == 0 */
3738 - " ar %0,%2\n" /* add alignement to bitnr */
3739 + addr ^= addr & 3; /* align address to 4 */
3740 + nr += (addr & 3) << 3; /* add alignment to bit number */
3743 - " nr %2,%0\n" /* make shift value */
3747 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3749 - " x %3,%4\n" /* make AND mask */
3751 - "0: lr %2,%0\n" /* CS loop starts here */
3752 - " nr %2,%3\n" /* clear bit */
3753 - " cs %0,%2,0(%1)\n"
3755 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask)
3756 - : "m" (minusone) : "cc", "memory" );
3757 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3758 + mask = ~(1UL << (nr & 31)); /* make AND mask */
3763 + " cs %0,%1,0(%4)\n"
3765 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3766 + : "d" (mask), "a" (addr)
3771 * SMP safe change_bit routine based on compare and swap (CS)
3773 -static __inline__ void change_bit_cs(int nr, volatile void * addr)
3774 +static inline void change_bit_cs(int nr, volatile void *ptr)
3776 - unsigned long bits, mask;
3777 - __asm__ __volatile__(
3778 + unsigned long addr, old, new, mask;
3780 + addr = (unsigned long) ptr;
3782 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3783 - " nr %2,%1\n" /* isolate last 2 bits of address */
3784 - " xr %1,%2\n" /* make addr % 4 == 0 */
3786 - " ar %0,%2\n" /* add alignement to bitnr */
3787 + nr += (addr & 3) << 3; /* add alignment to bit number */
3788 + addr ^= addr & 3; /* align address to 4 */
3791 - " nr %2,%0\n" /* make shift value */
3795 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3796 - " sll %3,0(%2)\n" /* make XR mask */
3798 - "0: lr %2,%0\n" /* CS loop starts here */
3799 - " xr %2,%3\n" /* change bit */
3800 - " cs %0,%2,0(%1)\n"
3802 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
3803 - : "cc", "memory" );
3804 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3805 + mask = 1UL << (nr & 31); /* make XOR mask */
3810 + " cs %0,%1,0(%4)\n"
3812 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3813 + : "d" (mask), "a" (addr)
3818 * SMP safe test_and_set_bit routine based on compare and swap (CS)
3820 -static __inline__ int test_and_set_bit_cs(int nr, volatile void * addr)
3821 +static inline int test_and_set_bit_cs(int nr, volatile void *ptr)
3823 - unsigned long bits, mask;
3824 - __asm__ __volatile__(
3825 + unsigned long addr, old, new, mask;
3827 + addr = (unsigned long) ptr;
3829 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3830 - " nr %2,%1\n" /* isolate last 2 bits of address */
3831 - " xr %1,%2\n" /* make addr % 4 == 0 */
3833 - " ar %0,%2\n" /* add alignement to bitnr */
3834 + nr += (addr & 3) << 3; /* add alignment to bit number */
3835 + addr ^= addr & 3; /* align address to 4 */
3838 - " nr %2,%0\n" /* make shift value */
3842 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3843 - " sll %3,0(%2)\n" /* make OR mask */
3845 - "0: lr %2,%0\n" /* CS loop starts here */
3846 - " or %2,%3\n" /* set bit */
3847 - " cs %0,%2,0(%1)\n"
3849 - " nr %0,%3\n" /* isolate old bit */
3850 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
3851 - : "cc", "memory" );
3853 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3854 + mask = 1UL << (nr & 31); /* make OR/test mask */
3859 + " cs %0,%1,0(%4)\n"
3861 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3862 + : "d" (mask), "a" (addr)
3864 + return (old & mask) != 0;
3868 * SMP safe test_and_clear_bit routine based on compare and swap (CS)
3870 -static __inline__ int test_and_clear_bit_cs(int nr, volatile void * addr)
3871 +static inline int test_and_clear_bit_cs(int nr, volatile void *ptr)
3873 - static const int minusone = -1;
3874 - unsigned long bits, mask;
3875 - __asm__ __volatile__(
3876 + unsigned long addr, old, new, mask;
3878 + addr = (unsigned long) ptr;
3880 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3881 - " nr %2,%1\n" /* isolate last 2 bits of address */
3882 - " xr %1,%2\n" /* make addr % 4 == 0 */
3884 - " ar %0,%2\n" /* add alignement to bitnr */
3885 + nr += (addr & 3) << 3; /* add alignment to bit number */
3886 + addr ^= addr & 3; /* align address to 4 */
3889 - " nr %2,%0\n" /* make shift value */
3893 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3896 - " x %3,%4\n" /* make AND mask */
3897 - "0: lr %2,%0\n" /* CS loop starts here */
3898 - " nr %2,%3\n" /* clear bit */
3899 - " cs %0,%2,0(%1)\n"
3902 - " nr %0,%3\n" /* isolate old bit */
3903 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask)
3904 - : "m" (minusone) : "cc", "memory" );
3906 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3907 + mask = ~(1UL << (nr & 31)); /* make AND mask */
3912 + " cs %0,%1,0(%4)\n"
3914 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3915 + : "d" (mask), "a" (addr)
3917 + return (old ^ new) != 0;
3921 * SMP safe test_and_change_bit routine based on compare and swap (CS)
3923 -static __inline__ int test_and_change_bit_cs(int nr, volatile void * addr)
3924 +static inline int test_and_change_bit_cs(int nr, volatile void *ptr)
3926 - unsigned long bits, mask;
3927 - __asm__ __volatile__(
3928 + unsigned long addr, old, new, mask;
3930 + addr = (unsigned long) ptr;
3932 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */
3933 - " nr %2,%1\n" /* isolate last 2 bits of address */
3934 - " xr %1,%2\n" /* make addr % 4 == 0 */
3936 - " ar %0,%2\n" /* add alignement to bitnr */
3937 + nr += (addr & 3) << 3; /* add alignment to bit number */
3938 + addr ^= addr & 3; /* align address to 4 */
3941 - " nr %2,%0\n" /* make shift value */
3945 - " la %1,0(%0,%1)\n" /* calc. address for CS */
3946 - " sll %3,0(%2)\n" /* make OR mask */
3948 - "0: lr %2,%0\n" /* CS loop starts here */
3949 - " xr %2,%3\n" /* change bit */
3950 - " cs %0,%2,0(%1)\n"
3952 - " nr %0,%3\n" /* isolate old bit */
3953 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
3954 - : "cc", "memory" );
3956 + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */
3957 + mask = 1UL << (nr & 31); /* make XOR mask */
3962 + " cs %0,%1,0(%4)\n"
3964 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr)
3965 + : "d" (mask), "a" (addr)
3967 + return (old & mask) != 0;
3969 #endif /* CONFIG_SMP */
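
All six CS-based routines above now share one shape: C computes the aligned word address and the OR/AND/XOR mask, and the assembly shrinks to a bare compare-and-swap retry loop. A minimal portable sketch of that loop, with GCC's __sync_val_compare_and_swap standing in for the s390 CS instruction (the function name and the builtin are illustrative stand-ins, not part of the patch):

static inline void set_bit_cs_sketch(int nr, volatile void *ptr)
{
	unsigned long addr = (unsigned long) ptr;
	unsigned int old, new, mask;

	nr += (addr & 3) << 3;          /* fold misalignment into the bit number */
	addr ^= addr & 3;               /* align the base to 4 bytes */
	addr += (nr ^ (nr & 31)) >> 3;  /* byte address of the word holding the bit */
	mask = 1U << (nr & 31);
	do {
		old = *(volatile unsigned int *) addr;
		new = old | mask;
	} while (__sync_val_compare_and_swap((unsigned int *) addr, old, new) != old);
}

The test_and_* variants run the same loop and report the old bit from the pre-swap value, e.g. (old & mask) != 0.
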
3972 * fast, non-SMP set_bit routine
3974 -static __inline__ void __set_bit(int nr, volatile void * addr)
3975 +static inline void __set_bit(int nr, volatile void *ptr)
3977 - unsigned long reg1, reg2;
3978 - __asm__ __volatile__(
3984 - " la %1,0(%1,%3)\n"
3985 - " la %0,0(%0,%4)\n"
3986 - " oc 0(1,%1),0(%0)"
3987 - : "=&a" (reg1), "=&a" (reg2)
3988 - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
3991 -static __inline__ void
3992 -__constant_set_bit(const int nr, volatile void * addr)
3996 - __asm__ __volatile__ ("la 1,%0\n\t"
3998 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
3999 - : : "1", "cc", "memory");
4002 - __asm__ __volatile__ ("la 1,%0\n\t"
4004 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4005 - : : "1", "cc", "memory" );
4008 - __asm__ __volatile__ ("la 1,%0\n\t"
4010 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4011 - : : "1", "cc", "memory" );
4014 - __asm__ __volatile__ ("la 1,%0\n\t"
4016 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4017 - : : "1", "cc", "memory" );
4020 - __asm__ __volatile__ ("la 1,%0\n\t"
4022 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4023 - : : "1", "cc", "memory" );
4026 - __asm__ __volatile__ ("la 1,%0\n\t"
4028 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4029 - : : "1", "cc", "memory" );
4032 - __asm__ __volatile__ ("la 1,%0\n\t"
4034 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4035 - : : "1", "cc", "memory" );
4038 - __asm__ __volatile__ ("la 1,%0\n\t"
4040 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4041 - : : "1", "cc", "memory" );
4044 + unsigned long addr;
4046 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4047 + asm volatile("oc 0(1,%1),0(%2)"
4048 + : "+m" (*(char *) addr)
4049 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
4054 +__constant_set_bit(const int nr, volatile void *ptr)
4056 + unsigned long addr;
4058 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3);
4061 + asm volatile ("oi 0(%1),0x01"
4062 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4065 + asm volatile ("oi 0(%1),0x02"
4066 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4069 + asm volatile ("oi 0(%1),0x04"
4070 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4073 + asm volatile ("oi 0(%1),0x08"
4074 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4077 + asm volatile ("oi 0(%1),0x10"
4078 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4081 + asm volatile ("oi 0(%1),0x20"
4082 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4085 + asm volatile ("oi 0(%1),0x40"
4086 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4089 + asm volatile ("oi 0(%1),0x80"
4090 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
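
The (nr >> 3) ^ 3 byte index used by the constant-bit variants is the big-endian address of bit nr: bit 0 lives in the last byte of its 32-bit word and bit 31 in the first. An illustrative equivalence, valid on a big-endian machine such as s390:

	unsigned int word = 0;
	int nr = 5;

	((unsigned char *) &word)[(nr >> 3) ^ 3] |= 1 << (nr & 7);
	/* same effect as the word-wise view: word |= 1U << (nr & 31); */
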
4095 #define set_bit_simple(nr,addr) \
4096 @@ -323,76 +268,58 @@
4098 * fast, non-SMP clear_bit routine
4100 -static __inline__ void
4101 -__clear_bit(int nr, volatile void * addr)
4103 +__clear_bit(int nr, volatile void *ptr)
4105 - unsigned long reg1, reg2;
4106 - __asm__ __volatile__(
4112 - " la %1,0(%1,%3)\n"
4113 - " la %0,0(%0,%4)\n"
4114 - " nc 0(1,%1),0(%0)"
4115 - : "=&a" (reg1), "=&a" (reg2)
4116 - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" );
4119 -static __inline__ void
4120 -__constant_clear_bit(const int nr, volatile void * addr)
4124 - __asm__ __volatile__ ("la 1,%0\n\t"
4126 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4127 - : : "1", "cc", "memory" );
4130 - __asm__ __volatile__ ("la 1,%0\n\t"
4132 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4133 - : : "1", "cc", "memory" );
4136 - __asm__ __volatile__ ("la 1,%0\n\t"
4138 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4139 - : : "1", "cc", "memory" );
4142 - __asm__ __volatile__ ("la 1,%0\n\t"
4144 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4145 - : : "1", "cc", "memory" );
4148 - __asm__ __volatile__ ("la 1,%0\n\t"
4150 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4151 - : : "cc", "memory" );
4154 - __asm__ __volatile__ ("la 1,%0\n\t"
4156 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4157 - : : "1", "cc", "memory" );
4160 - __asm__ __volatile__ ("la 1,%0\n\t"
4162 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4163 - : : "1", "cc", "memory" );
4166 - __asm__ __volatile__ ("la 1,%0\n\t"
4168 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4169 - : : "1", "cc", "memory" );
4172 + unsigned long addr;
4174 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4175 + asm volatile("nc 0(1,%1),0(%2)"
4176 + : "+m" (*(char *) addr)
4177 + : "a" (addr), "a" (_ni_bitmap + (nr & 7))
4182 +__constant_clear_bit(const int nr, volatile void *ptr)
4184 + unsigned long addr;
4186 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3);
4189 + asm volatile ("ni 0(%1),0xFE"
4190 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4193 + asm volatile ("ni 0(%1),0xFD"
4194 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4197 + asm volatile ("ni 0(%1),0xFB"
4198 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4201 + asm volatile ("ni 0(%1),0xF7"
4202 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4205 + asm volatile ("ni 0(%1),0xEF"
4206 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4209 + asm volatile ("ni 0(%1),0xDF"
4210 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4213 + asm volatile ("ni 0(%1),0xBF"
4214 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4217 + asm volatile ("ni 0(%1),0x7F"
4218 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4223 #define clear_bit_simple(nr,addr) \
4224 @@ -403,75 +330,57 @@
4226 * fast, non-SMP change_bit routine
4228 -static __inline__ void __change_bit(int nr, volatile void * addr)
4229 +static inline void __change_bit(int nr, volatile void *ptr)
4231 - unsigned long reg1, reg2;
4232 - __asm__ __volatile__(
4238 - " la %1,0(%1,%3)\n"
4239 - " la %0,0(%0,%4)\n"
4240 - " xc 0(1,%1),0(%0)"
4241 - : "=&a" (reg1), "=&a" (reg2)
4242 - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
4245 -static __inline__ void
4246 -__constant_change_bit(const int nr, volatile void * addr)
4250 - __asm__ __volatile__ ("la 1,%0\n\t"
4252 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4253 - : : "cc", "memory" );
4256 - __asm__ __volatile__ ("la 1,%0\n\t"
4258 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4259 - : : "cc", "memory" );
4262 - __asm__ __volatile__ ("la 1,%0\n\t"
4264 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4265 - : : "cc", "memory" );
4268 - __asm__ __volatile__ ("la 1,%0\n\t"
4270 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4271 - : : "cc", "memory" );
4274 - __asm__ __volatile__ ("la 1,%0\n\t"
4276 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4277 - : : "cc", "memory" );
4280 - __asm__ __volatile__ ("la 1,%0\n\t"
4282 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4283 - : : "1", "cc", "memory" );
4286 - __asm__ __volatile__ ("la 1,%0\n\t"
4288 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4289 - : : "1", "cc", "memory" );
4292 - __asm__ __volatile__ ("la 1,%0\n\t"
4294 - : "=m" (*((volatile char *) addr + ((nr>>3)^3)))
4295 - : : "1", "cc", "memory" );
4298 + unsigned long addr;
4300 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4301 + asm volatile("xc 0(1,%1),0(%2)"
4302 + : "+m" (*(char *) addr)
4303 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
4308 +__constant_change_bit(const int nr, volatile void *ptr)
4310 + unsigned long addr;
4312 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3);
4315 + asm volatile ("xi 0(%1),0x01"
4316 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4319 + asm volatile ("xi 0(%1),0x02"
4320 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4323 + asm volatile ("xi 0(%1),0x04"
4324 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4327 + asm volatile ("xi 0(%1),0x08"
4328 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4331 + asm volatile ("xi 0(%1),0x10"
4332 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4335 + asm volatile ("xi 0(%1),0x20"
4336 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4339 + asm volatile ("xi 0(%1),0x40"
4340 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4343 + asm volatile ("xi 0(%1),0x80"
4344 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
4349 #define change_bit_simple(nr,addr) \
4350 @@ -482,74 +391,54 @@
4352 * fast, non-SMP test_and_set_bit routine
4354 -static __inline__ int test_and_set_bit_simple(int nr, volatile void * addr)
4355 +static inline int test_and_set_bit_simple(int nr, volatile void *ptr)
4357 - unsigned long reg1, reg2;
4359 - __asm__ __volatile__(
4365 - " la %1,0(%1,%4)\n"
4368 - " la %2,0(%2,%5)\n"
4369 - " oc 0(1,%1),0(%2)"
4370 - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2)
4371 - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
4372 - return oldbit & 1;
4373 + unsigned long addr;
4376 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4377 + ch = *(unsigned char *) addr;
4378 + asm volatile("oc 0(1,%1),0(%2)"
4379 + : "+m" (*(char *) addr)
4380 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
4382 + return (ch >> (nr & 7)) & 1;
4384 #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y)
4387 * fast, non-SMP test_and_clear_bit routine
4389 -static __inline__ int test_and_clear_bit_simple(int nr, volatile void * addr)
4390 +static inline int test_and_clear_bit_simple(int nr, volatile void *ptr)
4392 - unsigned long reg1, reg2;
4394 + unsigned long addr;
4397 - __asm__ __volatile__(
4403 - " la %1,0(%1,%4)\n"
4406 - " la %2,0(%2,%5)\n"
4407 - " nc 0(1,%1),0(%2)"
4408 - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2)
4409 - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" );
4410 - return oldbit & 1;
4411 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4412 + ch = *(unsigned char *) addr;
4413 + asm volatile("nc 0(1,%1),0(%2)"
4414 + : "+m" (*(char *) addr)
4415 + : "a" (addr), "a" (_ni_bitmap + (nr & 7))
4417 + return (ch >> (nr & 7)) & 1;
4419 #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y)
4422 * fast, non-SMP test_and_change_bit routine
4424 -static __inline__ int test_and_change_bit_simple(int nr, volatile void * addr)
4425 +static inline int test_and_change_bit_simple(int nr, volatile void *ptr)
4427 - unsigned long reg1, reg2;
4429 + unsigned long addr;
4432 - __asm__ __volatile__(
4438 - " la %1,0(%1,%4)\n"
4441 - " la %2,0(%2,%5)\n"
4442 - " xc 0(1,%1),0(%2)"
4443 - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2)
4444 - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
4445 - return oldbit & 1;
4446 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4447 + ch = *(unsigned char *) addr;
4448 + asm volatile("xc 0(1,%1),0(%2)"
4449 + : "+m" (*(char *) addr)
4450 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
4452 + return (ch >> (nr & 7)) & 1;
4454 #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y)
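
The test_and_*_simple routines above snapshot the target byte, apply a one-byte oc/nc/xc whose second operand indexes a per-bit mask table exported by bitops.S, and derive the return value from the snapshot; that is safe only because the __ variants assume no concurrent writers. The tables presumably hold one mask per bit position, along these lines (the real definitions live in assembly; the _sketch names mark these as illustrations):

const unsigned char _oi_bitmap_sketch[8] =
	{ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };	/* OR masks */
const unsigned char _ni_bitmap_sketch[8] =
	{ 0xFE, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF, 0x7F };	/* AND masks */
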
4456 @@ -574,25 +463,17 @@
4457 * This routine doesn't need to be atomic.
4460 -static __inline__ int __test_bit(int nr, volatile void * addr)
4461 +static inline int __test_bit(int nr, volatile void *ptr)
4463 - unsigned long reg1, reg2;
4465 + unsigned long addr;
4468 - __asm__ __volatile__(
4474 - " ic %0,0(%2,%4)\n"
4476 - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2)
4477 - : "r" (nr), "a" (addr) : "cc" );
4478 - return oldbit & 1;
4479 + addr = (unsigned long) ptr + ((nr ^ 24) >> 3);
4480 + ch = *(unsigned char *) addr;
4481 + return (ch >> (nr & 7)) & 1;
4484 -static __inline__ int __constant_test_bit(int nr, volatile void * addr) {
4485 +static inline int __constant_test_bit(int nr, volatile void * addr) {
4486 return (((volatile char *) addr)[(nr>>3)^3] & (1<<(nr&7))) != 0;
4491 * Find-bit routines..
4493 -static __inline__ int find_first_zero_bit(void * addr, unsigned size)
4494 +static inline int find_first_zero_bit(void * addr, unsigned size)
4496 unsigned long cmp, count;
4498 @@ -642,7 +523,45 @@
4499 return (res < size) ? res : size;
4502 -static __inline__ int find_next_zero_bit (void * addr, int size, int offset)
4503 +static inline int find_first_bit(void * addr, unsigned size)
4505 + unsigned long cmp, count;
4510 + __asm__(" slr %1,%1\n"
4515 + "0: c %1,0(%0,%4)\n"
4521 + "1: l %2,0(%0,%4)\n"
4524 + " tml %2,0xffff\n"
4528 + "2: tml %2,0x00ff\n"
4533 + " ic %2,0(%2,%5)\n"
4536 + : "=&a" (res), "=&d" (cmp), "=&a" (count)
4537 + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" );
4538 + return (res < size) ? res : size;
4541 +static inline int find_next_zero_bit (void * addr, int size, int offset)
4543 unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
4544 unsigned long bitvec, reg;
4545 @@ -680,11 +599,49 @@
4546 return (offset + res);
4549 +static inline int find_next_bit (void * addr, int size, int offset)
4551 + unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
4552 + unsigned long bitvec, reg;
4553 + int set, bit = offset & 31, res;
4557 + * Look for set bit in first word
4559 + bitvec = (*p) >> bit;
4560 + __asm__(" slr %0,%0\n"
4562 + " tml %1,0xffff\n"
4566 + "0: tml %1,0x00ff\n"
4571 + " ic %1,0(%1,%3)\n"
4573 + : "=&d" (set), "+a" (bitvec), "=&d" (reg)
4574 + : "a" (&_sb_findmap) : "cc" );
4575 + if (set < (32 - bit))
4576 + return set + offset;
4577 + offset += 32 - bit;
4581 + * No set bit yet, search remaining full words for a bit
4583 + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr));
4584 + return (offset + res);
4588 * ffz = Find First Zero in word. Undefined if no zero exists,
4589 * so code should check against ~0UL first..
4591 -static __inline__ unsigned long ffz(unsigned long word)
4592 +static inline unsigned long ffz(unsigned long word)
4596 @@ -708,40 +665,109 @@
4600 + * __ffs = find first bit in word. Undefined if no bit exists,
4601 + * so code should check against 0UL first..
4603 +static inline unsigned long __ffs(unsigned long word)
4605 + unsigned long reg, result;
4607 + __asm__(" slr %0,%0\n"
4609 + " tml %1,0xffff\n"
4613 + "0: tml %1,0x00ff\n"
4618 + " ic %1,0(%1,%3)\n"
4620 + : "=&d" (result), "+a" (word), "=&d" (reg)
4621 + : "a" (&_sb_findmap) : "cc" );
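
In portable C the new __ffs() computes the following (reference semantics only; the assembly resolves 16- and 8-bit groups with tml and finishes through the _sb_findmap table instead of looping):

static inline unsigned long __ffs_reference(unsigned long word)
{
	unsigned long bit = 0;

	while (!(word & 1UL)) {	/* caller guarantees word != 0 */
		word >>= 1;
		bit++;
	}
	return bit;
}
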
4626 + * Every architecture must define this function. It's the fastest
4627 + * way of searching a 140-bit bitmap where the first 100 bits are
4628 + * unlikely to be set. It's guaranteed that at least one of the 140
4629 + * bits is set.
4631 +static inline int sched_find_first_bit(unsigned long *b)
4633 + return find_first_bit(b, 140);
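
The 140 here is MAX_PRIO in the O(1) scheduler: 100 real-time priorities plus 40 nice levels. The consumer is the pick-next-task hot path, which runs roughly as follows (simplified for orientation; array, queue and run_list are the scheduler patch's names):

	idx = sched_find_first_bit(array->bitmap);
	queue = array->queue + idx;
	next = list_entry(queue->next, task_t, run_list);
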
4637 * ffs: find first bit set. This is defined the same way as
4638 * the libc and compiler builtin ffs routines, therefore
4639 * differs in spirit from the above ffz (man ffs).
4642 -extern int __inline__ ffs (int x)
4643 +extern inline int ffs (int x)
4650 - __asm__(" slr %0,%0\n"
4651 - " tml %1,0xffff\n"
4653 + __asm__(" tml %1,0xffff\n"
4658 "0: tml %1,0x00ff\n"
4663 "1: tml %1,0x000f\n"
4668 "2: tml %1,0x0003\n"
4673 "3: tml %1,0x0001\n"
4677 : "=&d" (r), "+d" (x) : : "cc" );
4683 + * fls: find last bit set.
4685 +extern inline int fls(int x)
4691 + __asm__(" tmh %1,0xffff\n"
4695 + "0: tmh %1,0xff00\n"
4699 + "1: tmh %1,0xf000\n"
4703 + "2: tmh %1,0xc000\n"
4707 + "3: tmh %1,0x8000\n"
4711 + : "+d" (r), "+d" (x) : : "cc" );
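
fls() mirrors ffs(): fls(0) == 0, fls(1) == 1, fls(0x80000000) == 32. Reference semantics in portable C (illustrative only; the assembly above narrows the search with tmh instead of looping):

static inline int fls_reference(int x)
{
	unsigned int v = x;
	int r = 0;

	while (v) {	/* position of highest set bit, 1-based; 0 for v == 0 */
		v >>= 1;
		r++;
	}
	return r;
}
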
4717 #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^24, addr)
4718 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^24, addr)
4719 #define ext2_test_bit(nr, addr) test_bit((nr)^24, addr)
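
ext2 keeps its on-disk bitmaps little-endian: bit nr lives in byte nr/8 with weight 1 << (nr % 8). XOR-ing the bit number with 24 flips the byte index inside a 32-bit word, which makes the native big-endian bit operations land on that byte. A quick illustrative check (not part of the patch):

	unsigned char map[4] = { 0x01, 0x00, 0x00, 0x00 };	/* ext2 bit 0 set */

	/* ext2_test_bit(0, map) == test_bit(0 ^ 24, map): that reads byte
	   (24 ^ 24) >> 3 == 0 at bit 24 & 7 == 0, i.e. the 0x01, as required. */
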
4720 -static __inline__ int ext2_find_first_zero_bit(void *vaddr, unsigned size)
4721 +static inline int ext2_find_first_zero_bit(void *vaddr, unsigned size)
4723 unsigned long cmp, count;
4726 return (res < size) ? res : size;
4729 -static __inline__ int
4731 ext2_find_next_zero_bit(void *vaddr, unsigned size, unsigned offset)
4733 unsigned long *addr = vaddr;
4734 diff -urN linux-2.4.22.org/include/asm-s390x/bitops.h linux-2.4.22/include/asm-s390x/bitops.h
4735 --- linux-2.4.22.org/include/asm-s390x/bitops.h 2003-11-24 18:28:36.000000000 +0100
4736 +++ linux-2.4.22/include/asm-s390x/bitops.h 2003-11-24 18:39:03.000000000 +0100
4737 @@ -51,271 +51,220 @@
4738 extern const char _oi_bitmap[];
4739 extern const char _ni_bitmap[];
4740 extern const char _zb_findmap[];
4741 +extern const char _sb_findmap[];
4745 * SMP safe set_bit routine based on compare and swap (CS)
4747 -static __inline__ void set_bit_cs(unsigned long nr, volatile void * addr)
4748 +static inline void set_bit_cs(unsigned long nr, volatile void *ptr)
4750 - unsigned long bits, mask;
4751 - __asm__ __volatile__(
4752 + unsigned long addr, old, new, mask;
4754 + addr = (unsigned long) ptr;
4756 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4757 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4758 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4760 - " agr %0,%2\n" /* add alignement to bitnr */
4761 + nr += (addr & 7) << 3; /* add alignment to bit number */
4762 + addr ^= addr & 7; /* align address to 8 */
4765 - " nr %2,%0\n" /* make shift value */
4769 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4770 - " sllg %3,%3,0(%2)\n" /* make OR mask */
4772 - "0: lgr %2,%0\n" /* CS loop starts here */
4773 - " ogr %2,%3\n" /* set bit */
4774 - " csg %0,%2,0(%1)\n"
4776 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4777 - : "cc", "memory" );
4778 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4779 + mask = 1UL << (nr & 63); /* make OR mask */
4784 + " csg %0,%1,0(%4)\n"
4786 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4787 + : "d" (mask), "a" (addr)
4792 * SMP safe clear_bit routine based on compare and swap (CS)
4794 -static __inline__ void clear_bit_cs(unsigned long nr, volatile void * addr)
4795 +static inline void clear_bit_cs(unsigned long nr, volatile void *ptr)
4797 - unsigned long bits, mask;
4798 - __asm__ __volatile__(
4799 + unsigned long addr, old, new, mask;
4801 + addr = (unsigned long) ptr;
4803 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4804 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4805 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4807 - " agr %0,%2\n" /* add alignement to bitnr */
4808 + nr += (addr & 7) << 3; /* add alignment to bit number */
4809 + addr ^= addr & 7; /* align address to 8 */
4812 - " nr %2,%0\n" /* make shift value */
4816 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4818 - " rllg %3,%3,0(%2)\n" /* make AND mask */
4820 - "0: lgr %2,%0\n" /* CS loop starts here */
4821 - " ngr %2,%3\n" /* clear bit */
4822 - " csg %0,%2,0(%1)\n"
4824 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4825 - : "cc", "memory" );
4826 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4827 + mask = ~(1UL << (nr & 63)); /* make AND mask */
4832 + " csg %0,%1,0(%4)\n"
4834 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4835 + : "d" (mask), "a" (addr)
4840 * SMP safe change_bit routine based on compare and swap (CS)
4842 -static __inline__ void change_bit_cs(unsigned long nr, volatile void * addr)
4843 +static inline void change_bit_cs(unsigned long nr, volatile void *ptr)
4845 - unsigned long bits, mask;
4846 - __asm__ __volatile__(
4847 + unsigned long addr, old, new, mask;
4849 + addr = (unsigned long) ptr;
4851 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4852 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4853 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4855 - " agr %0,%2\n" /* add alignement to bitnr */
4856 + nr += (addr & 7) << 3; /* add alignment to bit number */
4857 + addr ^= addr & 7; /* align address to 8 */
4860 - " nr %2,%0\n" /* make shift value */
4864 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4865 - " sllg %3,%3,0(%2)\n" /* make XR mask */
4867 - "0: lgr %2,%0\n" /* CS loop starts here */
4868 - " xgr %2,%3\n" /* change bit */
4869 - " csg %0,%2,0(%1)\n"
4871 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4872 - : "cc", "memory" );
4873 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4874 + mask = 1UL << (nr & 63); /* make XOR mask */
4879 + " csg %0,%1,0(%4)\n"
4881 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4882 + : "d" (mask), "a" (addr)
4887 * SMP safe test_and_set_bit routine based on compare and swap (CS)
4889 -static __inline__ int
4890 -test_and_set_bit_cs(unsigned long nr, volatile void * addr)
4892 +test_and_set_bit_cs(unsigned long nr, volatile void *ptr)
4894 - unsigned long bits, mask;
4895 - __asm__ __volatile__(
4896 + unsigned long addr, old, new, mask;
4898 + addr = (unsigned long) ptr;
4900 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4901 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4902 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4904 - " agr %0,%2\n" /* add alignement to bitnr */
4905 + nr += (addr & 7) << 3; /* add alignment to bit number */
4906 + addr ^= addr & 7; /* align address to 8 */
4909 - " nr %2,%0\n" /* make shift value */
4913 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4914 - " sllg %3,%3,0(%2)\n" /* make OR mask */
4916 - "0: lgr %2,%0\n" /* CS loop starts here */
4917 - " ogr %2,%3\n" /* set bit */
4918 - " csg %0,%2,0(%1)\n"
4920 - " ngr %0,%3\n" /* isolate old bit */
4921 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4922 - : "cc", "memory" );
4924 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4925 + mask = 1UL << (nr & 63); /* make OR/test mask */
4930 + " csg %0,%1,0(%4)\n"
4932 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4933 + : "d" (mask), "a" (addr)
4935 + return (old & mask) != 0;
4939 * SMP safe test_and_clear_bit routine based on compare and swap (CS)
4941 -static __inline__ int
4942 -test_and_clear_bit_cs(unsigned long nr, volatile void * addr)
4944 +test_and_clear_bit_cs(unsigned long nr, volatile void *ptr)
4946 - unsigned long bits, mask;
4947 - __asm__ __volatile__(
4948 + unsigned long addr, old, new, mask;
4950 + addr = (unsigned long) ptr;
4952 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
4953 - " ngr %2,%1\n" /* isolate last 2 bits of address */
4954 - " xgr %1,%2\n" /* make addr % 4 == 0 */
4956 - " agr %0,%2\n" /* add alignement to bitnr */
4957 + nr += (addr & 7) << 3; /* add alignment to bit number */
4958 + addr ^= addr & 7; /* align address to 8 */
4961 - " nr %2,%0\n" /* make shift value */
4965 - " la %1,0(%0,%1)\n" /* calc. address for CS */
4966 - " rllg %3,%3,0(%2)\n" /* make AND mask */
4968 - "0: lgr %2,%0\n" /* CS loop starts here */
4969 - " ngr %2,%3\n" /* clear bit */
4970 - " csg %0,%2,0(%1)\n"
4972 - " xgr %0,%2\n" /* isolate old bit */
4973 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
4974 - : "cc", "memory" );
4976 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
4977 + mask = ~(1UL << (nr & 63)); /* make AND mask */
4982 + " csg %0,%1,0(%4)\n"
4984 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
4985 + : "d" (mask), "a" (addr)
4987 + return (old ^ new) != 0;
4991 * SMP safe test_and_change_bit routine based on compare and swap (CS)
4993 -static __inline__ int
4994 -test_and_change_bit_cs(unsigned long nr, volatile void * addr)
4996 +test_and_change_bit_cs(unsigned long nr, volatile void *ptr)
4998 - unsigned long bits, mask;
4999 - __asm__ __volatile__(
5000 + unsigned long addr, old, new, mask;
5002 + addr = (unsigned long) ptr;
5004 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */
5005 - " ngr %2,%1\n" /* isolate last 2 bits of address */
5006 - " xgr %1,%2\n" /* make addr % 4 == 0 */
5008 - " agr %0,%2\n" /* add alignement to bitnr */
5009 + nr += (addr & 7) << 3; /* add alignment to bit number */
5010 + addr ^= addr & 7; /* align address to 8 */
5013 - " nr %2,%0\n" /* make shift value */
5017 - " la %1,0(%0,%1)\n" /* calc. address for CS */
5018 - " sllg %3,%3,0(%2)\n" /* make OR mask */
5020 - "0: lgr %2,%0\n" /* CS loop starts here */
5021 - " xgr %2,%3\n" /* change bit */
5022 - " csg %0,%2,0(%1)\n"
5024 - " ngr %0,%3\n" /* isolate old bit */
5025 - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) :
5026 - : "cc", "memory" );
5028 + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */
5029 + mask = 1UL << (nr & 63); /* make XOR mask */
5034 + " csg %0,%1,0(%4)\n"
5036 + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr)
5037 + : "d" (mask), "a" (addr)
5039 + return (old & mask) != 0;
5041 #endif /* CONFIG_SMP */
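
The 64-bit variants differ from the s390 ones only in width: alignment is to 8 bytes, masks are built from nr & 63, and CSG swaps a doubleword. One detail worth spelling out: test_and_clear_bit_cs returns (old ^ new) != 0, and since old and new can differ only in the targeted bit, that equals the old bit value. Worked through (illustrative): clearing bit 4 of old = 0xf0 gives new = 0xe0, so old ^ new = 0x10 != 0, i.e. the bit was set.
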
5044 * fast, non-SMP set_bit routine
5046 -static __inline__ void __set_bit(unsigned long nr, volatile void * addr)
5047 +static inline void __set_bit(unsigned long nr, volatile void *ptr)
5049 - unsigned long reg1, reg2;
5050 - __asm__ __volatile__(
5056 - " la %1,0(%1,%3)\n"
5057 - " la %0,0(%0,%4)\n"
5058 - " oc 0(1,%1),0(%0)"
5059 - : "=&a" (reg1), "=&a" (reg2)
5060 - : "a" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
5063 -static __inline__ void
5064 -__constant_set_bit(const unsigned long nr, volatile void * addr)
5068 - __asm__ __volatile__ ("la 1,%0\n\t"
5070 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5071 - : : "1", "cc", "memory");
5074 - __asm__ __volatile__ ("la 1,%0\n\t"
5076 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5077 - : : "1", "cc", "memory" );
5080 - __asm__ __volatile__ ("la 1,%0\n\t"
5082 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5083 - : : "1", "cc", "memory" );
5086 - __asm__ __volatile__ ("la 1,%0\n\t"
5088 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5089 - : : "1", "cc", "memory" );
5092 - __asm__ __volatile__ ("la 1,%0\n\t"
5094 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5095 - : : "1", "cc", "memory" );
5098 - __asm__ __volatile__ ("la 1,%0\n\t"
5100 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5101 - : : "1", "cc", "memory" );
5104 - __asm__ __volatile__ ("la 1,%0\n\t"
5106 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5107 - : : "1", "cc", "memory" );
5110 - __asm__ __volatile__ ("la 1,%0\n\t"
5112 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5113 - : : "1", "cc", "memory" );
5116 + unsigned long addr;
5118 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5119 + asm volatile("oc 0(1,%1),0(%2)"
5120 + : "+m" (*(char *) addr)
5121 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
5126 +__constant_set_bit(const unsigned long nr, volatile void *ptr)
5128 + unsigned long addr;
5130 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7);
5133 + asm volatile ("oi 0(%1),0x01"
5134 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5137 + asm volatile ("oi 0(%1),0x02"
5138 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5141 + asm volatile ("oi 0(%1),0x04"
5142 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5145 + asm volatile ("oi 0(%1),0x08"
5146 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5149 + asm volatile ("oi 0(%1),0x10"
5150 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5153 + asm volatile ("oi 0(%1),0x20"
5154 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5157 + asm volatile ("oi 0(%1),0x40"
5158 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5161 + asm volatile ("oi 0(%1),0x80"
5162 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5167 #define set_bit_simple(nr,addr) \
5168 @@ -326,76 +275,58 @@
5170 * fast, non-SMP clear_bit routine
5172 -static __inline__ void
5173 -__clear_bit(unsigned long nr, volatile void * addr)
5175 +__clear_bit(unsigned long nr, volatile void *ptr)
5177 - unsigned long reg1, reg2;
5178 - __asm__ __volatile__(
5184 - " la %1,0(%1,%3)\n"
5185 - " la %0,0(%0,%4)\n"
5186 - " nc 0(1,%1),0(%0)"
5187 - : "=&a" (reg1), "=&a" (reg2)
5188 - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" );
5191 -static __inline__ void
5192 -__constant_clear_bit(const unsigned long nr, volatile void * addr)
5196 - __asm__ __volatile__ ("la 1,%0\n\t"
5198 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5199 - : : "1", "cc", "memory" );
5202 - __asm__ __volatile__ ("la 1,%0\n\t"
5204 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5205 - : : "1", "cc", "memory" );
5208 - __asm__ __volatile__ ("la 1,%0\n\t"
5210 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5211 - : : "1", "cc", "memory" );
5214 - __asm__ __volatile__ ("la 1,%0\n\t"
5216 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5217 - : : "1", "cc", "memory" );
5220 - __asm__ __volatile__ ("la 1,%0\n\t"
5222 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5223 - : : "cc", "memory" );
5226 - __asm__ __volatile__ ("la 1,%0\n\t"
5228 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5229 - : : "1", "cc", "memory" );
5232 - __asm__ __volatile__ ("la 1,%0\n\t"
5234 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5235 - : : "1", "cc", "memory" );
5238 - __asm__ __volatile__ ("la 1,%0\n\t"
5240 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5241 - : : "1", "cc", "memory" );
5244 + unsigned long addr;
5246 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5247 + asm volatile("nc 0(1,%1),0(%2)"
5248 + : "+m" (*(char *) addr)
5249 + : "a" (addr), "a" (_ni_bitmap + (nr & 7))
5254 +__constant_clear_bit(const unsigned long nr, volatile void *ptr)
5256 + unsigned long addr;
5258 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7);
5261 + asm volatile ("ni 0(%1),0xFE"
5262 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5265 + asm volatile ("ni 0(%1),0xFD"
5266 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5269 + asm volatile ("ni 0(%1),0xFB"
5270 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5273 + asm volatile ("ni 0(%1),0xF7"
5274 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5277 + asm volatile ("ni 0(%1),0xEF"
5278 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5281 + asm volatile ("ni 0(%1),0xDF"
5282 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5285 + asm volatile ("ni 0(%1),0xBF"
5286 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5289 + asm volatile ("ni 0(%1),0x7F"
5290 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5295 #define clear_bit_simple(nr,addr) \
5296 @@ -406,75 +337,57 @@
5298 * fast, non-SMP change_bit routine
5300 -static __inline__ void __change_bit(unsigned long nr, volatile void * addr)
5301 +static inline void __change_bit(unsigned long nr, volatile void *ptr)
5303 - unsigned long reg1, reg2;
5304 - __asm__ __volatile__(
5310 - " la %1,0(%1,%3)\n"
5311 - " la %0,0(%0,%4)\n"
5312 - " xc 0(1,%1),0(%0)"
5313 - : "=&a" (reg1), "=&a" (reg2)
5314 - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
5317 -static __inline__ void
5318 -__constant_change_bit(const unsigned long nr, volatile void * addr)
5322 - __asm__ __volatile__ ("la 1,%0\n\t"
5324 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5325 - : : "cc", "memory" );
5328 - __asm__ __volatile__ ("la 1,%0\n\t"
5330 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5331 - : : "cc", "memory" );
5334 - __asm__ __volatile__ ("la 1,%0\n\t"
5336 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5337 - : : "cc", "memory" );
5340 - __asm__ __volatile__ ("la 1,%0\n\t"
5342 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5343 - : : "cc", "memory" );
5346 - __asm__ __volatile__ ("la 1,%0\n\t"
5348 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5349 - : : "cc", "memory" );
5352 - __asm__ __volatile__ ("la 1,%0\n\t"
5354 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5355 - : : "1", "cc", "memory" );
5358 - __asm__ __volatile__ ("la 1,%0\n\t"
5360 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5361 - : : "1", "cc", "memory" );
5364 - __asm__ __volatile__ ("la 1,%0\n\t"
5366 - : "=m" (*((volatile char *) addr + ((nr>>3)^7)))
5367 - : : "1", "cc", "memory" );
5370 + unsigned long addr;
5372 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5373 + asm volatile("xc 0(1,%1),0(%2)"
5374 + : "+m" (*(char *) addr)
5375 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
5380 +__constant_change_bit(const unsigned long nr, volatile void *ptr)
5382 + unsigned long addr;
5384 + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7);
5387 + asm volatile ("xi 0(%1),0x01"
5388 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5391 + asm volatile ("xi 0(%1),0x02"
5392 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5395 + asm volatile ("xi 0(%1),0x04"
5396 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5399 + asm volatile ("xi 0(%1),0x08"
5400 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5403 + asm volatile ("xi 0(%1),0x10"
5404 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5407 + asm volatile ("xi 0(%1),0x20"
5408 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5411 + asm volatile ("xi 0(%1),0x40"
5412 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5415 + asm volatile ("xi 0(%1),0x80"
5416 + : "+m" (*(char *) addr) : "a" (addr) : "cc" );
5421 #define change_bit_simple(nr,addr) \
5422 @@ -485,77 +398,57 @@
5424 * fast, non-SMP test_and_set_bit routine
5426 -static __inline__ int
5427 -test_and_set_bit_simple(unsigned long nr, volatile void * addr)
5429 +test_and_set_bit_simple(unsigned long nr, volatile void *ptr)
5431 - unsigned long reg1, reg2;
5433 - __asm__ __volatile__(
5439 - " la %1,0(%1,%4)\n"
5442 - " la %2,0(%2,%5)\n"
5443 - " oc 0(1,%1),0(%2)"
5444 - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2)
5445 - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
5446 - return oldbit & 1;
5447 + unsigned long addr;
5450 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5451 + ch = *(unsigned char *) addr;
5452 + asm volatile("oc 0(1,%1),0(%2)"
5453 + : "+m" (*(char *) addr)
5454 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
5456 + return (ch >> (nr & 7)) & 1;
5458 #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y)
5461 * fast, non-SMP test_and_clear_bit routine
5463 -static __inline__ int
5464 -test_and_clear_bit_simple(unsigned long nr, volatile void * addr)
5466 +test_and_clear_bit_simple(unsigned long nr, volatile void *ptr)
5468 - unsigned long reg1, reg2;
5470 + unsigned long addr;
5473 - __asm__ __volatile__(
5479 - " la %1,0(%1,%4)\n"
5482 - " la %2,0(%2,%5)\n"
5483 - " nc 0(1,%1),0(%2)"
5484 - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2)
5485 - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" );
5486 - return oldbit & 1;
5487 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5488 + ch = *(unsigned char *) addr;
5489 + asm volatile("nc 0(1,%1),0(%2)"
5490 + : "+m" (*(char *) addr)
5491 + : "a" (addr), "a" (_ni_bitmap + (nr & 7))
5493 + return (ch >> (nr & 7)) & 1;
5495 #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y)
5498 * fast, non-SMP test_and_change_bit routine
5500 -static __inline__ int
5501 -test_and_change_bit_simple(unsigned long nr, volatile void * addr)
5503 +test_and_change_bit_simple(unsigned long nr, volatile void *ptr)
5505 - unsigned long reg1, reg2;
5507 + unsigned long addr;
5510 - __asm__ __volatile__(
5516 - " la %1,0(%1,%4)\n"
5519 - " la %2,0(%2,%5)\n"
5520 - " xc 0(1,%1),0(%2)"
5521 - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2)
5522 - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" );
5523 - return oldbit & 1;
5524 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5525 + ch = *(unsigned char *) addr;
5526 + asm volatile("xc 0(1,%1),0(%2)"
5527 + : "+m" (*(char *) addr)
5528 + : "a" (addr), "a" (_oi_bitmap + (nr & 7))
5530 + return (ch >> (nr & 7)) & 1;
5532 #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y)
5534 @@ -580,26 +473,18 @@
5535 * This routine doesn't need to be atomic.
5538 -static __inline__ int __test_bit(unsigned long nr, volatile void * addr)
5539 +static inline int __test_bit(unsigned long nr, volatile void *ptr)
5541 - unsigned long reg1, reg2;
5543 + unsigned long addr;
5546 - __asm__ __volatile__(
5552 - " ic %0,0(%2,%4)\n"
5554 - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2)
5555 - : "d" (nr), "a" (addr) : "cc" );
5556 - return oldbit & 1;
5557 + addr = (unsigned long) ptr + ((nr ^ 56) >> 3);
5558 + ch = *(unsigned char *) addr;
5559 + return (ch >> (nr & 7)) & 1;
5562 -static __inline__ int
5563 -__constant_test_bit(unsigned long nr, volatile void * addr) {
5565 +__constant_test_bit(unsigned long nr, volatile void *addr) {
5566 return (((volatile char *) addr)[(nr>>3)^7] & (1<<(nr&7))) != 0;
5571 * Find-bit routines..
5573 -static __inline__ unsigned long
5574 +static inline unsigned long
5575 find_first_zero_bit(void * addr, unsigned long size)
5577 unsigned long res, cmp, count;
5578 @@ -653,7 +538,49 @@
5579 return (res < size) ? res : size;
5582 -static __inline__ unsigned long
5583 +static inline unsigned long
5584 +find_first_bit(void * addr, unsigned long size)
5586 + unsigned long res, cmp, count;
5590 + __asm__(" slgr %1,%1\n"
5595 + "0: cg %1,0(%0,%4)\n"
5601 + "1: lg %2,0(%0,%4)\n"
5606 + " srlg %2,%2,32\n"
5607 + "2: lghi %1,0xff\n"
5608 + " tmll %2,0xffff\n"
5612 + "3: tmll %2,0x00ff\n"
5617 + " ic %2,0(%2,%5)\n"
5620 + : "=&a" (res), "=&d" (cmp), "=&a" (count)
5621 + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" );
5622 + return (res < size) ? res : size;
5625 +static inline unsigned long
5626 find_next_zero_bit (void * addr, unsigned long size, unsigned long offset)
5628 unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
5629 @@ -697,14 +624,56 @@
5630 return (offset + res);
5633 +static inline unsigned long
5634 +find_next_bit (void * addr, unsigned long size, unsigned long offset)
5636 + unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
5637 + unsigned long bitvec, reg;
5638 + unsigned long set, bit = offset & 63, res;
5642 + * Look for set bit in first word
5644 + bitvec = (*p) >> bit;
5645 + __asm__(" slgr %0,%0\n"
5649 + " srlg %1,%1,32\n"
5650 + "0: lghi %2,0xff\n"
5651 + " tmll %1,0xffff\n"
5654 + " srlg %1,%1,16\n"
5655 + "1: tmll %1,0x00ff\n"
5660 + " ic %1,0(%1,%3)\n"
5662 + : "=&d" (set), "+a" (bitvec), "=&d" (reg)
5663 + : "a" (&_sb_findmap) : "cc" );
5664 + if (set < (64 - bit))
5665 + return set + offset;
5666 + offset += 64 - bit;
5670 + * No set bit yet, search remaining full words for a bit
5672 + res = find_first_bit (p, size - 64 * (p - (unsigned long *) addr));
5673 + return (offset + res);
5677 * ffz = Find First Zero in word. Undefined if no zero exists,
5678 * so code should check against ~0UL first..
5680 -static __inline__ unsigned long ffz(unsigned long word)
5681 +static inline unsigned long ffz(unsigned long word)
5683 - unsigned long reg;
5685 + unsigned long reg, result;
5687 __asm__(" lhi %2,-1\n"
5689 @@ -730,40 +699,112 @@
5693 + * __ffs = find first bit in word. Undefined if no bit exists,
5694 + * so code should check against 0UL first..
5696 +static inline unsigned long __ffs (unsigned long word)
5698 + unsigned long reg, result;
5700 + __asm__(" slgr %0,%0\n"
5704 + " srlg %1,%1,32\n"
5705 + "0: lghi %2,0xff\n"
5706 + " tmll %1,0xffff\n"
5709 + " srlg %1,%1,16\n"
5710 + "1: tmll %1,0x00ff\n"
5715 + " ic %1,0(%1,%3)\n"
5717 + : "=&d" (result), "+a" (word), "=&d" (reg)
5718 + : "a" (&_sb_findmap) : "cc" );
5723 + * Every architecture must define this function. It's the fastest
5724 + * way of searching a 140-bit bitmap where the first 100 bits are
5725 + * unlikely to be set. It's guaranteed that at least one of the 140
5726 + * bits is set.
5728 +static inline int sched_find_first_bit(unsigned long *b)
5730 + return find_first_bit(b, 140);
5734 * ffs: find first bit set. This is defined the same way as
5735 * the libc and compiler builtin ffs routines, therefore
5736 * differs in spirit from the above ffz (man ffs).
5739 -extern int __inline__ ffs (int x)
5740 +extern inline int ffs (int x)
5747 - __asm__(" slr %0,%0\n"
5748 - " tml %1,0xffff\n"
5750 + __asm__(" tml %1,0xffff\n"
5755 "0: tml %1,0x00ff\n"
5760 "1: tml %1,0x000f\n"
5765 "2: tml %1,0x0003\n"
5770 "3: tml %1,0x0001\n"
5774 : "=&d" (r), "+d" (x) : : "cc" );
5780 + * fls: find last bit set.
5782 +extern inline int fls(int x)
5788 + __asm__(" tmh %1,0xffff\n"
5792 + "0: tmh %1,0xff00\n"
5796 + "1: tmh %1,0xf000\n"
5800 + "2: tmh %1,0xc000\n"
5804 + "3: tmh %1,0x8000\n"
5808 + : "+d" (r), "+d" (x) : : "cc" );
5814 #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^56, addr)
5815 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^56, addr)
5816 #define ext2_test_bit(nr, addr) test_bit((nr)^56, addr)
5817 -static __inline__ unsigned long
5818 +static inline unsigned long
5819 ext2_find_first_zero_bit(void *vaddr, unsigned long size)
5821 unsigned long res, cmp, count;
5823 return (res < size) ? res : size;
5826 -static __inline__ unsigned long
5827 +static inline unsigned long
5828 ext2_find_next_zero_bit(void *vaddr, unsigned long size, unsigned long offset)
5830 unsigned long *addr = vaddr;
5831 diff -urN linux-2.4.22.org/include/asm-sparc/bitops.h linux-2.4.22/include/asm-sparc/bitops.h
5832 --- linux-2.4.22.org/include/asm-sparc/bitops.h 2003-11-24 18:28:27.000000000 +0100
5833 +++ linux-2.4.22/include/asm-sparc/bitops.h 2003-11-24 18:39:03.000000000 +0100
5834 @@ -231,6 +231,57 @@
5839 + * __ffs - find first bit in word.
5840 + * @word: The word to search
5842 + * Undefined if no bit exists, so code should check against 0 first.
5844 +static __inline__ int __ffs(unsigned long word)
5848 + if ((word & 0xffff) == 0) {
5852 + if ((word & 0xff) == 0) {
5856 + if ((word & 0xf) == 0) {
5860 + if ((word & 0x3) == 0) {
5864 + if ((word & 0x1) == 0)
5870 + * Every architecture must define this function. It's the fastest
5871 + * way of searching a 140-bit bitmap where the first 100 bits are
5872 + * unlikely to be set. It's guaranteed that at least one of the 140
5873 + * bits is set.
5875 +static __inline__ int sched_find_first_bit(unsigned long *b)
5878 + if (unlikely(b[0]))
5879 + return __ffs(b[0]);
5880 + if (unlikely(b[1]))
5881 + return __ffs(b[1]) + 32;
5882 + if (unlikely(b[2]))
5883 + return __ffs(b[2]) + 64;
5885 + return __ffs(b[3]) + 96;
5886 + return __ffs(b[4]) + 128;
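
On 32-bit sparc the 140 priority bits span five words, which the unrolled tests above cover one by one; the unlikely() hints encode that the first 100 (real-time) priorities are rarely occupied. The map being searched is the scheduler's per-runqueue priority array, roughly (a simplified sketch; the exact definition lives in the scheduler part of this patch):

#define MAX_PRIO	140

struct prio_array {
	int nr_active;
	unsigned long bitmap[5];	/* 5 x 32 bits >= MAX_PRIO bits */
	struct list_head queue[MAX_PRIO];
};
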
5890 * ffs: find first bit set. This is defined the same way as
5891 * the libc and compiler builtin ffs routines, therefore
5892 @@ -296,6 +347,32 @@
5893 #define find_first_zero_bit(addr, size) \
5894 find_next_zero_bit((addr), (size), 0)
5897 + * find_next_bit - find the first set bit in a memory region
5898 + * @addr: The address to base the search on
5899 + * @offset: The bitnumber to start searching at
5900 + * @size: The maximum size to search
5902 + * Scheduler-induced bitop, do not use.
5904 +static inline int find_next_bit(unsigned long *addr, int size, int offset)
5906 + unsigned long *p = addr + (offset >> 5);
5907 + int num = offset & ~0x1f;
5908 + unsigned long word;
5911 + word &= ~((1 << (offset & 0x1f)) - 1);
5912 + while (num < size) {
5914 + return __ffs(word) + num;
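
Per the comment above, only the scheduler is expected to call this; its calling idiom is the standard walk over set bits (illustrative, handle() is hypothetical):

	for (bit = find_next_bit(mask, size, 0); bit < size;
	     bit = find_next_bit(mask, size, bit + 1))
		handle(bit);
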
5922 static inline int test_le_bit(int nr, __const__ void * addr)
5924 __const__ unsigned char *ADDR = (__const__ unsigned char *) addr;
5925 diff -urN linux-2.4.22.org/include/asm-sparc/system.h linux-2.4.22/include/asm-sparc/system.h
5926 --- linux-2.4.22.org/include/asm-sparc/system.h 2003-11-24 18:28:27.000000000 +0100
5927 +++ linux-2.4.22/include/asm-sparc/system.h 2003-11-24 18:39:03.000000000 +0100
5930 * SWITCH_ENTER and SWITCH_DO_LAZY_FPU do not work yet (e.g. SMP does not work)
5932 -#define prepare_to_switch() do { \
5933 +#define prepare_arch_switch(rq, next) do { \
5934 __asm__ __volatile__( \
5935 ".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \
5936 "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \
5938 "save %sp, -0x40, %sp\n\t" \
5939 "restore; restore; restore; restore; restore; restore; restore"); \
5941 +#define finish_arch_switch(rq, next) do { } while(0)
5942 +#define task_running(rq, p) ((rq)->curr == (p))
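
prepare_arch_switch(), finish_arch_switch() and task_running() are the hook triple the O(1) scheduler core expects from every architecture in place of the old prepare_to_switch(). The core wraps the context switch with them roughly like this (a simplified sketch, not the exact sched.c sequence):

	prepare_arch_switch(rq, next);		/* sparc: flush register windows */
	prev = context_switch(prev, next);
	barrier();
	finish_arch_switch(this_rq(), prev);	/* no-op on sparc */

task_running() lets the core test whether a task is still executing on some CPU without reaching into architecture-private state.
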
5944 /* Much care has gone into this code, do not touch it.
5946 diff -urN linux-2.4.22.org/include/asm-sparc64/bitops.h linux-2.4.22/include/asm-sparc64/bitops.h
5947 --- linux-2.4.22.org/include/asm-sparc64/bitops.h 2003-11-24 18:28:29.000000000 +0100
5948 +++ linux-2.4.22/include/asm-sparc64/bitops.h 2003-11-24 18:39:03.000000000 +0100
5952 * bitops.h: Bit string operations on the V9.
5954 * Copyright 1996, 1997 David S. Miller (davem@caip.rutgers.edu)
5956 #ifndef _SPARC64_BITOPS_H
5957 #define _SPARC64_BITOPS_H
5959 +#include <linux/compiler.h>
5960 #include <asm/byteorder.h>
5962 -extern long ___test_and_set_bit(unsigned long nr, volatile void *addr);
5963 -extern long ___test_and_clear_bit(unsigned long nr, volatile void *addr);
5964 -extern long ___test_and_change_bit(unsigned long nr, volatile void *addr);
5965 +extern long ___test_and_set_bit(unsigned long nr, volatile unsigned long *addr);
5966 +extern long ___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr);
5967 +extern long ___test_and_change_bit(unsigned long nr, volatile unsigned long *addr);
5969 #define test_and_set_bit(nr,addr) ({___test_and_set_bit(nr,addr)!=0;})
5970 #define test_and_clear_bit(nr,addr) ({___test_and_clear_bit(nr,addr)!=0;})
5971 @@ -21,109 +22,132 @@
5972 #define change_bit(nr,addr) ((void)___test_and_change_bit(nr,addr))
5974 /* "non-atomic" versions... */
5975 -#define __set_bit(X,Y) \
5976 -do { unsigned long __nr = (X); \
5977 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5978 - *__m |= (1UL << (__nr & 63)); \
5980 -#define __clear_bit(X,Y) \
5981 -do { unsigned long __nr = (X); \
5982 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5983 - *__m &= ~(1UL << (__nr & 63)); \
5985 -#define __change_bit(X,Y) \
5986 -do { unsigned long __nr = (X); \
5987 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5988 - *__m ^= (1UL << (__nr & 63)); \
5990 -#define __test_and_set_bit(X,Y) \
5991 -({ unsigned long __nr = (X); \
5992 - long *__m = ((long *) (Y)) + (__nr >> 6); \
5993 - long __old = *__m; \
5994 - long __mask = (1UL << (__nr & 63)); \
5995 - *__m = (__old | __mask); \
5996 - ((__old & __mask) != 0); \
5998 -#define __test_and_clear_bit(X,Y) \
5999 -({ unsigned long __nr = (X); \
6000 - long *__m = ((long *) (Y)) + (__nr >> 6); \
6001 - long __old = *__m; \
6002 - long __mask = (1UL << (__nr & 63)); \
6003 - *__m = (__old & ~__mask); \
6004 - ((__old & __mask) != 0); \
6006 -#define __test_and_change_bit(X,Y) \
6007 -({ unsigned long __nr = (X); \
6008 - long *__m = ((long *) (Y)) + (__nr >> 6); \
6009 - long __old = *__m; \
6010 - long __mask = (1UL << (__nr & 63)); \
6011 - *__m = (__old ^ __mask); \
6012 - ((__old & __mask) != 0); \
6015 +static __inline__ void __set_bit(int nr, volatile unsigned long *addr)
6017 + volatile unsigned long *m = addr + (nr >> 6);
6019 + *m |= (1UL << (nr & 63));
6022 +static __inline__ void __clear_bit(int nr, volatile unsigned long *addr)
6024 + volatile unsigned long *m = addr + (nr >> 6);
6026 + *m &= ~(1UL << (nr & 63));
6029 +static __inline__ void __change_bit(int nr, volatile unsigned long *addr)
6031 + volatile unsigned long *m = addr + (nr >> 6);
6033 + *m ^= (1UL << (nr & 63));
6036 +static __inline__ int __test_and_set_bit(int nr, volatile unsigned long *addr)
6038 + volatile unsigned long *m = addr + (nr >> 6);
6040 + long mask = (1UL << (nr & 63));
6042 + *m = (old | mask);
6043 + return ((old & mask) != 0);
6046 +static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr)
6048 + volatile unsigned long *m = addr + (nr >> 6);
6050 + long mask = (1UL << (nr & 63));
6052 + *m = (old & ~mask);
6053 + return ((old & mask) != 0);
6056 +static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr)
6058 + volatile unsigned long *m = addr + (nr >> 6);
6060 + long mask = (1UL << (nr & 63));
6062 + *m = (old ^ mask);
6063 + return ((old & mask) != 0);
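
Recasting the old statement-expression macros as inline functions keeps the generated code while adding type checking on nr and addr. As with all double-underscore bitops they are non-atomic, so the caller provides its own serialization, e.g. (illustrative; map_lock is hypothetical):

	spin_lock(&map_lock);
	was_set = __test_and_set_bit(nr, map);
	spin_unlock(&map_lock);
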
6066 #define smp_mb__before_clear_bit() do { } while(0)
6067 #define smp_mb__after_clear_bit() do { } while(0)
6069 -extern __inline__ int test_bit(int nr, __const__ void *addr)
6070 +static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr)
6072 - return (1UL & (((__const__ long *) addr)[nr >> 6] >> (nr & 63))) != 0UL;
6073 + return (1UL & ((addr)[nr >> 6] >> (nr & 63))) != 0UL;
6076 /* The easy/cheese version for now. */
6077 -extern __inline__ unsigned long ffz(unsigned long word)
6078 +static __inline__ unsigned long ffz(unsigned long word)
6080 unsigned long result;
6082 -#ifdef ULTRA_HAS_POPULATION_COUNT /* Thanks for nothing Sun... */
6083 - __asm__ __volatile__(
6086 -" xnor %0, %%g1, %%g2\n"
6088 -"1: " : "=&r" (result)
6092 -#if 1 /* def EASY_CHEESE_VERSION */
6099 - unsigned long tmp;
6104 - tmp = ~word & -~word;
6105 - if (!(unsigned)tmp) {
6109 - if (!(unsigned short)tmp) {
6113 - if (!(unsigned char)tmp) {
6117 + * __ffs - find first bit in word.
6118 + * @word: The word to search
6120 + * Undefined if no bit exists, so code should check against 0 first.
6122 +static __inline__ unsigned long __ffs(unsigned long word)
6124 + unsigned long result = 0;
6126 + while (!(word & 1UL)) {
6130 - if (tmp & 0xf0) result += 4;
6131 - if (tmp & 0xcc) result += 2;
6132 - if (tmp & 0xaa) result ++;
6139 + * fls: find last bit set.
6142 +#define fls(x) generic_fls(x)
6147 + * Every architecture must define this function. It's the fastest
6148 + * way of searching a 140-bit bitmap where the first 100 bits are
6149 + * unlikely to be set. It's guaranteed that at least one of the 140
6150 + * bits is set.
6152 +static inline int sched_find_first_bit(unsigned long *b)
6154 + if (unlikely(b[0]))
6155 + return __ffs(b[0]);
6156 + if (unlikely(((unsigned int)b[1])))
6157 + return __ffs(b[1]) + 64;
6159 + return __ffs(b[1] >> 32) + 96;
6160 + return __ffs(b[2]) + 128;
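
With 64-bit words the 140-bit map fits in three: b[0] covers priorities 0-63, b[1] covers 64-127 and b[2] covers 128-139. Because sparc64 numbers bits by value (bit k is 1UL << k), the (unsigned int) cast isolates the low half of b[1], i.e. priorities 64-95, before the high half (96-127) is tried. Schematically:

	/* b[0]: 0..63 | b[1] bits 0..31: 64..95 | b[1] bits 32..63: 96..127 | b[2]: 128..139 */
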
6164 * ffs: find first bit set. This is defined the same way as
6165 * the libc and compiler builtin ffs routines, therefore
6166 * differs in spirit from the above ffz (man ffs).
6169 -#define ffs(x) generic_ffs(x)
6170 +static __inline__ int ffs(int x)
6174 + return __ffs((unsigned long)x) + 1;
6178 * hweightN: returns the hamming weight (i.e. the number
6181 #ifdef ULTRA_HAS_POPULATION_COUNT
6183 -extern __inline__ unsigned int hweight32(unsigned int w)
6184 +static __inline__ unsigned int hweight32(unsigned int w)
6192 -extern __inline__ unsigned int hweight16(unsigned int w)
6193 +static __inline__ unsigned int hweight16(unsigned int w)
6201 -extern __inline__ unsigned int hweight8(unsigned int w)
6202 +static __inline__ unsigned int hweight8(unsigned int w)
6206 @@ -165,14 +189,69 @@
6208 #endif /* __KERNEL__ */
6211 + * find_next_bit - find the next set bit in a memory region
6212 + * @addr: The address to base the search on
6213 + * @offset: The bitnumber to start searching at
6214 + * @size: The maximum size to search
6216 +static __inline__ unsigned long find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset)
6218 + unsigned long *p = addr + (offset >> 6);
6219 + unsigned long result = offset & ~63UL;
6220 + unsigned long tmp;
6222 + if (offset >= size)
6228 + tmp &= (~0UL << offset);
6232 + goto found_middle;
6236 + while (size & ~63UL) {
6237 + if ((tmp = *(p++)))
6238 + goto found_middle;
6247 + tmp &= (~0UL >> (64 - size));
6248 + if (tmp == 0UL) /* Are any bits set? */
6249 + return result + size; /* Nope. */
6251 + return result + __ffs(tmp);
6255 + * find_first_bit - find the first set bit in a memory region
6256 + * @addr: The address to start the search at
6257 + * @size: The maximum size to search
6259 + * Returns the bit-number of the first set bit, not the number of the byte
6260 + * containing a bit.
6262 +#define find_first_bit(addr, size) \
6263 + find_next_bit((addr), (size), 0)
6265 /* find_next_zero_bit() finds the first zero bit in a bit string of length
6266 * 'size' bits, starting the search at bit 'offset'. This is largely based
6267 * on Linus's ALPHA routines, which are pretty portable BTW.
6270 -extern __inline__ unsigned long find_next_zero_bit(void *addr, unsigned long size, unsigned long offset)
6271 +static __inline__ unsigned long find_next_zero_bit(unsigned long *addr, unsigned long size, unsigned long offset)
6273 - unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
6274 + unsigned long *p = addr + (offset >> 6);
6275 unsigned long result = offset & ~63UL;
6278 @@ -211,15 +290,15 @@
6279 #define find_first_zero_bit(addr, size) \
6280 find_next_zero_bit((addr), (size), 0)
6282 -extern long ___test_and_set_le_bit(int nr, volatile void *addr);
6283 -extern long ___test_and_clear_le_bit(int nr, volatile void *addr);
6284 +extern long ___test_and_set_le_bit(int nr, volatile unsigned long *addr);
6285 +extern long ___test_and_clear_le_bit(int nr, volatile unsigned long *addr);
6287 #define test_and_set_le_bit(nr,addr) ({___test_and_set_le_bit(nr,addr)!=0;})
6288 #define test_and_clear_le_bit(nr,addr) ({___test_and_clear_le_bit(nr,addr)!=0;})
6289 #define set_le_bit(nr,addr) ((void)___test_and_set_le_bit(nr,addr))
6290 #define clear_le_bit(nr,addr) ((void)___test_and_clear_le_bit(nr,addr))
6292 -extern __inline__ int test_le_bit(int nr, __const__ void * addr)
6293 +static __inline__ int test_le_bit(int nr, __const__ unsigned long * addr)
6296 __const__ unsigned char *ADDR = (__const__ unsigned char *) addr;
6298 #define find_first_zero_le_bit(addr, size) \
6299 find_next_zero_le_bit((addr), (size), 0)
6301 -extern __inline__ unsigned long find_next_zero_le_bit(void *addr, unsigned long size, unsigned long offset)
6302 +static __inline__ unsigned long find_next_zero_le_bit(unsigned long *addr, unsigned long size, unsigned long offset)
6304 - unsigned long *p = ((unsigned long *) addr) + (offset >> 6);
6305 + unsigned long *p = addr + (offset >> 6);
6306 unsigned long result = offset & ~63UL;
6309 @@ -271,18 +350,22 @@
6313 -#define ext2_set_bit test_and_set_le_bit
6314 -#define ext2_clear_bit test_and_clear_le_bit
6315 -#define ext2_test_bit test_le_bit
6316 -#define ext2_find_first_zero_bit find_first_zero_le_bit
6317 -#define ext2_find_next_zero_bit find_next_zero_le_bit
6318 +#define ext2_set_bit(nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
6319 +#define ext2_clear_bit(nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
6320 +#define ext2_test_bit(nr,addr) test_le_bit((nr),(unsigned long *)(addr))
6321 +#define ext2_find_first_zero_bit(addr, size) \
6322 + find_first_zero_le_bit((unsigned long *)(addr), (size))
6323 +#define ext2_find_next_zero_bit(addr, size, off) \
6324 + find_next_zero_le_bit((unsigned long *)(addr), (size), (off))
6326 /* Bitmap functions for the minix filesystem. */
6327 -#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr)
6328 -#define minix_set_bit(nr,addr) set_bit(nr,addr)
6329 -#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr)
6330 -#define minix_test_bit(nr,addr) test_bit(nr,addr)
6331 -#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
6332 +#define minix_test_and_set_bit(nr,addr) test_and_set_bit((nr),(unsigned long *)(addr))
6333 +#define minix_set_bit(nr,addr) set_bit((nr),(unsigned long *)(addr))
6334 +#define minix_test_and_clear_bit(nr,addr) \
6335 + test_and_clear_bit((nr),(unsigned long *)(addr))
6336 +#define minix_test_bit(nr,addr) test_bit((nr),(unsigned long *)(addr))
6337 +#define minix_find_first_zero_bit(addr,size) \
6338 + find_first_zero_bit((unsigned long *)(addr),(size))
6340 #endif /* __KERNEL__ */
6342 diff -urN linux-2.4.22.org/include/asm-sparc64/smp.h linux-2.4.22/include/asm-sparc64/smp.h
6343 --- linux-2.4.22.org/include/asm-sparc64/smp.h 2003-11-24 18:28:29.000000000 +0100
6344 +++ linux-2.4.22/include/asm-sparc64/smp.h 2003-11-24 18:39:03.000000000 +0100
6349 -#define smp_processor_id() (current->processor)
6350 +#define smp_processor_id() (current->cpu)
6352 /* This needn't do anything as we do not sleep the cpu
6353 * inside of the idler task, so an interrupt is not needed
6354 diff -urN linux-2.4.22.org/include/asm-sparc64/system.h linux-2.4.22/include/asm-sparc64/system.h
6355 --- linux-2.4.22.org/include/asm-sparc64/system.h 2003-11-24 18:28:29.000000000 +0100
6356 +++ linux-2.4.22/include/asm-sparc64/system.h 2003-11-24 18:39:03.000000000 +0100
6357 @@ -154,7 +154,18 @@
6359 #define flush_user_windows flushw_user
6360 #define flush_register_windows flushw_all
6361 -#define prepare_to_switch flushw_all
6363 +#define prepare_arch_schedule(prev) task_lock(prev)
6364 +#define finish_arch_schedule(prev) task_unlock(prev)
6365 +#define prepare_arch_switch(rq, next) \
6366 +do { spin_lock(&(next)->switch_lock); \
6367 + spin_unlock(&(rq)->lock); \
6368 +} while (0)
6371 +#define finish_arch_switch(rq, prev) \
6372 +do { spin_unlock_irq(&(prev)->switch_lock); \
6373 +} while (0)
6375 #ifndef CONFIG_DEBUG_SPINLOCK
6376 #define CHECK_LOCKS(PREV) do { } while(0)
6377 diff -urN linux-2.4.22.org/include/linux/bitops.h linux-2.4.22/include/linux/bitops.h
6378 --- linux-2.4.22.org/include/linux/bitops.h 2003-11-24 18:28:22.000000000 +0100
6379 +++ linux-2.4.22/include/linux/bitops.h 2003-11-24 18:40:50.000000000 +0100
6381 #ifndef _LINUX_BITOPS_H
6382 #define _LINUX_BITOPS_H
6385 + * fls: find last bit set.
6388 +extern __inline__ int generic_fls(int x)
6394 + if (!(x & 0xffff0000u)) {
6398 + if (!(x & 0xff000000u)) {
6402 + if (!(x & 0xf0000000u)) {
6406 + if (!(x & 0xc0000000u)) {
6410 + if (!(x & 0x80000000u)) {
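The hunk above retains only the mask tests of generic_fls(); the shift-and-adjust bodies between them did not survive extraction. For reference, a complete user-space rendering of the same binary search (returning 0 for no bits set and 32 for the top bit, matching the 1-based fls convention) looks like this:

#include <stdio.h>

/* Binary search for the highest set bit: each failed test shifts the
 * interesting bits upward and lowers the running result. */
static int fls_sketch(unsigned int x)
{
	int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xffff0000u)) { x <<= 16; r -= 16; }
	if (!(x & 0xff000000u)) { x <<= 8;  r -= 8;  }
	if (!(x & 0xf0000000u)) { x <<= 4;  r -= 4;  }
	if (!(x & 0xc0000000u)) { x <<= 2;  r -= 2;  }
	if (!(x & 0x80000000u)) { x <<= 1;  r -= 1;  }
	return r;
}

int main(void)
{
	printf("%d %d %d\n", fls_sketch(0), fls_sketch(1),
	       fls_sketch(0x80000000u));	/* 0 1 32 */
	return 0;
}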
6418 * ffs: find first bit set. This is defined the same way as
6419 diff -urN linux-2.4.22.org/include/linux/kernel_stat.h linux-2.4.22/include/linux/kernel_stat.h
6420 --- linux-2.4.22.org/include/linux/kernel_stat.h 2003-11-24 18:28:20.000000000 +0100
6421 +++ linux-2.4.22/include/linux/kernel_stat.h 2003-11-24 18:39:03.000000000 +0100
6423 #elif !defined(CONFIG_ARCH_S390)
6424 unsigned int irqs[NR_CPUS][NR_IRQS];
6426 - unsigned int context_swtch;
6429 extern struct kernel_stat kstat;
6430 diff -urN linux-2.4.22.org/include/linux/sched.h linux-2.4.22/include/linux/sched.h
6431 --- linux-2.4.22.org/include/linux/sched.h 2003-11-24 18:28:20.000000000 +0100
6432 +++ linux-2.4.22/include/linux/sched.h 2003-11-24 18:39:03.000000000 +0100
6434 extern unsigned long event;
6436 #include <linux/config.h>
6437 +#include <linux/compiler.h>
6438 #include <linux/binfmts.h>
6439 #include <linux/threads.h>
6440 #include <linux/kernel.h>
6442 #include <asm/mmu.h>
6444 #include <linux/smp.h>
6445 -#include <linux/tty.h>
6446 +//#include <linux/tty.h>
6447 #include <linux/sem.h>
6448 #include <linux/signal.h>
6449 #include <linux/securebits.h>
6451 #define CT_TO_SECS(x) ((x) / HZ)
6452 #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
6454 -extern int nr_running, nr_threads;
6455 +extern int nr_threads;
6456 extern int last_pid;
6457 +extern unsigned long nr_running(void);
6458 +extern unsigned long nr_uninterruptible(void);
6460 -#include <linux/fs.h>
6461 +//#include <linux/fs.h>
6462 #include <linux/time.h>
6463 #include <linux/param.h>
6464 #include <linux/resource.h>
6465 @@ -109,12 +112,6 @@
6466 #define SCHED_FIFO 1
6470 - * This is an additional bit set when we want to
6471 - * yield the CPU for one re-schedule..
6473 -#define SCHED_YIELD 0x10
6475 struct sched_param {
6478 @@ -132,17 +129,21 @@
6481 extern rwlock_t tasklist_lock;
6482 -extern spinlock_t runqueue_lock;
6483 extern spinlock_t mmlist_lock;
6485 +typedef struct task_struct task_t;
6487 extern void sched_init(void);
6488 -extern void init_idle(void);
6489 +extern void init_idle(task_t *idle, int cpu);
6490 extern void show_state(void);
6491 extern void cpu_init (void);
6492 extern void trap_init(void);
6493 extern void update_process_times(int user);
6494 -extern void update_one_process(struct task_struct *p, unsigned long user,
6495 +extern void update_one_process(task_t *p, unsigned long user,
6496 unsigned long system, int cpu);
6497 +extern void scheduler_tick(int user_tick, int system);
6498 +extern void migration_init(void);
6499 +extern unsigned long cache_decay_ticks;
6501 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
6502 extern signed long FASTCALL(schedule_timeout(signed long timeout));
6503 @@ -152,6 +153,28 @@
6504 extern void flush_scheduled_tasks(void);
6505 extern int start_context_thread(void);
6506 extern int current_is_keventd(void);
6507 +extern void FASTCALL(sched_exit(task_t * p));
6508 +extern int FASTCALL(idle_cpu(int cpu));
6511 + * Priority of a process goes from 0..MAX_PRIO-1, valid RT
6512 + * priority is 0..MAX_RT_PRIO-1, and SCHED_OTHER tasks are
6513 + * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
6514 + * are inverted: lower p->prio value means higher priority.
6516 + * The MAX_USER_RT_PRIO value allows the actual maximum
6517 + * RT priority to be separate from the value exported to
6518 + * user-space. This allows kernel threads to set their
6519 + * priority to a value higher than any user task. Note:
6520 + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
6522 + * Both values are configurable at compile-time.
6525 +#define MAX_USER_RT_PRIO 100
6526 +#define MAX_RT_PRIO MAX_USER_RT_PRIO
6528 +#define MAX_PRIO (MAX_RT_PRIO + 40)
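To make the comment's ranges concrete under the compile-time defaults just above (MAX_USER_RT_PRIO = 100, 40 nice levels), a few lines of arithmetic:

#include <stdio.h>

#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO      MAX_USER_RT_PRIO
#define MAX_PRIO         (MAX_RT_PRIO + 40)

int main(void)
{
	/* Lower p->prio means higher priority: RT tasks occupy
	 * 0..99, SCHED_OTHER tasks 100..139. */
	printf("RT priorities:          0..%d\n", MAX_RT_PRIO - 1);
	printf("SCHED_OTHER priorities: %d..%d\n", MAX_RT_PRIO, MAX_PRIO - 1);
	return 0;
}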
6531 extern void set_cpus_allowed(struct task_struct *p, unsigned long new_mask);
6533 extern struct user_struct root_user;
6534 #define INIT_USER (&root_user)
6536 +typedef struct prio_array prio_array_t;
6538 struct task_struct {
6540 * offsets of these are hardcoded elsewhere - touch with care
6541 @@ -297,35 +322,26 @@
6543 int lock_depth; /* Lock depth */
6546 - * offset 32 begins here on 32-bit platforms. We keep
6547 - * all fields in a single cacheline that are needed for
6548 - * the goodness() loop in schedule().
6552 - unsigned long policy;
6553 - struct mm_struct *mm;
6556 - * cpus_runnable is ~0 if the process is not running on any
6557 - * CPU. It's (1 << cpu) if it's running on a CPU. This mask
6558 - * is updated under the runqueue lock.
6560 - * To determine whether a process might run on a CPU, this
6561 - * mask is AND-ed with cpus_allowed.
6563 - unsigned long cpus_runnable, cpus_allowed;
6565 - * (only the 'next' pointer fits into the cacheline, but
6566 - * that's just fine.)
6567 + * offset 32 begins here on 32-bit platforms.
6570 + int prio, static_prio;
6571 struct list_head run_list;
6572 - unsigned long sleep_time;
6573 + prio_array_t *array;
6575 - struct task_struct *next_task, *prev_task;
6576 - struct mm_struct *active_mm;
6577 + unsigned long sleep_avg;
6578 + unsigned long sleep_timestamp;
6580 + unsigned long policy;
6581 + unsigned long cpus_allowed;
6582 + unsigned int time_slice, first_time_slice;
6584 + task_t *next_task, *prev_task;
6586 + struct mm_struct *mm, *active_mm;
6587 struct list_head local_pages;
6589 unsigned int allocation_order, nr_local_pages;
6592 @@ -348,12 +364,12 @@
6593 * older sibling, respectively. (p->father can be replaced with
6596 - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
6597 + task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
6598 struct list_head thread_group;
6600 /* PID hash table linkage. */
6601 - struct task_struct *pidhash_next;
6602 - struct task_struct **pidhash_pprev;
6603 + task_t *pidhash_next;
6604 + task_t **pidhash_pprev;
6606 wait_queue_head_t wait_chldexit; /* for wait4() */
6607 struct completion *vfork_done; /* for vfork() */
6610 /* Protection of (de-)allocation: mm, files, fs, tty */
6611 spinlock_t alloc_lock;
6612 +/* context-switch lock */
6613 + spinlock_t switch_lock;
6615 /* journalling filesystem info */
6617 @@ -452,9 +470,15 @@
6619 #define _STK_LIM (8*1024*1024)
6621 -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */
6622 -#define MAX_COUNTER (20*HZ/100)
6623 -#define DEF_NICE (0)
6624 +#ifdef CONFIG_SMP
6625 +extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
6626 +#else
6627 +#define set_cpus_allowed(p, new_mask) do { } while (0)
6628 +#endif
6630 +extern void set_user_nice(task_t *p, long nice);
6631 +extern int task_prio(task_t *p);
6632 +extern int task_nice(task_t *p);
6634 extern void yield(void);
6636 @@ -475,14 +499,14 @@
6637 addr_limit: KERNEL_DS, \
6638 exec_domain: &default_exec_domain, \
6640 - counter: DEF_COUNTER, \
6642 + prio: MAX_PRIO-20, \
6643 + static_prio: MAX_PRIO-20, \
6644 policy: SCHED_OTHER, \
6645 + cpus_allowed: ~0UL, \
6647 active_mm: &init_mm, \
6648 - cpus_runnable: ~0UL, \
6649 - cpus_allowed: ~0UL, \
6650 run_list: LIST_HEAD_INIT(tsk.run_list), \
6656 pending: { NULL, &tsk.pending.head, {{0}}}, \
6658 alloc_lock: SPIN_LOCK_UNLOCKED, \
6659 + switch_lock: SPIN_LOCK_UNLOCKED, \
6660 journal_info: NULL, \
6663 @@ -516,24 +541,23 @@
6667 - struct task_struct task;
6668 + task_t task;
6669 unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
6672 extern union task_union init_task_union;
6674 extern struct mm_struct init_mm;
6675 -extern struct task_struct *init_tasks[NR_CPUS];
6677 /* PID hashing. (shouldnt this be dynamic?) */
6678 #define PIDHASH_SZ (4096 >> 2)
6679 -extern struct task_struct *pidhash[PIDHASH_SZ];
6680 +extern task_t *pidhash[PIDHASH_SZ];
6682 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
6684 -static inline void hash_pid(struct task_struct *p)
6685 +static inline void hash_pid(task_t *p)
6687 - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
6688 + task_t **htable = &pidhash[pid_hashfn(p->pid)];
6690 if((p->pidhash_next = *htable) != NULL)
6691 (*htable)->pidhash_pprev = &p->pidhash_next;
6692 @@ -541,16 +565,16 @@
6693 p->pidhash_pprev = htable;
6696 -static inline void unhash_pid(struct task_struct *p)
6697 +static inline void unhash_pid(task_t *p)
6700 p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
6701 *p->pidhash_pprev = p->pidhash_next;
6704 -static inline struct task_struct *find_task_by_pid(int pid)
6705 +static inline task_t *find_task_by_pid(int pid)
6707 - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
6708 + task_t *p, **htable = &pidhash[pid_hashfn(pid)];
6710 for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
6712 @@ -558,19 +582,6 @@
6716 -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
6718 -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
6720 - tsk->processor = cpu;
6721 - tsk->cpus_runnable = 1UL << cpu;
6724 -static inline void task_release_cpu(struct task_struct *tsk)
6726 - tsk->cpus_runnable = ~0UL;
6729 /* per-UID process charging. */
6730 extern struct user_struct * alloc_uid(uid_t);
6731 extern void free_uid(struct user_struct *);
6732 @@ -598,47 +609,50 @@
6733 extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
6734 extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
6735 signed long timeout));
6736 -extern int FASTCALL(wake_up_process(struct task_struct * tsk));
6737 +extern int FASTCALL(wake_up_process(task_t * p));
6738 +extern void FASTCALL(wake_up_forked_process(task_t * p));
6740 #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
6741 #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
6742 #define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0)
6743 -#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
6744 -#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
6745 #define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1)
6746 #define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr)
6747 #define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0)
6748 -#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
6749 -#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr)
6750 +#ifdef CONFIG_SMP
6751 +#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
6752 +#else
6753 +#define wake_up_interruptible_sync(x) __wake_up((x),TASK_INTERRUPTIBLE, 1)
6754 +#endif
6756 asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
6758 extern int in_group_p(gid_t);
6759 extern int in_egroup_p(gid_t);
6761 extern void proc_caches_init(void);
6762 -extern void flush_signals(struct task_struct *);
6763 -extern void flush_signal_handlers(struct task_struct *);
6764 +extern void flush_signals(task_t *);
6765 +extern void flush_signal_handlers(task_t *);
6766 extern void sig_exit(int, int, struct siginfo *);
6767 extern int dequeue_signal(sigset_t *, siginfo_t *);
6768 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
6770 extern void unblock_all_signals(void);
6771 -extern int send_sig_info(int, struct siginfo *, struct task_struct *);
6772 -extern int force_sig_info(int, struct siginfo *, struct task_struct *);
6773 +extern int send_sig_info(int, struct siginfo *, task_t *);
6774 +extern int force_sig_info(int, struct siginfo *, task_t *);
6775 extern int kill_pg_info(int, struct siginfo *, pid_t);
6776 extern int kill_sl_info(int, struct siginfo *, pid_t);
6777 extern int kill_proc_info(int, struct siginfo *, pid_t);
6778 -extern void notify_parent(struct task_struct *, int);
6779 -extern void do_notify_parent(struct task_struct *, int);
6780 -extern void force_sig(int, struct task_struct *);
6781 -extern int send_sig(int, struct task_struct *, int);
6782 +extern void notify_parent(task_t *, int);
6783 +extern void do_notify_parent(task_t *, int);
6784 +extern void force_sig(int, task_t *);
6785 +extern int send_sig(int, task_t *, int);
6786 extern int kill_pg(pid_t, int, int);
6787 extern int kill_sl(pid_t, int, int);
6788 extern int kill_proc(pid_t, int, int);
6789 extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
6790 extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
6792 -static inline int signal_pending(struct task_struct *p)
6793 +static inline int signal_pending(task_t *p)
6795 return (p->sigpending != 0);
6798 This is required every time the blocked sigset_t changes.
6799 All callers should have t->sigmask_lock. */
6801 -static inline void recalc_sigpending(struct task_struct *t)
6802 +static inline void recalc_sigpending(task_t *t)
6804 t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
6806 @@ -784,16 +798,17 @@
6807 extern int expand_fdset(struct files_struct *, int nr);
6808 extern void free_fdset(fd_set *, int);
6810 -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
6811 +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *);
6812 extern void flush_thread(void);
6813 extern void exit_thread(void);
6815 -extern void exit_mm(struct task_struct *);
6816 -extern void exit_files(struct task_struct *);
6817 -extern void exit_sighand(struct task_struct *);
6818 +extern void exit_mm(task_t *);
6819 +extern void exit_files(task_t *);
6820 +extern void exit_sighand(task_t *);
6822 extern void reparent_to_init(void);
6823 extern void daemonize(void);
6824 +extern task_t *child_reaper;
6826 extern int do_execve(char *, char **, char **, struct pt_regs *);
6827 extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
6830 extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
6832 +extern void wait_task_inactive(task_t * p);
6833 +extern void kick_if_running(task_t * p);
6835 #define __wait_event(wq, condition) \
6837 wait_queue_t __wait; \
6838 @@ -885,27 +903,12 @@
6839 for (task = next_thread(current) ; task != current ; task = next_thread(task))
6841 #define next_thread(p) \
6842 - list_entry((p)->thread_group.next, struct task_struct, thread_group)
6843 + list_entry((p)->thread_group.next, task_t, thread_group)
6845 #define thread_group_leader(p) (p->pid == p->tgid)
6847 -static inline void del_from_runqueue(struct task_struct * p)
6848 +static inline void unhash_process(task_t *p)
6851 - p->sleep_time = jiffies;
6852 - list_del(&p->run_list);
6853 - p->run_list.next = NULL;
6856 -static inline int task_on_runqueue(struct task_struct *p)
6858 - return (p->run_list.next != NULL);
6861 -static inline void unhash_process(struct task_struct *p)
6863 - if (task_on_runqueue(p))
6864 - out_of_line_bug();
6865 write_lock_irq(&tasklist_lock);
6868 @@ -915,12 +918,12 @@
6871 /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
6872 -static inline void task_lock(struct task_struct *p)
6873 +static inline void task_lock(task_t *p)
6875 spin_lock(&p->alloc_lock);
6878 -static inline void task_unlock(struct task_struct *p)
6879 +static inline void task_unlock(task_t *p)
6881 spin_unlock(&p->alloc_lock);
6883 @@ -944,6 +947,26 @@
6887 +static inline void set_need_resched(void)
6889 + current->need_resched = 1;
6892 +static inline void clear_need_resched(void)
6894 + current->need_resched = 0;
6897 +static inline void set_tsk_need_resched(task_t *tsk)
6899 + tsk->need_resched = 1;
6902 +static inline void clear_tsk_need_resched(task_t *tsk)
6904 + tsk->need_resched = 0;
6907 static inline int need_resched(void)
6909 return (unlikely(current->need_resched));
6913 #endif /* __KERNEL__ */
6916 diff -urN linux-2.4.22.org/include/linux/smp_balance.h linux-2.4.22/include/linux/smp_balance.h
6917 --- linux-2.4.22.org/include/linux/smp_balance.h 1970-01-01 01:00:00.000000000 +0100
6918 +++ linux-2.4.22/include/linux/smp_balance.h 2003-11-24 18:39:03.000000000 +0100
6920 +#ifndef _LINUX_SMP_BALANCE_H
6921 +#define _LINUX_SMP_BALANCE_H
6924 + * per-architecture load balancing logic, e.g. for hyperthreading
6927 +#ifdef ARCH_HAS_SMP_BALANCE
6928 +#include <asm/smp_balance.h>
6929 +#else
6930 +#define arch_load_balance(x, y) (0)
6931 +#define arch_reschedule_idle_override(x, idle) (idle)
6932 +#endif
6934 +#endif /* _LINUX_SMP_BALANCE_H */
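The header above is a common kernel pattern: an optional per-architecture hook with a no-op default, so generic code compiles either way. A hedged user-space rendering of the same shape (the toy_ names are invented stand-ins, not from the patch):

#include <stdio.h>

/* If the architecture defines the hook (here via TOY_ARCH_HAS_BALANCE),
 * its version is used; otherwise the generic code sees a zero-cost
 * default that always declines. */
#ifdef TOY_ARCH_HAS_BALANCE
extern int toy_arch_load_balance(int this_cpu, int idle);
#else
#define toy_arch_load_balance(this_cpu, idle) (0)
#endif

static void toy_schedule(int this_cpu, int idle)
{
	/* the generic balancer runs only if the arch hook declined */
	if (!toy_arch_load_balance(this_cpu, idle))
		printf("cpu %d: generic load balance (idle=%d)\n",
		       this_cpu, idle);
}

int main(void)
{
	toy_schedule(0, 1);
	return 0;
}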
6935 diff -urN linux-2.4.22.org/include/linux/smp.h linux-2.4.22/include/linux/smp.h
6936 --- linux-2.4.22.org/include/linux/smp.h 2003-11-24 18:28:22.000000000 +0100
6937 +++ linux-2.4.22/include/linux/smp.h 2003-11-24 18:39:03.000000000 +0100
6939 #define cpu_number_map(cpu) 0
6940 #define smp_call_function(func,info,retry,wait) ({ 0; })
6941 #define cpu_online_map 1
6942 +static inline void smp_send_reschedule(int cpu) { }
6943 +static inline void smp_send_reschedule_all(void) { }
6948 + * Common definitions:
6950 +#define cpu() smp_processor_id()
6953 diff -urN linux-2.4.22.org/include/linux/wait.h linux-2.4.22/include/linux/wait.h
6954 --- linux-2.4.22.org/include/linux/wait.h 2003-11-24 18:28:20.000000000 +0100
6955 +++ linux-2.4.22/include/linux/wait.h 2003-11-24 18:39:03.000000000 +0100
6957 # define wq_write_lock_irq write_lock_irq
6958 # define wq_write_lock_irqsave write_lock_irqsave
6959 # define wq_write_unlock_irqrestore write_unlock_irqrestore
6960 +# define wq_write_unlock_irq write_unlock_irq
6961 # define wq_write_unlock write_unlock
6963 # define wq_lock_t spinlock_t
6965 # define wq_write_lock_irq spin_lock_irq
6966 # define wq_write_lock_irqsave spin_lock_irqsave
6967 # define wq_write_unlock_irqrestore spin_unlock_irqrestore
6968 +# define wq_write_unlock_irq spin_unlock_irq
6969 # define wq_write_unlock spin_unlock
6972 diff -urN linux-2.4.22.org/init/main.c linux-2.4.22/init/main.c
6973 --- linux-2.4.22.org/init/main.c 2003-11-24 18:28:15.000000000 +0100
6974 +++ linux-2.4.22/init/main.c 2003-11-24 18:39:03.000000000 +0100
6976 extern void setup_arch(char **);
6977 extern void cpu_idle(void);
6979 -unsigned long wait_init_idle;
6983 #ifdef CONFIG_X86_LOCAL_APIC
6984 @@ -303,34 +301,24 @@
6985 APIC_init_uniprocessor();
6988 -#define smp_init() do { } while (0)
6989 +#define smp_init() do { } while (0)
6995 /* Called by boot processor to activate the rest. */
6996 static void __init smp_init(void)
6998 /* Get other processors into their bootup holding patterns. */
7000 - wait_init_idle = cpu_online_map;
7001 - clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
7003 smp_threads_ready=1;
7006 - /* Wait for the other cpus to set up their idle processes */
7007 - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
7008 - while (wait_init_idle) {
7012 - printk("All processors have done init_idle\n");
7019 * We need to finalize in a non-__init function or else race conditions
7020 * between the root thread and the init thread may cause start_kernel to
7023 kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
7025 - current->need_resched = 1;
7032 * Activate the first processor.
7035 printk("POSIX conformance testing by UNIFIX\n");
7037 + init_idle(current, smp_processor_id());
7039 * We count on the initial thread going ok
7040 * Like idlers init is an unlocked kernel thread, which will
7041 @@ -465,6 +453,10 @@
7043 static void __init do_basic_setup(void)
7045 + /* Start the per-CPU migration threads */
7046 + migration_init();
7051 * Tell the world that we're going to be the grim
7052 diff -urN linux-2.4.22.org/kernel/capability.c linux-2.4.22/kernel/capability.c
7053 --- linux-2.4.22.org/kernel/capability.c 2003-11-24 18:28:16.000000000 +0100
7054 +++ linux-2.4.22/kernel/capability.c 2003-11-24 18:39:03.000000000 +0100
7056 #include <linux/mm.h>
7057 #include <asm/uaccess.h>
7059 +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
7061 kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
7063 /* Note: never hold tasklist_lock while spinning for this one */
7064 diff -urN linux-2.4.22.org/kernel/exit.c linux-2.4.22/kernel/exit.c
7065 --- linux-2.4.22.org/kernel/exit.c 2003-11-24 18:28:15.000000000 +0100
7066 +++ linux-2.4.22/kernel/exit.c 2003-11-24 18:39:03.000000000 +0100
7069 static void release_task(struct task_struct * p)
7071 - if (p != current) {
7076 - * Wait to make sure the process isn't on the
7077 - * runqueue (active on some other CPU still)
7081 - if (!task_has_cpu(p))
7087 - } while (task_has_cpu(p));
7090 + wait_task_inactive(p);
7092 - atomic_dec(&p->user->processes);
7093 - free_uid(p->user);
7094 - unhash_process(p);
7096 - release_thread(p);
7097 - current->cmin_flt += p->min_flt + p->cmin_flt;
7098 - current->cmaj_flt += p->maj_flt + p->cmaj_flt;
7099 - current->cnswap += p->nswap + p->cnswap;
7101 - * Potentially available timeslices are retrieved
7102 - * here - this way the parent does not get penalized
7103 - * for creating too many processes.
7105 - * (this cannot be used to artificially 'generate'
7106 - * timeslices, because any timeslice recovered here
7107 - * was given away by the parent in the first place.)
7109 - current->counter += p->counter;
7110 - if (current->counter >= MAX_COUNTER)
7111 - current->counter = MAX_COUNTER;
7113 - free_task_struct(p);
7115 - printk("task releasing itself\n");
7117 + atomic_dec(&p->user->processes);
7118 + free_uid(p->user);
7119 + unhash_process(p);
7121 + release_thread(p);
7122 + current->cmin_flt += p->min_flt + p->cmin_flt;
7123 + current->cmaj_flt += p->maj_flt + p->cmaj_flt;
7124 + current->cnswap += p->nswap + p->cnswap;
7127 + free_task_struct(p);
7131 @@ -150,6 +123,79 @@
7136 + * reparent_to_init() - Reparent the calling kernel thread to the init task.
7138 + * If a kernel thread is launched as a result of a system call, or if
7139 + * it ever exits, it should generally reparent itself to init so that
7140 + * it is correctly cleaned up on exit.
7142 + * The various task state such as scheduling policy and priority may have
7143 + * been inherited from a user process, so we reset them to sane values here.
7145 + * NOTE that reparent_to_init() gives the caller full capabilities.
7147 +void reparent_to_init(void)
7149 + write_lock_irq(&tasklist_lock);
7151 + /* Reparent to init */
7152 + REMOVE_LINKS(current);
7153 + current->p_pptr = child_reaper;
7154 + current->p_opptr = child_reaper;
7155 + SET_LINKS(current);
7157 + /* Set the exit signal to SIGCHLD so we signal init on exit */
7158 + current->exit_signal = SIGCHLD;
7160 + current->ptrace = 0;
7161 + if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
7162 + set_user_nice(current, 0);
7163 + /* cpus_allowed? */
7164 + /* rt_priority? */
7166 + current->cap_effective = CAP_INIT_EFF_SET;
7167 + current->cap_inheritable = CAP_INIT_INH_SET;
7168 + current->cap_permitted = CAP_FULL_SET;
7169 + current->keep_capabilities = 0;
7170 + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
7171 + current->user = INIT_USER;
7173 + write_unlock_irq(&tasklist_lock);
7177 + * Put all the gunge required to become a kernel thread without
7178 + * attached user resources in one place where it belongs.
7181 +void daemonize(void)
7183 + struct fs_struct *fs;
7187 + * If we were started as result of loading a module, close all of the
7188 + * user space pages. We don't need them, and if we didn't close them
7189 + * they would be locked into memory.
7193 + current->session = 1;
7194 + current->pgrp = 1;
7195 + current->tty = NULL;
7197 + /* Become as one with the init task */
7199 + exit_fs(current); /* current->fs->count--; */
7200 + fs = init_task.fs;
7202 + atomic_inc(&fs->count);
7203 + exit_files(current);
7204 + current->files = init_task.files;
7205 + atomic_inc(&current->files->count);
7209 * When we die, we re-parent all our children.
7210 * Try to give them to another thread in our thread
7212 /* Make sure we're not reparenting to ourselves */
7213 p->p_opptr = child_reaper;
7215 + p->first_time_slice = 0;
7216 if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0);
7219 diff -urN linux-2.4.22.org/kernel/fork.c linux-2.4.22/kernel/fork.c
7220 --- linux-2.4.22.org/kernel/fork.c 2003-11-24 18:28:15.000000000 +0100
7221 +++ linux-2.4.22/kernel/fork.c 2003-11-24 18:39:03.000000000 +0100
7224 /* The idle threads do not count.. */
7229 unsigned long total_forks; /* Handle normal Linux uptimes. */
7232 struct task_struct *pidhash[PIDHASH_SZ];
7234 +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
7236 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
7238 unsigned long flags;
7240 if (p->pid == 0 && current->pid != 0)
7241 goto bad_fork_cleanup;
7243 - p->run_list.next = NULL;
7244 - p->run_list.prev = NULL;
7247 init_waitqueue_head(&p->wait_chldexit);
7248 p->vfork_done = NULL;
7250 init_completion(&vfork);
7252 spin_lock_init(&p->alloc_lock);
7253 + spin_lock_init(&p->switch_lock);
7256 init_sigpending(&p->pending);
7257 @@ -724,11 +723,11 @@
7261 - p->cpus_runnable = ~0UL;
7262 - p->processor = current->processor;
7264 /* ?? should we just memset this ?? */
7265 for(i = 0; i < smp_num_cpus; i++)
7266 - p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
7267 + p->per_cpu_utime[cpu_logical_map(i)] =
7268 + p->per_cpu_stime[cpu_logical_map(i)] = 0;
7269 spin_lock_init(&p->sigmask_lock);
7272 @@ -766,15 +765,27 @@
7273 p->pdeath_signal = 0;
7276 - * "share" dynamic priority between parent and child, thus the
7277 - * total amount of dynamic priorities in the system doesn't change,
7278 - * more scheduling fairness. This is only important in the first
7279 - * timeslice, on the long run the scheduling behaviour is unchanged.
7281 - p->counter = (current->counter + 1) >> 1;
7282 - current->counter >>= 1;
7283 - if (!current->counter)
7284 - current->need_resched = 1;
7285 + * Share the timeslice between parent and child, thus the
7286 + * total amount of pending timeslices in the system doesn't change,
7287 + * resulting in more scheduling fairness.
7290 + if (!current->time_slice)
7292 + p->time_slice = (current->time_slice + 1) >> 1;
7293 + current->time_slice >>= 1;
7294 + p->first_time_slice = 1;
7295 + if (!current->time_slice) {
7297 + * This case is rare, it happens when the parent has only
7298 + * a single jiffy left from its timeslice. Taking the
7299 + * runqueue lock is not a problem.
7301 + current->time_slice = 1;
7302 + scheduler_tick(0,0);
7304 + p->sleep_timestamp = jiffies;
7308 * Ok, add it to the run-queues and make it
7309 @@ -810,11 +821,16 @@
7311 if (p->ptrace & PT_PTRACED)
7312 send_sig(SIGSTOP, p, 1);
7314 - wake_up_process(p); /* do this last */
7315 + wake_up_forked_process(p); /* do this last */
7317 if (clone_flags & CLONE_VFORK)
7318 wait_for_completion(&vfork);
7321 + * Let the child process run first, to avoid most of the
7322 + * COW overhead when the child exec()s afterwards.
7324 + current->need_resched = 1;
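The split in the fork path above is plain integer halving: the child gets the rounded-up half of the parent's remaining timeslice, the parent keeps the rounded-down half, so the pair never holds more than the original grant. A sketch of the arithmetic (the slice values are invented):

#include <stdio.h>

int main(void)
{
	int ts, child, parent;

	for (ts = 1; ts <= 5; ts++) {
		child = (ts + 1) >> 1;	/* p->time_slice */
		parent = ts >> 1;	/* current->time_slice */
		printf("slice %d -> child %d, parent %d\n", ts, child, parent);
	}
	/* slice 1 -> child 1, parent 0: the rare case in the hunk above,
	 * where the parent's last jiffy is accounted immediately via
	 * scheduler_tick(). */
	return 0;
}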
7328 diff -urN linux-2.4.22.org/kernel/ksyms.c linux-2.4.22/kernel/ksyms.c
7329 --- linux-2.4.22.org/kernel/ksyms.c 2003-11-24 18:28:15.000000000 +0100
7330 +++ linux-2.4.22/kernel/ksyms.c 2003-11-24 18:39:03.000000000 +0100
7332 /* process management */
7333 EXPORT_SYMBOL(complete_and_exit);
7334 EXPORT_SYMBOL(__wake_up);
7335 -EXPORT_SYMBOL(__wake_up_sync);
7336 EXPORT_SYMBOL(wake_up_process);
7337 EXPORT_SYMBOL(sleep_on);
7338 EXPORT_SYMBOL(sleep_on_timeout);
7341 EXPORT_SYMBOL(yield);
7342 EXPORT_SYMBOL(__cond_resched);
7343 +EXPORT_SYMBOL(set_user_nice);
7344 +EXPORT_SYMBOL(nr_context_switches);
7345 EXPORT_SYMBOL(jiffies);
7346 EXPORT_SYMBOL(xtime);
7347 EXPORT_SYMBOL(do_gettimeofday);
7351 EXPORT_SYMBOL(kstat);
7352 -EXPORT_SYMBOL(nr_running);
7355 EXPORT_SYMBOL(panic);
7356 diff -urN linux-2.4.22.org/kernel/printk.c linux-2.4.22/kernel/printk.c
7357 --- linux-2.4.22.org/kernel/printk.c 2003-11-24 18:28:15.000000000 +0100
7358 +++ linux-2.4.22/kernel/printk.c 2003-11-24 18:39:03.000000000 +0100
7360 #include <linux/module.h>
7361 #include <linux/interrupt.h> /* For in_interrupt() */
7362 #include <linux/config.h>
7363 +#include <linux/delay.h>
7365 #include <asm/uaccess.h>
7367 diff -urN linux-2.4.22.org/kernel/ptrace.c linux-2.4.22/kernel/ptrace.c
7368 --- linux-2.4.22.org/kernel/ptrace.c 2003-11-24 18:28:15.000000000 +0100
7369 +++ linux-2.4.22/kernel/ptrace.c 2003-11-24 18:39:03.000000000 +0100
7371 if (child->state != TASK_STOPPED)
7374 - /* Make sure the child gets off its CPU.. */
7377 - if (!task_has_cpu(child))
7379 - task_unlock(child);
7381 - if (child->state != TASK_STOPPED)
7385 - } while (task_has_cpu(child));
7387 - task_unlock(child);
7388 + wait_task_inactive(child);
7392 diff -urN linux-2.4.22.org/kernel/sched.c linux-2.4.22/kernel/sched.c
7393 --- linux-2.4.22.org/kernel/sched.c 2003-11-24 18:28:15.000000000 +0100
7394 +++ linux-2.4.22/kernel/sched.c 2003-11-24 18:39:03.000000000 +0100
7397 * Kernel scheduler and related syscalls
7399 - * Copyright (C) 1991, 1992 Linus Torvalds
7400 + * Copyright (C) 1991-2002 Linus Torvalds
7402 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
7403 * make semaphores SMP safe
7404 * 1998-11-19 Implemented schedule_timeout() and related stuff
7405 * by Andrea Arcangeli
7406 - * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
7407 + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
7408 + * hybrid priority-list and round-robin design with
7409 + * an array-switch method of distributing timeslices
7410 + * and per-CPU runqueues. Additional code by Davide
7411 + * Libenzi, Robert Love, and Rusty Russell.
7415 - * 'sched.c' is the main kernel file. It contains scheduling primitives
7416 - * (sleep_on, wakeup, schedule etc) as well as a number of simple system
7417 - * call functions (type getpid()), which just extract a field from
7421 -#include <linux/config.h>
7422 #include <linux/mm.h>
7423 -#include <linux/init.h>
7424 -#include <linux/smp_lock.h>
7425 #include <linux/nmi.h>
7426 #include <linux/interrupt.h>
7427 -#include <linux/kernel_stat.h>
7428 -#include <linux/completion.h>
7429 -#include <linux/prefetch.h>
7430 -#include <linux/compiler.h>
7432 +#include <linux/init.h>
7433 #include <asm/uaccess.h>
7434 +#include <linux/smp_lock.h>
7435 #include <asm/mmu_context.h>
7437 -extern void timer_bh(void);
7438 -extern void tqueue_bh(void);
7439 -extern void immediate_bh(void);
7440 +#include <linux/kernel_stat.h>
7441 +#include <linux/completion.h>
7444 - * scheduler variables
7446 + * Convert user-nice values [ -20 ... 0 ... 19 ]
7447 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
7450 +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
7451 +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
7452 +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
7454 -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
7456 -extern void mem_use(void);
7458 + * 'User priority' is the nice value converted to something we
7459 + * can work with better when scaling various scheduler parameters,
7460 + * it's a [ 0 ... 39 ] range.
7462 +#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
7463 +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
7464 +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
7467 - * Scheduling quanta.
7468 + * These are the 'tuning knobs' of the scheduler:
7470 - * NOTE! The unix "nice" value influences how long a process
7471 - * gets. The nice value ranges from -20 to +19, where a -20
7472 - * is a "high-priority" task, and a "+10" is a low-priority
7475 - * We want the time-slice to be around 50ms or so, so this
7476 - * calculation depends on the value of HZ.
7479 -#define TICK_SCALE(x) ((x) >> 2)
7481 -#define TICK_SCALE(x) ((x) >> 1)
7483 -#define TICK_SCALE(x) (x)
7485 -#define TICK_SCALE(x) ((x) << 1)
7487 -#define TICK_SCALE(x) ((x) << 2)
7490 -#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
7492 + * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
7493 + * maximum timeslice is 300 msecs. Timeslices get refilled after
7494 + * they expire.
7496 +#define MIN_TIMESLICE ( 10 * HZ / 1000)
7497 +#define MAX_TIMESLICE (300 * HZ / 1000)
7498 +#define CHILD_PENALTY 50
7499 +#define PARENT_PENALTY 100
7500 +#define PRIO_BONUS_RATIO 25
7501 +#define INTERACTIVE_DELTA 2
7502 +#define MAX_SLEEP_AVG (2*HZ)
7503 +#define STARVATION_LIMIT (2*HZ)
7506 - * Init task must be ok at boot for the ix86 as we will check its signals
7507 - * via the SMP irq return path.
7510 -struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
7511 + * If a task is 'interactive' then we reinsert it in the active
7512 + * array after it has expired its current timeslice. (it will not
7513 + * continue to run immediately, it will still roundrobin with
7514 + * other interactive tasks.)
7516 + * This part scales the interactivity limit depending on niceness.
7518 + * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
7519 + * Here are a few examples of different nice levels:
7521 + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
7522 + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
7523 + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
7524 + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
7525 + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
7527 + * (the X axis represents the possible -5 ... 0 ... +5 dynamic
7528 + * priority range a task can explore, a value of '1' means the
7529 + * task is rated interactive.)
7531 + * I.e. nice +19 tasks can never get 'interactive' enough to be
7532 + * reinserted into the active array. And only heavily CPU-hog nice -20
7533 + * tasks will be expired. Default nice 0 tasks are somewhere between,
7534 + * it takes some effort for them to get interactive, but it's not
7535 + * too hard.
7538 +#define SCALE(v1,v1_max,v2_max) \
7539 + (v1) * (v2_max) / (v1_max)
7541 +#define DELTA(p) \
7542 + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
7543 + INTERACTIVE_DELTA)
7545 +#define TASK_INTERACTIVE(p) \
7546 + ((p)->prio <= (p)->static_prio - DELTA(p))
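With the defaults above (PRIO_BONUS_RATIO 25, INTERACTIVE_DELTA 2, 40 user levels), DELTA(p) reduces to nice/4 + 2 under C truncation. This self-contained sketch reproduces the thresholds behind the interactivity table:

#include <stdio.h>

#define MAX_USER_PRIO     40	/* USER_PRIO(MAX_PRIO) with the defaults */
#define PRIO_BONUS_RATIO  25
#define INTERACTIVE_DELTA 2

#define SCALE(v1, v1_max, v2_max) ((v1) * (v2_max) / (v1_max))
#define DELTA(nice) \
	(SCALE((nice), 40, MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) + \
	 INTERACTIVE_DELTA)

int main(void)
{
	int nices[] = { -20, -10, 0, 10, 19 }, i;

	/* TASK_INTERACTIVE() holds when prio <= static_prio - DELTA.
	 * The dynamic bonus can lower prio by at most 5, so a DELTA
	 * above 5 (nice +19 yields 6) is unreachable - exactly the
	 * all-zero row in the table above. */
	for (i = 0; i < 5; i++)
		printf("nice %3d: DELTA = %d\n", nices[i], DELTA(nices[i]));
	return 0;
}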
7549 - * The tasklist_lock protects the linked list of processes.
7551 - * The runqueue_lock locks the parts that actually access
7552 - * and change the run-queues, and have to be interrupt-safe.
7554 - * If both locks are to be concurrently held, the runqueue_lock
7555 - * nests inside the tasklist_lock.
7556 + * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
7557 + * to time slice values.
7559 - * task->alloc_lock nests inside tasklist_lock.
7560 + * The higher a process's priority, the bigger timeslices
7561 + * it gets during one round of execution. But even the lowest
7562 + * priority process gets MIN_TIMESLICE worth of execution time.
7564 -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
7565 -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
7567 -static LIST_HEAD(runqueue_head);
7568 +#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
7569 + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
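Plugging HZ=100 into TASK_TIMESLICE (an assumption; the common 2.4 x86 value) reproduces the millisecond figures quoted in the comment above:

#include <stdio.h>

#define HZ            100	/* assumed; typical 2.4 x86 value */
#define MIN_TIMESLICE (10 * HZ / 1000)
#define MAX_TIMESLICE (300 * HZ / 1000)
#define MAX_RT_PRIO   100
#define MAX_PRIO      (MAX_RT_PRIO + 40)
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)

#define TASK_TIMESLICE(sp) (MIN_TIMESLICE + \
	((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO - 1 - (sp)) / 39))

int main(void)
{
	int nices[] = { -20, 0, 19 }, i, ts;

	for (i = 0; i < 3; i++) {
		ts = TASK_TIMESLICE(NICE_TO_PRIO(nices[i]));
		/* 30, 15 and 1 ticks: 300, 150 and 10 msecs */
		printf("nice %3d: %2d ticks (%3d ms)\n",
		       nices[i], ts, ts * 1000 / HZ);
	}
	return 0;
}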
7572 - * We align per-CPU scheduling data on cacheline boundaries,
7573 - * to prevent cacheline ping-pong.
7574 + * These are the runqueue data structures:
7577 - struct schedule_data {
7578 - struct task_struct * curr;
7579 - cycles_t last_schedule;
7581 - char __pad [SMP_CACHE_BYTES];
7582 -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
7584 -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
7585 -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
7586 +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
7588 -struct kernel_stat kstat;
7589 -extern struct task_struct *child_reaper;
7590 +typedef struct runqueue runqueue_t;
7593 +struct prio_array {
7594 + int nr_active;
7595 + unsigned long bitmap[BITMAP_SIZE];
7596 + struct list_head queue[MAX_PRIO];
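The structure above is the heart of the O(1) design: one queue per priority plus a bitmap of non-empty queues, so picking the next task is a fixed-cost bitmap scan no matter how many tasks are queued. A toy user-space model (toy_ names are invented; singly linked lists and a byte-per-bit map stand in for struct list_head and the real bitmap):

#include <stdio.h>
#include <string.h>

#define MAX_PRIO 140

struct toy_task { int prio; struct toy_task *next; };

struct toy_prio_array {
	int nr_active;
	unsigned char bitmap[MAX_PRIO];	/* byte per bit, for clarity */
	struct toy_task *queue[MAX_PRIO];
};

static void toy_enqueue(struct toy_prio_array *a, struct toy_task *t)
{
	t->next = a->queue[t->prio];
	a->queue[t->prio] = t;
	a->bitmap[t->prio] = 1;
	a->nr_active++;
}

static struct toy_task *toy_pick(struct toy_prio_array *a)
{
	int prio;

	for (prio = 0; prio < MAX_PRIO; prio++)	/* sched_find_first_bit() */
		if (a->bitmap[prio])
			return a->queue[prio];
	return NULL;
}

int main(void)
{
	struct toy_prio_array a;
	struct toy_task t1 = { 120, NULL }, t2 = { 100, NULL };

	memset(&a, 0, sizeof(a));
	toy_enqueue(&a, &t1);
	toy_enqueue(&a, &t2);
	printf("next prio: %d\n", toy_pick(&a)->prio);	/* 100 wins */
	return 0;
}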
7599 -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
7600 -#define can_schedule(p,cpu) \
7601 - ((p)->cpus_runnable & (p)->cpus_allowed & (1UL << cpu))
7603 + * This is the main, per-CPU runqueue data structure.
7605 + * Locking rule: those places that want to lock multiple runqueues
7606 + * (such as the load balancing or the process migration code) must
7607 + * order lock acquire operations by ascending &runqueue.
7609 +struct runqueue {
7610 + spinlock_t lock;
7611 + unsigned long nr_running, nr_switches, expired_timestamp;
7612 + task_t *curr, *idle;
7613 + prio_array_t *active, *expired, arrays[2];
7614 + long nr_uninterruptible;
7617 + int prev_nr_running[NR_CPUS];
7618 + task_t *migration_thread;
7619 + struct list_head migration_queue;
7621 +} ____cacheline_aligned;
7624 +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
7626 -#define idle_task(cpu) (&init_task)
7627 -#define can_schedule(p,cpu) (1)
7628 +#define cpu_rq(cpu) (runqueues + (cpu))
7629 +#define this_rq() cpu_rq(smp_processor_id())
7630 +#define task_rq(p) cpu_rq((p)->cpu)
7631 +#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
7632 +#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
7635 + * Default context-switch locking:
7637 +#ifndef prepare_arch_switch
7638 +# define prepare_arch_switch(rq, next) do { } while(0)
7639 +# define finish_arch_switch(rq, prev) spin_unlock_irq(&(rq)->lock)
7642 -void scheduling_functions_start_here(void) { }
7645 - * This is the function that decides how desirable a process is..
7646 - * You can weigh different processes against each other depending
7647 - * on what CPU they've run on lately etc to try to handle cache
7648 - * and TLB miss penalties.
7651 - * -1000: never select this
7652 - * 0: out of time, recalculate counters (but it might still be
7654 - * +ve: "goodness" value (the larger, the better)
7655 - * +1000: realtime process, select this.
7656 + * task_rq_lock - lock the runqueue a given task resides on and disable
7657 + * interrupts. Note the ordering: we can safely lookup the task_rq without
7658 + * explicitly disabling preemption.
7661 -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
7662 +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
7667 - * select the current process after every other
7668 - * runnable process, but before the idle thread.
7669 - * Also, dont trigger a counter recalculation.
7672 - if (p->policy & SCHED_YIELD)
7674 + struct runqueue *rq;
7677 - * Non-RT process - normal case first.
7679 - if (p->policy == SCHED_OTHER) {
7681 - * Give the process a first-approximation goodness value
7682 - * according to the number of clock-ticks it has left.
7684 - * Don't do any other calculations if the time slice is
7687 - weight = p->counter;
7692 - /* Give a largish advantage to the same processor... */
7693 - /* (this is equivalent to penalizing other processors) */
7694 - if (p->processor == this_cpu)
7695 - weight += PROC_CHANGE_PENALTY;
7698 - /* .. and a slight advantage to the current MM */
7699 - if (p->mm == this_mm || !p->mm)
7701 - weight += 20 - p->nice;
7703 +repeat_lock_task:
7704 + rq = task_rq(p);
7705 + spin_lock_irqsave(&rq->lock, *flags);
7706 + if (unlikely(rq != task_rq(p))) {
7707 + spin_unlock_irqrestore(&rq->lock, *flags);
7708 + goto repeat_lock_task;
7709 + }
7710 + return rq;
7714 - * Realtime process, select the first one on the
7715 - * runqueue (taking priorities within processes
7718 - weight = 1000 + p->rt_priority;
7721 +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
7723 + spin_unlock_irqrestore(&rq->lock, *flags);
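The lock-then-revalidate loop in task_rq_lock() above is the standard way to lock a per-CPU structure keyed by a field that can change before the lock is held. A hedged generic sketch of the same shape (the toy_ names are invented, and the "lock" is a plain flag, not a real spinlock; single-threaded here, so the recheck never fires, but the control flow is the point):

#include <stdio.h>

#define NR_CPUS 4

struct toy_rq   { int locked; };
struct toy_task { int cpu; };

static struct toy_rq toy_runqueues[NR_CPUS];

static struct toy_rq *toy_task_rq_lock(struct toy_task *p)
{
	struct toy_rq *rq;

repeat:
	rq = &toy_runqueues[p->cpu];
	rq->locked = 1;				/* spin_lock_irqsave() */
	if (rq != &toy_runqueues[p->cpu]) {	/* migrated in the window */
		rq->locked = 0;			/* spin_unlock_irqrestore() */
		goto repeat;
	}
	return rq;
}

int main(void)
{
	struct toy_task t = { 2 };
	struct toy_rq *rq = toy_task_rq_lock(&t);

	printf("locked runqueue %ld\n", (long)(rq - toy_runqueues));
	rq->locked = 0;
	return 0;
}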
7727 - * the 'goodness value' of replacing a process on a given CPU.
7728 - * positive value means 'replace', zero or negative means 'dont'.
7729 + * Adding/removing a task to/from a priority array:
7731 -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
7732 +static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
7734 - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
7735 + array->nr_active--;
7736 + list_del(&p->run_list);
7737 + if (list_empty(array->queue + p->prio))
7738 + __clear_bit(p->prio, array->bitmap);
7742 - * This is ugly, but reschedule_idle() is very timing-critical.
7743 - * We are called with the runqueue spinlock held and we must
7744 - * not claim the tasklist_lock.
7746 -static FASTCALL(void reschedule_idle(struct task_struct * p));
7747 +#define enqueue_task(p, array) __enqueue_task(p, array, NULL)
7748 +static inline void __enqueue_task(struct task_struct *p, prio_array_t *array, task_t * parent)
7750 + if (!parent) {
7751 + list_add_tail(&p->run_list, array->queue + p->prio);
7752 + __set_bit(p->prio, array->bitmap);
7754 + } else {
7755 + list_add_tail(&p->run_list, &parent->run_list);
7756 + array = p->array = parent->array;
7757 + }
7758 + array->nr_active++;
7759 + p->array = array;
7761 -static void reschedule_idle(struct task_struct * p)
7762 +static inline int effective_prio(task_t *p)
7765 - int this_cpu = smp_processor_id();
7766 - struct task_struct *tsk, *target_tsk;
7767 - int cpu, best_cpu, i, max_prio;
7768 - cycles_t oldest_idle;
7772 - * shortcut if the woken up task's last CPU is
7774 + * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG]
7775 + * into the -5 ... 0 ... +5 bonus/penalty range.
7777 + * We use 25% of the full 0...39 priority range so that:
7779 + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
7780 + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
7782 + * Both properties are important to certain workloads.
7784 - best_cpu = p->processor;
7785 - if (can_schedule(p, best_cpu)) {
7786 - tsk = idle_task(best_cpu);
7787 - if (cpu_curr(best_cpu) == tsk) {
7791 - * If need_resched == -1 then we can skip sending
7792 - * the IPI altogether, tsk->need_resched is
7793 - * actively watched by the idle thread.
7795 - need_resched = tsk->need_resched;
7796 - tsk->need_resched = 1;
7797 - if ((best_cpu != this_cpu) && !need_resched)
7798 - smp_send_reschedule(best_cpu);
7802 + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
7803 + MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
7806 - * We know that the preferred CPU has a cache-affine current
7807 - * process, lets try to find a new idle CPU for the woken-up
7808 - * process. Select the least recently active idle CPU. (that
7809 - * one will have the least active cache context.) Also find
7810 - * the executing process which has the least priority.
7812 - oldest_idle = (cycles_t) -1;
7813 - target_tsk = NULL;
7815 + prio = p->static_prio - bonus;
7816 + if (prio < MAX_RT_PRIO)
7817 + prio = MAX_RT_PRIO;
7818 + if (prio > MAX_PRIO-1)
7819 + prio = MAX_PRIO-1;
7820 + return prio;
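With the defaults (MAX_USER_PRIO 40, PRIO_BONUS_RATIO 25) and an assumed HZ of 100, the bonus expression in effective_prio() above maps sleep_avg linearly onto -5..+5 before clamping. A worked rendering:

#include <stdio.h>

#define HZ               100	/* assumed; typical 2.4 x86 value */
#define MAX_USER_PRIO    40
#define PRIO_BONUS_RATIO 25
#define MAX_SLEEP_AVG    (2 * HZ)

int main(void)
{
	long sleep_avg, bonus;

	/* CPU hogs (sleep_avg near 0) are penalized 5 priority levels,
	 * heavy sleepers are boosted 5 levels. */
	for (sleep_avg = 0; sleep_avg <= MAX_SLEEP_AVG;
	     sleep_avg += MAX_SLEEP_AVG / 4) {
		bonus = MAX_USER_PRIO * PRIO_BONUS_RATIO * sleep_avg
				/ MAX_SLEEP_AVG / 100
			- MAX_USER_PRIO * PRIO_BONUS_RATIO / 100 / 2;
		printf("sleep_avg %3ld -> bonus %+ld\n", sleep_avg, bonus);
	}
	return 0;	/* prints -5, -3, 0, +2, +5 */
}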
7823 - for (i = 0; i < smp_num_cpus; i++) {
7824 - cpu = cpu_logical_map(i);
7825 - if (!can_schedule(p, cpu))
7827 - tsk = cpu_curr(cpu);
7828 +#define activate_task(p, rq) __activate_task(p, rq, NULL)
7829 +static inline void __activate_task(task_t *p, runqueue_t *rq, task_t * parent)
7831 + unsigned long sleep_time = jiffies - p->sleep_timestamp;
7832 + prio_array_t *array = rq->active;
7834 + if (!parent && !rt_task(p) && sleep_time) {
7836 - * We use the first available idle CPU. This creates
7837 - * a priority list between idle CPUs, but this is not
7839 + * This code gives a bonus to interactive tasks. We update
7840 + * an 'average sleep time' value here, based on
7841 + * sleep_timestamp. The more time a task spends sleeping,
7842 + * the higher the average gets - and the higher the priority
7843 + * boost gets as well.
7845 - if (tsk == idle_task(cpu)) {
7846 -#if defined(__i386__) && defined(CONFIG_SMP)
7848 - * Check if two siblings are idle in the same
7849 - * physical package. Use them if found.
7851 - if (smp_num_siblings == 2) {
7852 - if (cpu_curr(cpu_sibling_map[cpu]) ==
7853 - idle_task(cpu_sibling_map[cpu])) {
7854 - oldest_idle = last_schedule(cpu);
7861 - if (last_schedule(cpu) < oldest_idle) {
7862 - oldest_idle = last_schedule(cpu);
7866 - if (oldest_idle == (cycles_t)-1) {
7867 - int prio = preemption_goodness(tsk, p, cpu);
7869 - if (prio > max_prio) {
7878 - if (oldest_idle != (cycles_t)-1) {
7879 - best_cpu = tsk->processor;
7880 - goto send_now_idle;
7882 - tsk->need_resched = 1;
7883 - if (tsk->processor != this_cpu)
7884 - smp_send_reschedule(tsk->processor);
7885 + p->sleep_timestamp = jiffies;
7886 + p->sleep_avg += sleep_time;
7887 + if (p->sleep_avg > MAX_SLEEP_AVG)
7888 + p->sleep_avg = MAX_SLEEP_AVG;
7889 + p->prio = effective_prio(p);
7893 + __enqueue_task(p, array, parent);
7898 - int this_cpu = smp_processor_id();
7899 - struct task_struct *tsk;
7900 +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
7902 + rq->nr_running--;
7903 + if (p->state == TASK_UNINTERRUPTIBLE)
7904 + rq->nr_uninterruptible++;
7905 + dequeue_task(p, p->array);
7906 + p->array = NULL;
7909 +static inline void resched_task(task_t *p)
7914 - tsk = cpu_curr(this_cpu);
7915 - if (preemption_goodness(tsk, p, this_cpu) > 0)
7916 - tsk->need_resched = 1;
7917 + need_resched = p->need_resched;
7918 + set_tsk_need_resched(p);
7919 + if (!need_resched && (p->cpu != smp_processor_id()))
7920 + smp_send_reschedule(p->cpu);
7921 +#else
7922 + set_tsk_need_resched(p);
7923 +#endif
7931 - * This has to add the process to the _end_ of the
7932 - * run-queue, not the beginning. The goodness value will
7933 - * determine whether this process will run next. This is
7934 - * important to get SCHED_FIFO and SCHED_RR right, where
7935 - * a process that is either pre-empted or its time slice
7936 - * has expired, should be moved to the tail of the run
7937 - * queue for its priority - Bhavesh Davda
7938 + * Wait for a process to unschedule. This is used by the exit() and
7941 -static inline void add_to_runqueue(struct task_struct * p)
7942 +void wait_task_inactive(task_t * p)
7944 - list_add_tail(&p->run_list, &runqueue_head);
7946 + unsigned long flags;
7951 + if (unlikely(rq->curr == p)) {
7956 + rq = task_rq_lock(p, &flags);
7957 + if (unlikely(rq->curr == p)) {
7958 + task_rq_unlock(rq, &flags);
7961 + task_rq_unlock(rq, &flags);
7964 -static inline void move_last_runqueue(struct task_struct * p)
7966 + * Kick the remote CPU if the task is running currently,
7967 + * this code is used by the signal code to signal tasks
7968 + * which are in user-mode as quickly as possible.
7970 + * (Note that we do this lockless - if the task does anything
7971 + * while the message is in flight then it will notice the
7972 + * sigpending condition anyway.)
7974 +void kick_if_running(task_t * p)
7976 - list_del(&p->run_list);
7977 - list_add_tail(&p->run_list, &runqueue_head);
7978 + if (p == task_rq(p)->curr && p->cpu != smp_processor_id())
7979 + resched_task(p);
7984 +static int FASTCALL(reschedule_idle(task_t * p));
7985 +static void FASTCALL(load_balance(runqueue_t *this_rq, int idle));
7990 * Wake up a process. Put it on the run-queue if it's not
7991 @@ -345,429 +338,721 @@
7992 * progress), and as such you're allowed to do the simpler
7993 * "current->state = TASK_RUNNING" to mark yourself runnable
7994 * without the overhead of this.
7996 + * returns failure only if the task is already active.
7998 -static inline int try_to_wake_up(struct task_struct * p, int synchronous)
7999 +static int try_to_wake_up(task_t * p, int sync)
8001 unsigned long flags;
8006 + int migrated_to_idle = 0;
8012 + rq = task_rq_lock(p, &flags);
8013 + old_state = p->state;
8016 + if (likely(rq->curr != p)) {
8018 + if (unlikely(sync)) {
8019 + if (p->cpu != smp_processor_id() &&
8020 + p->cpus_allowed & (1UL << smp_processor_id())) {
8021 + p->cpu = smp_processor_id();
8022 + goto migrated_task;
8025 + if (reschedule_idle(p))
8026 + goto migrated_task;
8030 + if (old_state == TASK_UNINTERRUPTIBLE)
8031 + rq->nr_uninterruptible--;
8032 + activate_task(p, rq);
8033 + if (p->prio < rq->curr->prio)
8034 + resched_task(rq->curr);
8037 + p->state = TASK_RUNNING;
8041 - * We want the common case fall through straight, thus the goto.
8042 + * Subtle: we can load_balance only here (before unlock)
8043 + * because it can internally drop the lock. Claim
8044 + * that the cpu is running so it will be a light rebalance,
8045 + * if this cpu will go idle soon schedule() will trigger the
8046 + * idle rescheduling balancing by itself.
8048 - spin_lock_irqsave(&runqueue_lock, flags);
8049 - p->state = TASK_RUNNING;
8050 - if (task_on_runqueue(p))
8052 - add_to_runqueue(p);
8053 - if (!synchronous || !(p->cpus_allowed & (1UL << smp_processor_id())))
8054 - reschedule_idle(p);
8057 - spin_unlock_irqrestore(&runqueue_lock, flags);
8058 + if (success && migrated_to_idle)
8059 + load_balance(rq, 0);
8062 + task_rq_unlock(rq, &flags);
8068 + task_rq_unlock(rq, &flags);
8069 + migrated_to_idle = 1;
8070 + goto repeat_lock_task;
8074 -inline int wake_up_process(struct task_struct * p)
8075 +int wake_up_process(task_t * p)
8077 return try_to_wake_up(p, 0);
8080 -static void process_timeout(unsigned long __data)
8081 +void wake_up_forked_process(task_t * p)
8083 - struct task_struct * p = (struct task_struct *) __data;
8085 + task_t * parent = current;
8087 - wake_up_process(p);
8090 + spin_lock_irq(&rq->lock);
8093 - * schedule_timeout - sleep until timeout
8094 - * @timeout: timeout value in jiffies
8096 - * Make the current task sleep until @timeout jiffies have
8097 - * elapsed. The routine will return immediately unless
8098 - * the current task state has been set (see set_current_state()).
8100 - * You can set the task state as follows -
8102 - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
8103 - * pass before the routine returns. The routine will return 0
8105 - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
8106 - * delivered to the current task. In this case the remaining time
8107 - * in jiffies will be returned, or 0 if the timer expired in time
8109 - * The current task state is guaranteed to be TASK_RUNNING when this
8110 - * routine returns.
8112 - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
8113 - * the CPU away without a bound on the timeout. In this case the return
8114 - * value will be %MAX_SCHEDULE_TIMEOUT.
8116 - * In all cases the return value is guaranteed to be non-negative.
8118 -signed long schedule_timeout(signed long timeout)
8120 - struct timer_list timer;
8121 - unsigned long expire;
8122 + p->state = TASK_RUNNING;
8123 + if (likely(!rt_task(p) && parent->array)) {
8125 + * We decrease the sleep average of forked
8126 + * children, to keep max-interactive tasks
8127 + * from forking tasks that are max-interactive.
8128 + * CHILD_PENALTY is set to 50% since we have
8129 + * no clue if this is still an interactive
8130 + * task like the parent or if this will be a
8131 + * cpu bound task. The parent isn't touched
8132 + * as we don't make assumptions about the parent
8133 + * changing behaviour after the child is forked.
8135 + parent->sleep_avg = parent->sleep_avg * PARENT_PENALTY / 100;
8136 + p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
8140 - case MAX_SCHEDULE_TIMEOUT:
8142 - * These two special cases are useful to be comfortable
8143 - * in the caller. Nothing more. We could take
8144 - * MAX_SCHEDULE_TIMEOUT from one of the negative value
8145 - * but I' d like to return a valid offset (>=0) to allow
8146 - * the caller to do everything it want with the retval.
8147 + * For its first schedule keep the child at the same
8148 + * priority (i.e. in the same list) of the parent,
8149 + * activate_forked_task() will take care to put the
8150 + * child in front of the parent (lifo) to guarantee a
8151 + * schedule-child-first behaviour after fork.
8156 + p->prio = parent->prio;
8159 - * Another bit of PARANOID. Note that the retval will be
8160 - * 0 since no piece of kernel is supposed to do a check
8161 - * for a negative retval of schedule_timeout() (since it
8162 - * should never happens anyway). You just have the printk()
8163 - * that will tell you if something is gone wrong and where.
8164 + * Take the usual wakeup path if it's RT or if
8165 + * it's a child of the first idle task (during boot
8170 - printk(KERN_ERR "schedule_timeout: wrong timeout "
8171 - "value %lx from %p\n", timeout,
8172 - __builtin_return_address(0));
8173 - current->state = TASK_RUNNING;
8176 + p->prio = effective_prio(p);
8180 - expire = timeout + jiffies;
8181 + p->cpu = smp_processor_id();
8182 + __activate_task(p, rq, parent);
8183 + spin_unlock_irq(&rq->lock);
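The sleep_avg scaling in wake_up_forked_process() above is worth a worked example: with PARENT_PENALTY 100 the parent keeps its average (100/100), while the child starts with half, so a max-interactive parent cannot mass-produce max-interactive children. The starting value below is invented:

#include <stdio.h>

#define PARENT_PENALTY 100
#define CHILD_PENALTY   50

int main(void)
{
	long parent_avg = 180, child_avg = 180;	/* inherited copy at fork */

	parent_avg = parent_avg * PARENT_PENALTY / 100;
	child_avg = child_avg * CHILD_PENALTY / 100;
	printf("parent %ld, child %ld\n", parent_avg, child_avg); /* 180, 90 */
	return 0;
}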
8186 - init_timer(&timer);
8187 - timer.expires = expire;
8188 - timer.data = (unsigned long) current;
8189 - timer.function = process_timeout;
8191 + * Potentially available exiting-child timeslices are
8192 + * retrieved here - this way the parent does not get
8193 + * penalized for creating too many processes.
8195 + * (this cannot be used to 'generate' timeslices
8196 + * artificially, because any timeslice recovered here
8197 + * was given away by the parent in the first place.)
8199 +void sched_exit(task_t * p)
8202 + if (p->first_time_slice) {
8203 + current->time_slice += p->time_slice;
8204 + if (unlikely(current->time_slice > MAX_TIMESLICE))
8205 + current->time_slice = MAX_TIMESLICE;
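In numbers: a child that exits with 12 ticks of its first timeslice left hands them back, and the parent's slice is clamped. The MAX_TIMESLICE value is assumed here for illustration:

/* Illustrative user-space demo of the sched_exit() clamp above.
 * MAX_TIMESLICE = 30 is an assumed value; the real constant is
 * defined elsewhere in the patch. */
#include <stdio.h>

#define MAX_TIMESLICE 30

int main(void)
{
	unsigned int parent_slice = 25;
	unsigned int child_slice = 12;	/* unused part of a first timeslice */

	parent_slice += child_slice;		/* 37 */
	if (parent_slice > MAX_TIMESLICE)
		parent_slice = MAX_TIMESLICE;	/* clamped back to 30 */

	printf("parent timeslice: %u\n", parent_slice);
	return 0;
}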
8210 - add_timer(&timer);
8212 - del_timer_sync(&timer);
8214 +asmlinkage void schedule_tail(task_t *prev)
8216 + finish_arch_switch(this_rq(), prev);
8220 +static inline task_t * context_switch(task_t *prev, task_t *next)
8222 + struct mm_struct *mm = next->mm;
8223 + struct mm_struct *oldmm = prev->active_mm;
8225 - timeout = expire - jiffies;
8226 + if (unlikely(!mm)) {
8227 + next->active_mm = oldmm;
8228 + atomic_inc(&oldmm->mm_count);
8229 + enter_lazy_tlb(oldmm, next, smp_processor_id());
8231 + switch_mm(oldmm, mm, next, smp_processor_id());
8234 - return timeout < 0 ? 0 : timeout;
8235 + if (unlikely(!prev->mm)) {
8236 + prev->active_mm = NULL;
8240 + /* Here we just switch the register state and the stack. */
8241 + switch_to(prev, next, prev);
8247 - * schedule_tail() is getting called from the fork return path. This
8248 - * cleans up all remaining scheduler things, without impacting the
8251 -static inline void __schedule_tail(struct task_struct *prev)
8252 +unsigned long nr_running(void)
8256 + unsigned long i, sum = 0;
8259 - * prev->policy can be written from here only before `prev'
8260 - * can be scheduled (before setting prev->cpus_runnable to ~0UL).
8261 - * Of course it must also be read before allowing prev
8262 - * to be rescheduled, but since the write depends on the read
8263 - * to complete, wmb() is enough. (the spin_lock() acquired
8264 - * before setting cpus_runnable is not enough because the spin_lock()
8265 - * common code semantics allows code outside the critical section
8266 - * to enter inside the critical section)
8268 - policy = prev->policy;
8269 - prev->policy = policy & ~SCHED_YIELD;
8271 + for (i = 0; i < smp_num_cpus; i++)
8272 + sum += cpu_rq(cpu_logical_map(i))->nr_running;
8275 - * fast path falls through. We have to clear cpus_runnable before
8276 - * checking prev->state to avoid a wakeup race. Protect against
8277 - * the task exiting early.
8280 - task_release_cpu(prev);
8282 - if (prev->state == TASK_RUNNING)
8283 - goto needs_resched;
8288 - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
8290 +/* Note: the per-cpu information is useful only to get the cumulative result */
8291 +unsigned long nr_uninterruptible(void)
8293 + unsigned long i, sum = 0;
8296 - * Slow path - we 'push' the previous process and
8297 - * reschedule_idle() will attempt to find a new
8298 - * processor for it. (but it might preempt the
8299 - * current process as well.) We must take the runqueue
8300 - * lock and re-check prev->state to be correct. It might
8301 - * still happen that this process has a preemption
8302 - * 'in progress' already - but this is not a problem and
8303 - * might happen in other circumstances as well.
8307 - unsigned long flags;
8308 + for (i = 0; i < smp_num_cpus; i++)
8309 + sum += cpu_rq(cpu_logical_map(i))->nr_uninterruptible;
8312 - * Avoid taking the runqueue lock in cases where
8313 - * no preemption-check is necessery:
8315 - if ((prev == idle_task(smp_processor_id())) ||
8316 - (policy & SCHED_YIELD))
8321 - spin_lock_irqsave(&runqueue_lock, flags);
8322 - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
8323 - reschedule_idle(prev);
8324 - spin_unlock_irqrestore(&runqueue_lock, flags);
8328 - prev->policy &= ~SCHED_YIELD;
8329 -#endif /* CONFIG_SMP */
8330 +unsigned long nr_context_switches(void)
8332 + unsigned long i, sum = 0;
8334 + for (i = 0; i < smp_num_cpus; i++)
8335 + sum += cpu_rq(cpu_logical_map(i))->nr_switches;
8340 -asmlinkage void schedule_tail(struct task_struct *prev)
8341 +inline int idle_cpu(int cpu)
8343 - __schedule_tail(prev);
8344 + return cpu_curr(cpu) == cpu_rq(cpu)->idle;
8349 - * 'schedule()' is the scheduler function. It's a very simple and nice
8350 - * scheduler: it's not perfect, but certainly works for most things.
8352 - * The goto is "interesting".
8354 - * NOTE!! Task 0 is the 'idle' task, which gets called when no other
8355 - * tasks can run. It can not be killed, and it cannot sleep. The 'state'
8356 - * information in task[0] is never used.
8357 + * Lock the busiest runqueue as well; this_rq is locked already.
8358 + * Recalculate nr_running if we have to drop the runqueue lock.
8360 -asmlinkage void schedule(void)
8361 +static inline unsigned int double_lock_balance(runqueue_t *this_rq,
8362 + runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
8364 - struct schedule_data * sched_data;
8365 - struct task_struct *prev, *next, *p;
8366 - struct list_head *tmp;
8368 + if (unlikely(!spin_trylock(&busiest->lock))) {
8369 + if (busiest < this_rq) {
8370 + spin_unlock(&this_rq->lock);
8371 + spin_lock(&busiest->lock);
8372 + spin_lock(&this_rq->lock);
8373 + /* Need to recalculate nr_running */
8374 + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
8375 + nr_running = this_rq->nr_running;
8377 + nr_running = this_rq->prev_nr_running[this_cpu];
8379 + spin_lock(&busiest->lock);
8381 + return nr_running;
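The trylock-then-reorder dance above avoids the classic AB-BA deadlock by always acquiring the lower-addressed runqueue lock first. A minimal user-space sketch of the same ordering protocol, with pthread mutexes standing in for runqueue spinlocks:

/* Sketch of the ordered double-lock protocol behind
 * double_lock_balance() and double_rq_lock(): two lockers that each
 * want both locks cannot deadlock if both take the lower-addressed
 * lock first. */
#include <pthread.h>

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
	} else if (a < b) {		/* lower address first */
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}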
8385 + * Move a task from a remote runqueue to the local runqueue.
8386 + * Both runqueues must be locked.
8388 +static inline int pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
8392 - spin_lock_prefetch(&runqueue_lock);
8393 + dequeue_task(p, src_array);
8394 + src_rq->nr_running--;
8395 + p->cpu = this_cpu;
8396 + this_rq->nr_running++;
8397 + enqueue_task(p, this_rq->active);
8399 + * Note that idle threads have a prio of MAX_PRIO, so this test
8400 + * is always true for them.
8402 + if (p->prio < this_rq->curr->prio)
8405 - BUG_ON(!current->active_mm);
8408 - this_cpu = prev->processor;
8412 - if (unlikely(in_interrupt())) {
8413 - printk("Scheduling in interrupt\n");
8415 +static inline int idle_cpu_reschedule(task_t * p, int cpu)
8417 + if (unlikely(!(p->cpus_allowed & (1UL << cpu))))
8419 + return idle_cpu(cpu);
8422 +#include <linux/smp_balance.h>
8424 +static int reschedule_idle(task_t * p)
8426 + int p_cpu = p->cpu, i;
8428 + if (idle_cpu(p_cpu))
8431 + p_cpu = cpu_number_map(p_cpu);
8433 + for (i = (p_cpu + 1) % smp_num_cpus;
8435 + i = (i + 1) % smp_num_cpus) {
8436 + int physical = cpu_logical_map(i);
8438 + if (idle_cpu_reschedule(p, physical)) {
8439 + physical = arch_reschedule_idle_override(p, physical);
8440 + p->cpu = physical;
8445 - release_kernel_lock(prev, this_cpu);
8450 + * Current runqueue is empty, or rebalance tick: if there is an
8451 + * imbalance (current runqueue is too short) then pull from
8452 + * busiest runqueue(s).
8454 + * We call this with the current runqueue locked, irqs disabled.
8457 +static void load_balance(runqueue_t *this_rq, int idle)
8459 + int imbalance, nr_running, load, max_load,
8460 + idx, i, this_cpu = this_rq - runqueues;
8462 + runqueue_t *busiest, *rq_src;
8463 + prio_array_t *array;
8464 + struct list_head *head, *curr;
8468 - * 'sched_data' is protected by the fact that we can run
8469 - * only one process per CPU.
8470 + * Handle architecture-specific balancing, such as hyperthreading.
8472 - sched_data = & aligned_data[this_cpu].schedule_data;
8473 + if (arch_load_balance(this_cpu, idle))
8476 - spin_lock_irq(&runqueue_lock);
8479 + * We search all runqueues to find the busiest one.
8480 + * We do this lockless to reduce cache-bouncing overhead;
8481 + * we re-check the 'best' source CPU later on again, with
8484 + * We fend off statistical fluctuations in runqueue lengths by
8485 + * saving the runqueue length during the previous load-balancing
8486 + * operation and using the smaller of the current and saved lengths.
8487 + * If a runqueue stays long over a longer period of time then
8488 + * we recognize it and pull tasks from it.
8490 + * The 'current runqueue length' is a statistical maximum variable;
8491 + * for that one we take the longer value - to avoid fluctuations in
8492 + * the other direction. So for a load-balance to happen it needs a
8493 + * stable long runqueue on the target CPU and a stable short runqueue
8494 + * on the local CPU.
8496 + * We make an exception if this CPU is about to become idle - in
8497 + * that case we are less picky about moving a task across CPUs and
8498 + * take what can be taken.
8500 + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
8501 + nr_running = this_rq->nr_running;
8503 + nr_running = this_rq->prev_nr_running[this_cpu];
8505 - /* move an exhausted RR process to be last.. */
8506 - if (unlikely(prev->policy == SCHED_RR))
8507 - if (!prev->counter) {
8508 - prev->counter = NICE_TO_TICKS(prev->nice);
8509 - move_last_runqueue(prev);
8513 + for (i = 0; i < smp_num_cpus; i++) {
8514 + int logical = cpu_logical_map(i);
8516 - switch (prev->state) {
8517 - case TASK_INTERRUPTIBLE:
8518 - if (signal_pending(prev)) {
8519 - prev->state = TASK_RUNNING;
8523 - del_from_runqueue(prev);
8524 - case TASK_RUNNING:;
8525 + rq_src = cpu_rq(logical);
8526 + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[logical]))
8527 + load = rq_src->nr_running;
8529 + load = this_rq->prev_nr_running[logical];
8530 + this_rq->prev_nr_running[logical] = rq_src->nr_running;
8532 + if ((load > max_load) && (rq_src != this_rq)) {
8537 - prev->need_resched = 0;
8539 + if (likely(!busiest))
8542 + imbalance = (max_load - nr_running) / 2;
8544 + /* It needs at least a ~25% imbalance to trigger balancing. */
8545 + if (!idle && (imbalance < (max_load + 3)/4))
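Worked through: with the local queue at 4 tasks and the busiest at 8, imbalance = (8 - 4) / 2 = 2 and the busy-path threshold is (8 + 3) / 4 = 2, so balancing proceeds; at 6 versus 4 the imbalance of 1 falls short and the function bails out. A runnable restatement:

/* Illustrative: the ~25% imbalance gate from load_balance(),
 * as standalone integer arithmetic. */
#include <stdio.h>

static int should_balance(int max_load, int nr_running, int idle)
{
	int imbalance = (max_load - nr_running) / 2;

	if (!idle && imbalance < (max_load + 3) / 4)
		return 0;	/* too small to be worth the migration */
	return 1;
}

int main(void)
{
	printf("%d\n", should_balance(8, 4, 0));	/* 1: 2 >= 2 */
	printf("%d\n", should_balance(6, 4, 0));	/* 0: 1 < 2 */
	printf("%d\n", should_balance(6, 4, 1));	/* 1: idle CPUs take anything */
	return 0;
}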
8549 - * this is the scheduler proper:
8550 + * Make sure nothing significant changed since we checked the
8551 + * runqueue length.
8553 + if (double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running) > nr_running ||
8554 + busiest->nr_running < max_load)
8555 + goto out_unlock_retry;
8559 - * Default process to select..
8560 + * We first consider expired tasks. Those will likely not be
8561 + * executed in the near future, and they are most likely to
8562 + * be cache-cold, thus switching CPUs has the least effect on them.
8565 - next = idle_task(this_cpu);
8567 - list_for_each(tmp, &runqueue_head) {
8568 - p = list_entry(tmp, struct task_struct, run_list);
8569 - if (can_schedule(p, this_cpu)) {
8570 - int weight = goodness(p, this_cpu, prev->active_mm);
8572 - c = weight, next = p;
8573 + if (busiest->expired->nr_active)
8574 + array = busiest->expired;
8576 + array = busiest->active;
8580 + /* Start searching at priority 0: */
8584 + idx = sched_find_first_bit(array->bitmap);
8586 + idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
8587 + if (idx == MAX_PRIO) {
8588 + if (array == busiest->expired) {
8589 + array = busiest->active;
8595 - /* Do we need to re-calculate counters? */
8596 - if (unlikely(!c)) {
8597 - struct task_struct *p;
8599 - spin_unlock_irq(&runqueue_lock);
8600 - read_lock(&tasklist_lock);
8602 - p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
8603 - read_unlock(&tasklist_lock);
8604 - spin_lock_irq(&runqueue_lock);
8605 - goto repeat_schedule;
8606 + head = array->queue + idx;
8607 + curr = head->prev;
8609 + tmp = list_entry(curr, task_t, run_list);
8612 + * We do not migrate tasks that are:
8613 + * 1) running (obviously), or
8614 + * 2) cannot be migrated to this CPU due to cpus_allowed, or
8615 + * 3) are cache-hot on their current CPU.
8618 +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
8619 + ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
8620 + ((p) != (rq)->curr) && \
8621 + ((p)->cpus_allowed & (1UL << (this_cpu))))
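For example, with cache_decay_ticks = 8 (an assumed value for the demo), a task that last ran 3 jiffies ago is still considered cache-hot and is skipped. A standalone restatement of the macro's three tests:

/* Illustrative restatement of CAN_MIGRATE_TASK(); types simplified
 * for a user-space demo. */
#include <stdio.h>

static int can_migrate(unsigned long now, unsigned long sleep_timestamp,
		       unsigned long cache_decay_ticks,
		       int running, int allowed_here)
{
	if (now - sleep_timestamp <= cache_decay_ticks)
		return 0;	/* 3) still cache-hot on its current CPU */
	if (running)
		return 0;	/* 1) currently running */
	if (!allowed_here)
		return 0;	/* 2) excluded by cpus_allowed */
	return 1;
}

int main(void)
{
	printf("%d\n", can_migrate(100, 97, 8, 0, 1));	/* 0: cache-hot */
	printf("%d\n", can_migrate(100, 80, 8, 0, 1));	/* 1: migratable */
	return 0;
}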
8623 + curr = curr->prev;
8625 + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
8631 + resched |= pull_task(busiest, array, tmp, this_rq, this_cpu);
8632 + if (--imbalance > 0) {
8639 + spin_unlock(&busiest->lock);
8641 + resched_task(this_rq->curr);
8644 + spin_unlock(&busiest->lock);
8649 - * from this point on nothing can prevent us from
8650 - * switching to the next task, save this fact in
8653 - sched_data->curr = next;
8654 - task_set_cpu(next, this_cpu);
8655 - spin_unlock_irq(&runqueue_lock);
8657 - if (unlikely(prev == next)) {
8658 - /* We won't go through the normal tail, so do this by hand */
8659 - prev->policy &= ~SCHED_YIELD;
8660 - goto same_process;
8662 + * Either the idle_cpu_tick() or the busy_cpu_tick() function
8663 + * gets called every timer tick, on every CPU. Our balancing action
8664 + * frequency and balancing aggressiveness depend on whether the CPU is
8667 + * busy-rebalance every 250 msec, idle-rebalance every 100 msec.
8669 +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
8670 +#define IDLE_REBALANCE_TICK (HZ/10 ?: 1)
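The GNU "?:" form (x ?: y, meaning x ? x : y with x evaluated once) guards against a low HZ rounding an interval down to zero ticks. For the common HZ=100 these come out to 25 and 10 ticks, matching the 250/100 msec cadence named above:

/* Illustrative: the rebalance intervals in ticks and milliseconds,
 * for HZ = 100. Requires GNU C for the "?:" extension, as the
 * kernel does. */
#include <stdio.h>

#define HZ 100
#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
#define IDLE_REBALANCE_TICK (HZ/10 ?: 1)

int main(void)
{
	printf("busy: %d ticks = %d msec\n",
	       BUSY_REBALANCE_TICK, BUSY_REBALANCE_TICK * 1000 / HZ);
	printf("idle: %d ticks = %d msec\n",
	       IDLE_REBALANCE_TICK, IDLE_REBALANCE_TICK * 1000 / HZ);
	return 0;
}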
8672 +static inline void idle_tick(void)
8674 + if (unlikely(time_before_eq(this_rq()->last_jiffy + IDLE_REBALANCE_TICK, jiffies))) {
8675 + spin_lock(&this_rq()->lock);
8676 + load_balance(this_rq(), 1);
8677 + spin_unlock(&this_rq()->lock);
8678 + this_rq()->last_jiffy = jiffies;
8684 - * maintain the per-process 'last schedule' value.
8685 - * (this has to be recalculated even if we reschedule to
8686 - * the same process) Currently this is only used on SMP,
8687 - * and it's approximate, so we do not have to maintain
8688 - * it while holding the runqueue spinlock.
8690 - sched_data->last_schedule = get_cycles();
8694 - * We drop the scheduler lock early (it's a global spinlock),
8695 - * thus we have to lock the previous process from getting
8696 - * rescheduled during switch_to().
8699 + * We place interactive tasks back into the active array, if possible.
8701 + * To guarantee that this does not starve expired tasks we ignore the
8702 + * interactivity of a task if the first expired task had to wait more
8703 + * than a 'reasonable' amount of time. This deadline timeout is
8704 + * load-dependent, as the frequency of array switches decreases with
8705 + * increasing number of running tasks:
8707 +#define EXPIRED_STARVING(rq) \
8708 + ((rq)->expired_timestamp && \
8709 + (jiffies - (rq)->expired_timestamp >= \
8710 + STARVATION_LIMIT * ((rq)->nr_running) + 1))
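Concretely, the deadline stretches with load. Assuming STARVATION_LIMIT is HZ (its definition lies outside this hunk, so the value is an assumption), four runnable tasks push the cutoff to 4*HZ + 1 jiffies after the first expiry:

/* Illustrative: the load-dependent starvation deadline of
 * EXPIRED_STARVING(). STARVATION_LIMIT = HZ is assumed for the demo. */
#include <stdio.h>

#define HZ 100
#define STARVATION_LIMIT HZ

static int expired_starving(unsigned long now,
			    unsigned long expired_timestamp,
			    unsigned long nr_running)
{
	return expired_timestamp &&
		now - expired_timestamp >= STARVATION_LIMIT * nr_running + 1;
}

int main(void)
{
	printf("%d\n", expired_starving(1400, 1000, 4));	/* 0: 400 < 401 */
	printf("%d\n", expired_starving(1401, 1000, 4));	/* 1: 401 >= 401 */
	return 0;
}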
8712 -#endif /* CONFIG_SMP */
8714 + * This function gets called by the timer code, with HZ frequency.
8715 + * We call it with interrupts disabled.
8717 +void scheduler_tick(int user_tick, int system)
8719 + int cpu = smp_processor_id();
8720 + runqueue_t *rq = this_rq();
8721 + task_t *p = current;
8723 - kstat.context_swtch++;
8725 - * there are 3 processes which are affected by a context switch:
8727 - * prev == .... ==> (last => next)
8729 - * It's the 'much more previous' 'prev' that is on next's stack,
8730 - * but prev is set to (the just run) 'last' process by switch_to().
8731 - * This might sound slightly confusing but makes tons of sense.
8733 - prepare_to_switch();
8735 - struct mm_struct *mm = next->mm;
8736 - struct mm_struct *oldmm = prev->active_mm;
8738 - BUG_ON(next->active_mm);
8739 - next->active_mm = oldmm;
8740 - atomic_inc(&oldmm->mm_count);
8741 - enter_lazy_tlb(oldmm, next, this_cpu);
8743 - BUG_ON(next->active_mm != mm);
8744 - switch_mm(oldmm, mm, next, this_cpu);
8745 + if (p == rq->idle) {
8746 + if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
8747 + kstat.per_cpu_system[cpu] += system;
8753 + if (TASK_NICE(p) > 0)
8754 + kstat.per_cpu_nice[cpu] += user_tick;
8756 + kstat.per_cpu_user[cpu] += user_tick;
8757 + kstat.per_cpu_system[cpu] += system;
8759 + /* Task might have expired already, but not scheduled off yet */
8760 + if (p->array != rq->active) {
8761 + set_tsk_need_resched(p);
8764 + spin_lock(&rq->lock);
8765 + if (unlikely(rt_task(p))) {
8767 + * RR tasks need a special form of timeslice management.
8768 + * FIFO tasks have no timeslices.
8770 + if ((p->policy == SCHED_RR) && !--p->time_slice) {
8771 + p->time_slice = TASK_TIMESLICE(p);
8772 + p->first_time_slice = 0;
8773 + set_tsk_need_resched(p);
8775 + /* put it at the end of the queue: */
8776 + dequeue_task(p, rq->active);
8777 + enqueue_task(p, rq->active);
8782 + * The task was running during this tick - update the
8783 + * time slice counter and the sleep average. Note: we
8784 + * do not update a process's priority until it either
8785 + * goes to sleep or uses up its timeslice. This makes
8786 + * it possible for interactive tasks to use up their
8787 + * timeslices at their highest priority levels.
8791 + if (!--p->time_slice) {
8792 + dequeue_task(p, rq->active);
8793 + set_tsk_need_resched(p);
8794 + p->prio = effective_prio(p);
8795 + p->time_slice = TASK_TIMESLICE(p);
8796 + p->first_time_slice = 0;
8798 + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
8799 + if (!rq->expired_timestamp)
8800 + rq->expired_timestamp = jiffies;
8801 + enqueue_task(p, rq->expired);
8803 + enqueue_task(p, rq->active);
8807 + if (unlikely(time_before_eq(this_rq()->last_jiffy + BUSY_REBALANCE_TICK, jiffies))) {
8808 + load_balance(rq, 0);
8809 + rq->last_jiffy = jiffies;
8812 + spin_unlock(&rq->lock);
8815 +void scheduling_functions_start_here(void) { }
8818 + * 'schedule()' is the main scheduler function.
8820 +asmlinkage void schedule(void)
8822 + task_t *prev, *next;
8824 + prio_array_t *array;
8825 + struct list_head *queue;
8828 + if (unlikely(in_interrupt()))
8832 - prev->active_mm = NULL;
8838 + release_kernel_lock(prev, smp_processor_id());
8839 + prev->sleep_timestamp = jiffies;
8840 + spin_lock_irq(&rq->lock);
8842 + switch (prev->state) {
8843 + case TASK_INTERRUPTIBLE:
8844 + if (unlikely(signal_pending(prev))) {
8845 + prev->state = TASK_RUNNING;
8849 + deactivate_task(prev, rq);
8850 + case TASK_RUNNING:
8856 + if (unlikely(!rq->nr_running)) {
8858 + load_balance(rq, 2);
8859 + rq->last_jiffy = jiffies;
8860 + if (rq->nr_running)
8861 + goto pick_next_task;
8864 + rq->expired_timestamp = 0;
8865 + goto switch_tasks;
8869 - * This just switches the register state and the
8872 - switch_to(prev, next, prev);
8873 - __schedule_tail(prev);
8874 + array = rq->active;
8875 + if (unlikely(!array->nr_active)) {
8877 + * Switch the active and expired arrays.
8879 + rq->active = rq->expired;
8880 + rq->expired = array;
8881 + array = rq->active;
8882 + rq->expired_timestamp = 0;
8885 + idx = sched_find_first_bit(array->bitmap);
8886 + queue = array->queue + idx;
8887 + next = list_entry(queue->next, task_t, run_list);
8891 + clear_tsk_need_resched(prev);
8893 + if (likely(prev != next)) {
8894 + rq->nr_switches++;
8897 + prepare_arch_switch(rq, next);
8898 + prev = context_switch(prev, next);
8901 + finish_arch_switch(rq, prev);
8903 + spin_unlock_irq(&rq->lock);
8906 reacquire_kernel_lock(current);
8907 - if (current->need_resched)
8908 - goto need_resched_back;
8910 + if (need_resched())
8911 + goto need_resched;
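What makes the pick above O(1) is sched_find_first_bit(): each priority level owns a queue and one bitmap bit, and the best runnable priority is found by a single find-first-set-bit scan, regardless of how many tasks are runnable. A compact user-space model, assuming the usual 140 levels of the O(1) design (the exact MAX_PRIO value is not shown in this hunk):

/* Illustrative model of the O(1) next-task pick: a bitmap with one
 * bit per non-empty priority queue. MAX_PRIO = 140 is an assumption;
 * bit MAX_PRIO is the search delimiter set in sched_init(). Uses
 * the GCC __builtin_ctzl() as a find-first-bit. */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO	140
#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS	((MAX_PRIO + BITS_PER_LONG) / BITS_PER_LONG)

static unsigned long bitmap[BITMAP_LONGS];

static void mark_nonempty(int prio)
{
	bitmap[prio / BITS_PER_LONG] |= 1UL << (prio % BITS_PER_LONG);
}

static int find_best_prio(void)
{
	int i;

	for (i = 0; i < BITMAP_LONGS; i++)
		if (bitmap[i])
			return i * BITS_PER_LONG + __builtin_ctzl(bitmap[i]);
	return MAX_PRIO;	/* unreachable once the delimiter is set */
}

int main(void)
{
	memset(bitmap, 0, sizeof(bitmap));
	mark_nonempty(MAX_PRIO);	/* delimiter */
	mark_nonempty(120);
	mark_nonempty(104);
	printf("best prio: %d\n", find_best_prio());	/* 104 */
	return 0;
}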
8915 - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything
8916 - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
8917 - * non-exclusive tasks and one exclusive task.
8918 + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
8919 + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
8920 + * number) then we wake all the non-exclusive tasks and one exclusive task.
8922 * There are circumstances in which we can try to wake a task which has already
8923 - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
8924 - * in this (rare) case, and we handle it by contonuing to scan the queue.
8925 + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
8926 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
8928 -static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
8929 - int nr_exclusive, const int sync)
8930 +static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync)
8932 struct list_head *tmp;
8933 - struct task_struct *p;
8935 - CHECK_MAGIC_WQHEAD(q);
8936 - WQ_CHECK_LIST_HEAD(&q->task_list);
8938 - list_for_each(tmp,&q->task_list) {
8939 - unsigned int state;
8940 - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
8941 + unsigned int state;
8942 + wait_queue_t *curr;
8945 - CHECK_MAGIC(curr->__magic);
8946 + list_for_each(tmp, &q->task_list) {
8947 + curr = list_entry(tmp, wait_queue_t, task_list);
8950 - if (state & mode) {
8951 - WQ_NOTE_WAKER(curr);
8952 - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
8953 + if ((state & mode) && try_to_wake_up(p, sync) &&
8954 + ((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive))
8960 -void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr)
8961 +void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
8964 - unsigned long flags;
8965 - wq_read_lock_irqsave(&q->lock, flags);
8966 - __wake_up_common(q, mode, nr, 0);
8967 - wq_read_unlock_irqrestore(&q->lock, flags);
8969 + unsigned long flags;
8974 + wq_read_lock_irqsave(&q->lock, flags);
8975 + __wake_up_common(q, mode, nr_exclusive, 0);
8976 + wq_read_unlock_irqrestore(&q->lock, flags);
8979 -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)
8982 +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
8985 - unsigned long flags;
8986 - wq_read_lock_irqsave(&q->lock, flags);
8987 - __wake_up_common(q, mode, nr, 1);
8988 - wq_read_unlock_irqrestore(&q->lock, flags);
8990 + unsigned long flags;
8995 + wq_read_lock_irqsave(&q->lock, flags);
8996 + if (likely(nr_exclusive))
8997 + __wake_up_common(q, mode, nr_exclusive, 1);
8999 + __wake_up_common(q, mode, nr_exclusive, 0);
9000 + wq_read_unlock_irqrestore(&q->lock, flags);
9005 void complete(struct completion *x)
9007 unsigned long flags;
9009 - spin_lock_irqsave(&x->wait.lock, flags);
9010 + wq_write_lock_irqsave(&x->wait.lock, flags);
9012 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0);
9013 - spin_unlock_irqrestore(&x->wait.lock, flags);
9014 + wq_write_unlock_irqrestore(&x->wait.lock, flags);
9017 void wait_for_completion(struct completion *x)
9019 - spin_lock_irq(&x->wait.lock);
9020 + wq_write_lock_irq(&x->wait.lock);
9022 DECLARE_WAITQUEUE(wait, current);
9024 @@ -775,14 +1060,14 @@
9025 __add_wait_queue_tail(&x->wait, &wait);
9027 __set_current_state(TASK_UNINTERRUPTIBLE);
9028 - spin_unlock_irq(&x->wait.lock);
9029 + wq_write_unlock_irq(&x->wait.lock);
9031 - spin_lock_irq(&x->wait.lock);
9032 + wq_write_lock_irq(&x->wait.lock);
9034 __remove_wait_queue(&x->wait, &wait);
9037 - spin_unlock_irq(&x->wait.lock);
9038 + wq_write_unlock_irq(&x->wait.lock);
9041 #define SLEEP_ON_VAR \
9042 @@ -850,43 +1135,40 @@
9044 void scheduling_functions_end_here(void) { }
9048 - * set_cpus_allowed() - change a given task's processor affinity
9049 - * @p: task to bind
9050 - * @new_mask: bitmask of allowed processors
9052 - * Upon return, the task is running on a legal processor. Note the caller
9053 - * must have a valid reference to the task: it must not exit() prematurely.
9054 - * This call can sleep; do not hold locks on call.
9056 -void set_cpus_allowed(struct task_struct *p, unsigned long new_mask)
9057 +void set_user_nice(task_t *p, long nice)
9059 - new_mask &= cpu_online_map;
9060 - BUG_ON(!new_mask);
9062 - p->cpus_allowed = new_mask;
9063 + unsigned long flags;
9064 + prio_array_t *array;
9067 + if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
9070 - * If the task is on a no-longer-allowed processor, we need to move
9071 - * it. If the task is not current, then set need_resched and send
9072 - * its processor an IPI to reschedule.
9073 + * We have to be careful: if called from sys_setpriority(),
9074 + * the task might be in the middle of scheduling on another CPU.
9076 - if (!(p->cpus_runnable & p->cpus_allowed)) {
9077 - if (p != current) {
9078 - p->need_resched = 1;
9079 - smp_send_reschedule(p->processor);
9081 + rq = task_rq_lock(p, &flags);
9083 + p->static_prio = NICE_TO_PRIO(nice);
9088 + dequeue_task(p, array);
9089 + p->static_prio = NICE_TO_PRIO(nice);
9090 + p->prio = NICE_TO_PRIO(nice);
9092 + enqueue_task(p, array);
9094 - * Wait until we are on a legal processor. If the task is
9095 - * current, then we should be on a legal processor the next
9096 - * time we reschedule. Otherwise, we need to wait for the IPI.
9097 + * If the task is running and either lowered or
9098 + * increased its priority, then reschedule its CPU:
9100 - while (!(p->cpus_runnable & p->cpus_allowed))
9102 + if (p == rq->curr)
9103 + resched_task(rq->curr);
9106 + task_rq_unlock(rq, &flags);
9108 -#endif /* CONFIG_SMP */
9112 @@ -898,7 +1180,7 @@
9114 asmlinkage long sys_nice(int increment)
9120 * Setpriority might change our priority at the same moment.
9121 @@ -914,32 +1196,46 @@
9125 - newprio = current->nice + increment;
9126 - if (newprio < -20)
9130 - current->nice = newprio;
9131 + nice = PRIO_TO_NICE(current->static_prio) + increment;
9136 + set_user_nice(current, nice);
9142 -static inline struct task_struct *find_process_by_pid(pid_t pid)
9144 + * This is the priority value as seen by users in /proc
9146 + * RT tasks are offset by -200. Normal tasks are centered
9147 + * around 0; the value goes from -16 to +15.
9149 +int task_prio(task_t *p)
9151 - struct task_struct *tsk = current;
9152 + return p->prio - MAX_USER_RT_PRIO;
9156 - tsk = find_task_by_pid(pid);
9158 +int task_nice(task_t *p)
9160 + return TASK_NICE(p);
9163 +static inline task_t *find_process_by_pid(pid_t pid)
9165 + return pid ? find_task_by_pid(pid) : current;
9168 -static int setscheduler(pid_t pid, int policy,
9169 - struct sched_param *param)
9170 +static int setscheduler(pid_t pid, int policy, struct sched_param *param)
9172 struct sched_param lp;
9173 - struct task_struct *p;
9174 + prio_array_t *array;
9175 + unsigned long flags;
9181 if (!param || pid < 0)
9182 @@ -953,14 +1249,19 @@
9183 * We play safe to avoid deadlocks.
9185 read_lock_irq(&tasklist_lock);
9186 - spin_lock(&runqueue_lock);
9188 p = find_process_by_pid(pid);
9194 + goto out_unlock_tasklist;
9197 + * To be able to change p->policy safely, the appropriate
9198 + * runqueue lock must be held.
9200 + rq = task_rq_lock(p, &flags);
9205 @@ -969,40 +1270,48 @@
9206 policy != SCHED_OTHER)
9212 - * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
9213 - * priority for SCHED_OTHER is 0.
9214 + * Valid priorities for SCHED_FIFO and SCHED_RR are
9215 + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0.
9218 - if (lp.sched_priority < 0 || lp.sched_priority > 99)
9219 + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
9221 if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
9225 - if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
9226 + if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
9227 !capable(CAP_SYS_NICE))
9229 if ((current->euid != p->euid) && (current->euid != p->uid) &&
9230 !capable(CAP_SYS_NICE))
9235 + deactivate_task(p, task_rq(p));
9238 p->rt_priority = lp.sched_priority;
9240 - current->need_resched = 1;
9241 + if (policy != SCHED_OTHER)
9242 + p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
9244 + p->prio = p->static_prio;
9246 + activate_task(p, task_rq(p));
9249 - spin_unlock(&runqueue_lock);
9250 + task_rq_unlock(rq, &flags);
9251 +out_unlock_tasklist:
9252 read_unlock_irq(&tasklist_lock);
9258 -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
9259 +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
9260 struct sched_param *param)
9262 return setscheduler(pid, policy, param);
9263 @@ -1015,7 +1324,7 @@
9265 asmlinkage long sys_sched_getscheduler(pid_t pid)
9267 - struct task_struct *p;
9272 @@ -1026,7 +1335,7 @@
9273 read_lock(&tasklist_lock);
9274 p = find_process_by_pid(pid);
9276 - retval = p->policy & ~SCHED_YIELD;
9277 + retval = p->policy;
9278 read_unlock(&tasklist_lock);
9281 @@ -1035,7 +1344,7 @@
9283 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
9285 - struct task_struct *p;
9287 struct sched_param lp;
9290 @@ -1066,42 +1375,64 @@
9292 asmlinkage long sys_sched_yield(void)
9295 - * Trick. sched_yield() first counts the number of truly
9296 - * 'pending' runnable processes, then returns if it's
9297 - * only the current processes. (This test does not have
9298 - * to be atomic.) In threaded applications this optimization
9299 - * gets triggered quite often.
9301 + runqueue_t *rq = this_rq();
9302 + prio_array_t *array;
9305 - int nr_pending = nr_running;
9306 + spin_lock_irq(&rq->lock);
9308 + if (unlikely(rq->nr_running == 1)) {
9309 + spin_unlock_irq(&rq->lock);
9315 + array = current->array;
9316 + if (unlikely(rt_task(current))) {
9317 + list_del(&current->run_list);
9318 + list_add_tail(&current->run_list, array->queue + current->prio);
9322 - // Subtract non-idle processes running on other CPUs.
9323 - for (i = 0; i < smp_num_cpus; i++) {
9324 - int cpu = cpu_logical_map(i);
9325 - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
9327 + if (unlikely(array == rq->expired) && rq->active->nr_active)
9330 + list_del(&current->run_list);
9331 + if (!list_empty(array->queue + current->prio)) {
9332 + list_add(&current->run_list, array->queue[current->prio].next);
9336 - // on UP this process is on the runqueue as well
9341 + __clear_bit(current->prio, array->bitmap);
9342 + if (likely(array == rq->active) && array->nr_active == 1) {
9344 - * This process can only be rescheduled by us,
9345 - * so this is safe without any locking.
9346 + * We're the last task in the active queue so
9347 + * we must move ourselves to the expired array
9348 + * to avoid running again immediately.
9350 - if (current->policy == SCHED_OTHER)
9351 - current->policy |= SCHED_YIELD;
9352 - current->need_resched = 1;
9354 - spin_lock_irq(&runqueue_lock);
9355 - move_last_runqueue(current);
9356 - spin_unlock_irq(&runqueue_lock);
9357 + array->nr_active--;
9358 + array = rq->expired;
9359 + array->nr_active++;
9362 + i = sched_find_first_bit(array->bitmap);
9364 + BUG_ON(i == MAX_PRIO);
9365 + BUG_ON(i == current->prio && array == current->array);
9367 + if (array == current->array && i < current->prio)
9368 + i = current->prio;
9370 + current->array = array;
9371 + current->prio = i;
9373 + list_add(&current->run_list, array->queue[i].next);
9374 + __set_bit(i, array->bitmap);
9377 + spin_unlock_irq(&rq->lock);
9384 @@ -1113,14 +1444,13 @@
9388 - set_current_state(TASK_RUNNING);
9389 + __set_current_state(TASK_RUNNING);
9394 void __cond_resched(void)
9396 - set_current_state(TASK_RUNNING);
9397 + __set_current_state(TASK_RUNNING);
9401 @@ -1131,7 +1461,7 @@
9406 + ret = MAX_USER_RT_PRIO-1;
9410 @@ -1158,7 +1488,7 @@
9411 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
9414 - struct task_struct *p;
9416 int retval = -EINVAL;
9419 @@ -1168,8 +1498,8 @@
9420 read_lock(&tasklist_lock);
9421 p = find_process_by_pid(pid);
9423 - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
9425 + jiffies_to_timespec(p->policy & SCHED_FIFO ?
9426 + 0 : TASK_TIMESLICE(p), &t);
9427 read_unlock(&tasklist_lock);
9429 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
9430 @@ -1177,14 +1507,14 @@
9434 -static void show_task(struct task_struct * p)
9435 +static void show_task(task_t * p)
9437 unsigned long free = 0;
9439 static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
9441 printk("%-13.13s ", p->comm);
9442 - state = p->state ? ffz(~p->state) + 1 : 0;
9443 + state = p->state ? __ffs(p->state) + 1 : 0;
9444 if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
9445 printk(stat_nam[state]);
9447 @@ -1225,7 +1555,7 @@
9448 printk(" (NOTLB)\n");
9451 - extern void show_trace_task(struct task_struct *tsk);
9452 + extern void show_trace_task(task_t *tsk);
9456 @@ -1247,7 +1577,7 @@
9458 void show_state(void)
9460 - struct task_struct *p;
9463 #if (BITS_PER_LONG == 32)
9465 @@ -1270,128 +1600,280 @@
9466 read_unlock(&tasklist_lock);
9470 - * reparent_to_init() - Reparent the calling kernel thread to the init task.
9472 - * If a kernel thread is launched as a result of a system call, or if
9473 - * it ever exits, it should generally reparent itself to init so that
9474 - * it is correctly cleaned up on exit.
9476 + * double_rq_lock - safely lock two runqueues
9478 - * The various task state such as scheduling policy and priority may have
9479 - * been inherited fro a user process, so we reset them to sane values here.
9480 + * Note this does not disable interrupts like task_rq_lock;
9481 + * you need to do so manually before calling.
9483 +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
9486 + spin_lock(&rq1->lock);
9489 + spin_lock(&rq1->lock);
9490 + spin_lock(&rq2->lock);
9492 + spin_lock(&rq2->lock);
9493 + spin_lock(&rq1->lock);
9499 + * double_rq_unlock - safely unlock two runqueues
9501 - * NOTE that reparent_to_init() gives the caller full capabilities.
9502 + * Note this does not restore interrupts like task_rq_unlock;
9503 + * you need to do so manually after calling.
9505 -void reparent_to_init(void)
9506 +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
9508 - struct task_struct *this_task = current;
9509 + spin_unlock(&rq1->lock);
9511 + spin_unlock(&rq2->lock);
9514 - write_lock_irq(&tasklist_lock);
9515 +void __init init_idle(task_t *idle, int cpu)
9517 + runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(idle->cpu);
9518 + unsigned long flags;
9520 - /* Reparent to init */
9521 - REMOVE_LINKS(this_task);
9522 - this_task->p_pptr = child_reaper;
9523 - this_task->p_opptr = child_reaper;
9524 - SET_LINKS(this_task);
9525 + __save_flags(flags);
9527 + double_rq_lock(idle_rq, rq);
9529 + idle_rq->curr = idle_rq->idle = idle;
9530 + deactivate_task(idle, rq);
9531 + idle->array = NULL;
9532 + idle->prio = MAX_PRIO;
9533 + idle->state = TASK_RUNNING;
9535 + double_rq_unlock(idle_rq, rq);
9536 + set_tsk_need_resched(idle);
9537 + __restore_flags(flags);
9540 +extern void init_timervecs(void);
9541 +extern void timer_bh(void);
9542 +extern void tqueue_bh(void);
9543 +extern void immediate_bh(void);
9545 +void __init sched_init(void)
9550 + for (i = 0; i < NR_CPUS; i++) {
9551 + prio_array_t *array;
9553 - /* Set the exit signal to SIGCHLD so we signal init on exit */
9554 - this_task->exit_signal = SIGCHLD;
9556 + rq->active = rq->arrays;
9557 + rq->expired = rq->arrays + 1;
9558 + spin_lock_init(&rq->lock);
9560 + INIT_LIST_HEAD(&rq->migration_queue);
9563 - /* We also take the runqueue_lock while altering task fields
9564 - * which affect scheduling decisions */
9565 - spin_lock(&runqueue_lock);
9566 + for (j = 0; j < 2; j++) {
9567 + array = rq->arrays + j;
9568 + for (k = 0; k < MAX_PRIO; k++) {
9569 + INIT_LIST_HEAD(array->queue + k);
9570 + __clear_bit(k, array->bitmap);
9572 + // delimiter for bitsearch
9573 + __set_bit(MAX_PRIO, array->bitmap);
9577 + * We have to do a little magic to get the first
9578 + * process right in SMP mode.
9581 + rq->curr = current;
9582 + rq->idle = current;
9583 + current->cpu = smp_processor_id();
9584 + wake_up_process(current);
9586 - this_task->ptrace = 0;
9587 - this_task->nice = DEF_NICE;
9588 - this_task->policy = SCHED_OTHER;
9589 - /* cpus_allowed? */
9590 - /* rt_priority? */
9592 - this_task->cap_effective = CAP_INIT_EFF_SET;
9593 - this_task->cap_inheritable = CAP_INIT_INH_SET;
9594 - this_task->cap_permitted = CAP_FULL_SET;
9595 - this_task->keep_capabilities = 0;
9596 - memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
9597 - switch_uid(INIT_USER);
9599 + init_bh(TIMER_BH, timer_bh);
9600 + init_bh(TQUEUE_BH, tqueue_bh);
9601 + init_bh(IMMEDIATE_BH, immediate_bh);
9603 - spin_unlock(&runqueue_lock);
9604 - write_unlock_irq(&tasklist_lock);
9606 + * The boot idle thread does lazy MMU switching as well:
9608 + atomic_inc(&init_mm.mm_count);
9609 + enter_lazy_tlb(&init_mm, current, smp_processor_id());
9615 - * Put all the gunge required to become a kernel thread without
9616 - * attached user resources in one place where it belongs.
9618 + * This is how migration works:
9620 + * 1) we queue a migration_req_t structure in the source CPU's
9621 + * runqueue and wake up that CPU's migration thread.
9622 + * 2) we wait on the request's completion => thread blocks.
9623 + * 3) migration thread wakes up (implicitly it forces the migrated
9624 + * thread off the CPU)
9625 + * 4) it gets the migration request and checks whether the migrated
9626 + * task is still in the wrong runqueue.
9627 + * 5) if it's in the wrong runqueue then the migration thread removes
9628 + * it and puts it into the right queue.
9629 + * 6) migration thread signals the request's completion.
9630 + * 7) we wake up and the migration is done.
9634 + struct list_head list;
9636 + struct completion done;
9639 -void daemonize(void)
9641 + * Change a given task's CPU affinity. Migrate the process to a
9642 + * proper CPU and schedule it away if the CPU it's executing on
9643 + * is removed from the allowed bitmask.
9645 + * NOTE: the caller must have a valid reference to the task, the
9646 + * task must not exit() & deallocate itself prematurely. The
9647 + * call is not atomic; no spinlocks may be held.
9649 +void set_cpus_allowed(task_t *p, unsigned long new_mask)
9651 - struct fs_struct *fs;
9652 + unsigned long flags;
9653 + migration_req_t req;
9656 + new_mask &= cpu_online_map;
9660 + rq = task_rq_lock(p, &flags);
9661 + p->cpus_allowed = new_mask;
9663 - * If we were started as result of loading a module, close all of the
9664 - * user space pages. We don't need them, and if we didn't close them
9665 - * they would be locked into memory.
9666 + * Can the task run on the task's current CPU? If not then
9667 + * migrate the process off to a proper CPU.
9670 + if (new_mask & (1UL << p->cpu)) {
9671 + task_rq_unlock(rq, &flags);
9675 - current->session = 1;
9676 - current->pgrp = 1;
9677 - current->tty = NULL;
9679 + * If the task is not on a runqueue, then it is safe to
9680 + * simply update the task's cpu field.
9682 + if (!p->array && (p != rq->curr)) {
9683 + p->cpu = __ffs(p->cpus_allowed);
9684 + task_rq_unlock(rq, &flags);
9688 - /* Become as one with the init task */
9689 + init_completion(&req.done);
9691 + list_add(&req.list, &rq->migration_queue);
9692 + task_rq_unlock(rq, &flags);
9693 + wake_up_process(rq->migration_thread);
9695 - exit_fs(current); /* current->fs->count--; */
9696 - fs = init_task.fs;
9698 - atomic_inc(&fs->count);
9699 - exit_files(current);
9700 - current->files = init_task.files;
9701 + atomic_inc(&current->files->count);
9702 + wait_for_completion(&req.done);
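The typical caller, then, looks like the ksoftirqd change later in this patch: a kernel thread pins itself and may sleep while the migration thread moves it. A sketch, kernel context only (not compilable stand-alone; my_kthread and its work loop are hypothetical):

/* Sketch: a 2.4 kernel thread binding itself to one CPU via the new
 * set_cpus_allowed(). Mirrors the ksoftirqd pattern further down. */
static int my_kthread(void *bind_cpu)
{
	int cpu = cpu_logical_map((int) (long) bind_cpu);

	/* May sleep waiting for the migration thread, so this must
	 * be called without any spinlocks held. */
	set_cpus_allowed(current, 1UL << cpu);

	for (;;) {
		/* per-CPU work, now guaranteed to run on 'cpu' */
		current->state = TASK_INTERRUPTIBLE;
		schedule();
	}
	return 0;
}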
9705 -extern unsigned long wait_init_idle;
9706 +static __initdata int master_migration_thread;
9708 -void __init init_idle(void)
9709 +static int migration_thread(void * bind_cpu)
9711 - struct schedule_data * sched_data;
9712 - sched_data = &aligned_data[smp_processor_id()].schedule_data;
9713 + int cpu = cpu_logical_map((int) (long) bind_cpu);
9714 + struct sched_param param = { sched_priority: MAX_RT_PRIO-1 };
9718 - if (current != &init_task && task_on_runqueue(current)) {
9719 - printk("UGH! (%d:%d) was on the runqueue, removing.\n",
9720 - smp_processor_id(), current->pid);
9721 - del_from_runqueue(current);
9723 + sigfillset(&current->blocked);
9724 + set_fs(KERNEL_DS);
9726 + * The first migration thread is started on the boot CPU; it
9727 + * migrates the other migration threads to their destination CPUs.
9729 + if (cpu != master_migration_thread) {
9730 + while (!cpu_rq(master_migration_thread)->migration_thread)
9732 + set_cpus_allowed(current, 1UL << cpu);
9734 - sched_data->curr = current;
9735 - sched_data->last_schedule = get_cycles();
9736 - clear_bit(current->processor, &wait_init_idle);
9738 + printk("migration_task %d on cpu=%d\n", cpu, smp_processor_id());
9739 + ret = setscheduler(0, SCHED_FIFO, ¶m);
9741 -extern void init_timervecs (void);
9743 + rq->migration_thread = current;
9745 -void __init sched_init(void)
9748 - * We have to do a little magic to get the first
9749 - * process right in SMP mode.
9751 - int cpu = smp_processor_id();
9753 + sprintf(current->comm, "migration_CPU%d", smp_processor_id());
9755 - init_task.processor = cpu;
9757 + runqueue_t *rq_src, *rq_dest;
9758 + struct list_head *head;
9759 + int cpu_src, cpu_dest;
9760 + migration_req_t *req;
9761 + unsigned long flags;
9764 - for(nr = 0; nr < PIDHASH_SZ; nr++)
9765 - pidhash[nr] = NULL;
9766 + spin_lock_irqsave(&rq->lock, flags);
9767 + head = &rq->migration_queue;
9768 + current->state = TASK_INTERRUPTIBLE;
9769 + if (list_empty(head)) {
9770 + spin_unlock_irqrestore(&rq->lock, flags);
9774 + req = list_entry(head->next, migration_req_t, list);
9775 + list_del_init(head->next);
9776 + spin_unlock_irqrestore(&rq->lock, flags);
9779 + cpu_dest = __ffs(p->cpus_allowed);
9780 + rq_dest = cpu_rq(cpu_dest);
9783 + rq_src = cpu_rq(cpu_src);
9785 + local_irq_save(flags);
9786 + double_rq_lock(rq_src, rq_dest);
9787 + if (p->cpu != cpu_src) {
9788 + double_rq_unlock(rq_src, rq_dest);
9789 + local_irq_restore(flags);
9792 + if (rq_src == rq) {
9793 + p->cpu = cpu_dest;
9795 + deactivate_task(p, rq_src);
9796 + activate_task(p, rq_dest);
9799 + double_rq_unlock(rq_src, rq_dest);
9800 + local_irq_restore(flags);
9803 + complete(&req->done);
9807 - init_bh(TIMER_BH, timer_bh);
9808 - init_bh(TQUEUE_BH, tqueue_bh);
9809 - init_bh(IMMEDIATE_BH, immediate_bh);
9810 +void __init migration_init(void)
9815 - * The boot idle thread does lazy MMU switching as well:
9817 - atomic_inc(&init_mm.mm_count);
9818 - enter_lazy_tlb(&init_mm, current, cpu);
9819 + master_migration_thread = smp_processor_id();
9820 + current->cpus_allowed = 1UL << master_migration_thread;
9822 + for (cpu = 0; cpu < smp_num_cpus; cpu++) {
9823 + if (kernel_thread(migration_thread, (void *) (long) cpu,
9824 + CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
9827 + current->cpus_allowed = -1L;
9829 + for (cpu = 0; cpu < smp_num_cpus; cpu++)
9830 + while (!cpu_rq(cpu_logical_map(cpu))->migration_thread)
9831 + schedule_timeout(2);
9834 +#endif /* CONFIG_SMP */
9835 diff -urN linux-2.4.22.org/kernel/signal.c linux-2.4.22/kernel/signal.c
9836 --- linux-2.4.22.org/kernel/signal.c 2003-11-24 18:28:15.000000000 +0100
9837 +++ linux-2.4.22/kernel/signal.c 2003-11-24 18:39:03.000000000 +0100
9838 @@ -507,12 +507,9 @@
9839 * process of changing - but no harm is done by that
9840 * other than doing an extra (lightweight) IPI interrupt.
9842 - spin_lock(&runqueue_lock);
9843 - if (task_has_cpu(t) && t->processor != smp_processor_id())
9844 - smp_send_reschedule(t->processor);
9845 - spin_unlock(&runqueue_lock);
9846 -#endif /* CONFIG_SMP */
9848 + if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
9849 + kick_if_running(t);
9851 if (t->state & TASK_INTERRUPTIBLE) {
9854 diff -urN linux-2.4.22.org/kernel/softirq.c linux-2.4.22/kernel/softirq.c
9855 --- linux-2.4.22.org/kernel/softirq.c 2003-11-24 18:28:15.000000000 +0100
9856 +++ linux-2.4.22/kernel/softirq.c 2003-11-24 18:39:03.000000000 +0100
9857 @@ -364,13 +364,13 @@
9858 int cpu = cpu_logical_map(bind_cpu);
9861 - current->nice = 19;
9862 + set_user_nice(current, 19);
9863 sigfillset(&current->blocked);
9865 /* Migrate to the right CPU */
9866 - current->cpus_allowed = 1UL << cpu;
9867 - while (smp_processor_id() != cpu)
9869 + set_cpus_allowed(current, 1UL << cpu);
9873 sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
9879 -static __init int spawn_ksoftirqd(void)
9880 +__init int spawn_ksoftirqd(void)
9884 diff -urN linux-2.4.22.org/kernel/sys.c linux-2.4.22/kernel/sys.c
9885 --- linux-2.4.22.org/kernel/sys.c 2003-11-24 18:28:15.000000000 +0100
9886 +++ linux-2.4.22/kernel/sys.c 2003-11-24 18:39:03.000000000 +0100
9887 @@ -239,10 +239,10 @@
9889 if (error == -ESRCH)
9891 - if (niceval < p->nice && !capable(CAP_SYS_NICE))
9892 + if (niceval < task_nice(p) && !capable(CAP_SYS_NICE))
9895 - p->nice = niceval;
9896 + set_user_nice(p, niceval);
9898 read_unlock(&tasklist_lock);
9902 if (!proc_sel(p, which, who))
9904 - niceval = 20 - p->nice;
9905 + niceval = 20 - task_nice(p);
9906 if (niceval > retval)
9909 diff -urN linux-2.4.22.org/kernel/timer.c linux-2.4.22/kernel/timer.c
9910 --- linux-2.4.22.org/kernel/timer.c 2003-11-24 18:28:15.000000000 +0100
9911 +++ linux-2.4.22/kernel/timer.c 2003-11-24 18:39:03.000000000 +0100
9914 #include <asm/uaccess.h>
9916 +struct kernel_stat kstat;
9919 * Timekeeping variables
9921 @@ -598,25 +600,7 @@
9922 int cpu = smp_processor_id(), system = user_tick ^ 1;
9924 update_one_process(p, user_tick, system, cpu);
9926 - if (--p->counter <= 0) {
9929 - * SCHED_FIFO is priority preemption, so this is
9930 - * not the place to decide whether to reschedule a
9931 - * SCHED_FIFO task or not - Bhavesh Davda
9933 - if (p->policy != SCHED_FIFO) {
9934 - p->need_resched = 1;
9938 - kstat.per_cpu_nice[cpu] += user_tick;
9940 - kstat.per_cpu_user[cpu] += user_tick;
9941 - kstat.per_cpu_system[cpu] += system;
9942 - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
9943 - kstat.per_cpu_system[cpu] += system;
9944 + scheduler_tick(user_tick, system);
9948 @@ -624,17 +608,7 @@
9950 static unsigned long count_active_tasks(void)
9952 - struct task_struct *p;
9953 - unsigned long nr = 0;
9955 - read_lock(&tasklist_lock);
9956 - for_each_task(p) {
9957 - if ((p->state == TASK_RUNNING ||
9958 - (p->state & TASK_UNINTERRUPTIBLE)))
9961 - read_unlock(&tasklist_lock);
9963 + return (nr_running() + nr_uninterruptible()) * FIXED_1;
9967 @@ -827,6 +801,89 @@
9971 +static void process_timeout(unsigned long __data)
9973 + wake_up_process((task_t *)__data);
9977 + * schedule_timeout - sleep until timeout
9978 + * @timeout: timeout value in jiffies
9980 + * Make the current task sleep until @timeout jiffies have
9981 + * elapsed. The routine will return immediately unless
9982 + * the current task state has been set (see set_current_state()).
9984 + * You can set the task state as follows -
9986 + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
9987 + * pass before the routine returns. The routine will return 0
9989 + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
9990 + * delivered to the current task. In this case the remaining time
9991 + * in jiffies will be returned, or 0 if the timer expired in time
9993 + * The current task state is guaranteed to be TASK_RUNNING when this
9994 + * routine returns.
9996 + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
9997 + * the CPU away without a bound on the timeout. In this case the return
9998 + * value will be %MAX_SCHEDULE_TIMEOUT.
10000 + * In all cases the return value is guaranteed to be non-negative.
10002 +signed long schedule_timeout(signed long timeout)
10004 + struct timer_list timer;
10005 + unsigned long expire;
10009 + case MAX_SCHEDULE_TIMEOUT:
10011 + * These two special cases are useful to be comfortable
10012 + * in the caller. Nothing more. We could take
10013 + * MAX_SCHEDULE_TIMEOUT from one of the negative values
10014 + * but I'd like to return a valid offset (>=0) to allow
10015 + * the caller to do everything it wants with the retval.
10021 + * Another bit of PARANOID. Note that the retval will be
10022 + * 0 since no piece of kernel is supposed to check
10023 + * for a negative retval of schedule_timeout() (since it
10024 + * should never happen anyway). You just have the printk()
10025 + * that will tell you if something has gone wrong and where.
10029 + printk(KERN_ERR "schedule_timeout: wrong timeout "
10030 + "value %lx from %p\n", timeout,
10031 + __builtin_return_address(0));
10032 + current->state = TASK_RUNNING;
10037 + expire = timeout + jiffies;
10039 + init_timer(&timer);
10040 + timer.expires = expire;
10041 + timer.data = (unsigned long) current;
10042 + timer.function = process_timeout;
10044 + add_timer(&timer);
10046 + del_timer_sync(&timer);
10048 + timeout = expire - jiffies;
10051 + return timeout < 0 ? 0 : timeout;
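The kernel-doc above implies the canonical calling sequence: set the task state first, then call; otherwise the routine returns immediately. A sketch, kernel context only (the helper name is hypothetical):

/* Sketch: canonical schedule_timeout() usage per its kernel-doc. */
static signed long interruptible_nap(void)
{
	signed long remaining;

	set_current_state(TASK_INTERRUPTIBLE);
	remaining = schedule_timeout(HZ / 2);	/* about half a second */

	/* 0: the timer expired; nonzero: a signal arrived with that
	 * many jiffies left. Either way we are TASK_RUNNING again. */
	return remaining;
}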
10054 /* Thread ID - the internal kernel "pid" */
10055 asmlinkage long sys_gettid(void)
10057 @@ -873,4 +930,3 @@
10062 diff -urN linux-2.4.22.org/mm/oom_kill.c linux-2.4.22/mm/oom_kill.c
10063 --- linux-2.4.22.org/mm/oom_kill.c 2003-11-24 18:28:16.000000000 +0100
10064 +++ linux-2.4.22/mm/oom_kill.c 2003-11-24 18:39:03.000000000 +0100
10066 * Niced processes are most likely less important, so double
10067 * their badness points.
10070 + if (task_nice(p) > 0)
10074 @@ -152,7 +152,7 @@
10075 * all the memory it needs. That way it should be able to
10076 * exit() and clear out its resources quickly...
10078 - p->counter = 5 * HZ;
10079 + p->time_slice = HZ;
10081 /* This process has hardware access, be more careful. */
10082 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
10083 diff -urN linux-2.4.22.org/net/bluetooth/bnep/core.c linux-2.4.22/net/bluetooth/bnep/core.c
10084 --- linux-2.4.22.org/net/bluetooth/bnep/core.c 2003-11-24 18:28:39.000000000 +0100
10085 +++ linux-2.4.22/net/bluetooth/bnep/core.c 2003-11-24 18:39:03.000000000 +0100
10086 @@ -460,7 +460,7 @@
10087 sigfillset(&current->blocked);
10088 flush_signals(current);
10090 - current->nice = -15;
10091 + set_user_nice(current, -15);
10095 diff -urN linux-2.4.22.org/net/bluetooth/cmtp/core.c linux-2.4.22/net/bluetooth/cmtp/core.c
10096 --- linux-2.4.22.org/net/bluetooth/cmtp/core.c 2003-11-24 18:28:38.000000000 +0100
10097 +++ linux-2.4.22/net/bluetooth/cmtp/core.c 2003-11-24 18:39:03.000000000 +0100
10098 @@ -298,7 +298,7 @@
10099 sigfillset(&current->blocked);
10100 flush_signals(current);
10102 - current->nice = -15;
10103 + set_user_nice(current, -15);