1 Scalable timer implementation. Lock per-CPU instead of global.
2 Author: Ingo Molnar <mingo@elte.hu>
3 URL: http://redhat.com/~mingo/scalable-timers-patches/
5 --- linux/kernel/timer.c.orig Tue Nov 27 13:29:34 2001
6 +++ linux/kernel/timer.c Tue Nov 27 13:30:37 2001
8 * serialize accesses to xtime/lost_ticks).
9 * Copyright (C) 1998 Andrea Arcangeli
10 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
11 + * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
12 + * Copyright (C) 2000 Ingo Molnar
13 + * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
16 #include <linux/config.h>
19 +#include <linux/init.h>
20 #include <linux/timex.h>
21 #include <linux/delay.h>
22 #include <linux/smp_lock.h>
24 unsigned long prof_len;
25 unsigned long prof_shift;
32 -#define TVN_SIZE (1 << TVN_BITS)
33 -#define TVR_SIZE (1 << TVR_BITS)
34 -#define TVN_MASK (TVN_SIZE - 1)
35 -#define TVR_MASK (TVR_SIZE - 1)
39 - struct list_head vec[TVN_SIZE];
42 -struct timer_vec_root {
44 - struct list_head vec[TVR_SIZE];
47 -static struct timer_vec tv5;
48 -static struct timer_vec tv4;
49 -static struct timer_vec tv3;
50 -static struct timer_vec tv2;
51 -static struct timer_vec_root tv1;
53 -static struct timer_vec * const tvecs[] = {
54 - (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
57 -static struct list_head * run_timer_list_running;
59 -#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
61 -void init_timervecs (void)
64 +tvec_base_t tvec_bases[NR_CPUS];
66 - for (i = 0; i < TVN_SIZE; i++) {
67 - INIT_LIST_HEAD(tv5.vec + i);
68 - INIT_LIST_HEAD(tv4.vec + i);
69 - INIT_LIST_HEAD(tv3.vec + i);
70 - INIT_LIST_HEAD(tv2.vec + i);
72 - for (i = 0; i < TVR_SIZE; i++)
73 - INIT_LIST_HEAD(tv1.vec + i);
75 +/* jiffies at the most recent update of wall time */
76 +unsigned long wall_jiffies;
78 -static unsigned long timer_jiffies;
80 + * This spinlock protects us from races in SMP while playing with xtime. -arca
82 +rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
84 -static inline void internal_add_timer(struct timer_list *timer)
86 + * This is the 'global' timer BH. This gets called only if one of
87 + * the local timer interrupts couldn't run timers.
89 +static inline void internal_add_timer(tvec_base_t *base, timer_t *timer)
92 * must be cli-ed when calling this
94 unsigned long expires = timer->expires;
95 - unsigned long idx = expires - timer_jiffies;
96 + unsigned long idx = expires - base->timer_jiffies;
97 struct list_head * vec;
99 - if (run_timer_list_running)
100 - vec = run_timer_list_running;
101 - else if (idx < TVR_SIZE) {
102 + if (idx < TVR_SIZE) {
103 int i = expires & TVR_MASK;
105 + vec = base->tv1.vec + i;
106 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
107 int i = (expires >> TVR_BITS) & TVN_MASK;
109 + vec = base->tv2.vec + i;
110 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
111 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
113 + vec = base->tv3.vec + i;
114 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
115 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
117 + vec = base->tv4.vec + i;
118 } else if ((signed long) idx < 0) {
119 /* can happen if you add a timer with expires == jiffies,
120 * or you set a timer to go off in the past
122 - vec = tv1.vec + tv1.index;
123 + vec = base->tv1.vec + base->tv1.index;
124 } else if (idx <= 0xffffffffUL) {
125 int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
127 + vec = base->tv5.vec + i;
129 /* Can only get here on architectures with 64-bit jiffies */
130 INIT_LIST_HEAD(&timer->list);
131 @@ -165,37 +136,27 @@
132 list_add(&timer->list, vec->prev);
135 -/* Initialize both explicitly - let's try to have them in the same cache line */
136 -spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
139 -volatile struct timer_list * volatile running_timer;
140 -#define timer_enter(t) do { running_timer = t; mb(); } while (0)
141 -#define timer_exit() do { running_timer = NULL; } while (0)
142 -#define timer_is_running(t) (running_timer == t)
143 -#define timer_synchronize(t) while (timer_is_running(t)) barrier()
145 -#define timer_enter(t) do { } while (0)
146 -#define timer_exit() do { } while (0)
149 -void add_timer(struct timer_list *timer)
150 +void add_timer(timer_t *timer)
152 + tvec_base_t * base = tvec_bases + smp_processor_id();
155 - spin_lock_irqsave(&timerlist_lock, flags);
157 + CHECK_BASE(timer->base);
158 + spin_lock_irqsave(&base->lock, flags);
159 if (timer_pending(timer))
161 - internal_add_timer(timer);
162 - spin_unlock_irqrestore(&timerlist_lock, flags);
163 + internal_add_timer(base, timer);
164 + timer->base = base;
165 + spin_unlock_irqrestore(&base->lock, flags);
168 - spin_unlock_irqrestore(&timerlist_lock, flags);
169 + spin_unlock_irqrestore(&base->lock, flags);
170 printk("bug: kernel timer added twice at %p.\n",
171 __builtin_return_address(0));
174 -static inline int detach_timer (struct timer_list *timer)
175 +static inline int detach_timer(timer_t *timer)
177 if (!timer_pending(timer))
179 @@ -203,28 +164,81 @@
183 -int mod_timer(struct timer_list *timer, unsigned long expires)
185 + * mod_timer() has subtle locking semantics because parallel
186 + * calls to it must happen serialized.
188 +int mod_timer(timer_t *timer, unsigned long expires)
191 + tvec_base_t *old_base, *new_base;
195 + new_base = tvec_bases + smp_processor_id();
196 + CHECK_BASE(new_base);
198 + __save_flags(flags);
201 + old_base = timer->base;
202 + CHECK_BASE(old_base);
205 + * Prevent deadlocks via ordering by old_base < new_base.
207 + if (old_base && (new_base != old_base)) {
208 + if (old_base < new_base) {
209 + spin_lock(&new_base->lock);
210 + spin_lock(&old_base->lock);
212 + spin_lock(&old_base->lock);
213 + spin_lock(&new_base->lock);
216 + * Subtle, we rely on timer->base being always
217 + * valid and being updated atomically.
219 + if (timer->base != old_base) {
220 + spin_unlock(&new_base->lock);
221 + spin_unlock(&old_base->lock);
225 + spin_lock(&new_base->lock);
227 - spin_lock_irqsave(&timerlist_lock, flags);
228 timer->expires = expires;
229 ret = detach_timer(timer);
230 - internal_add_timer(timer);
231 - spin_unlock_irqrestore(&timerlist_lock, flags);
232 + internal_add_timer(new_base, timer);
233 + timer->base = new_base;
236 + if (old_base && (new_base != old_base))
237 + spin_unlock(&old_base->lock);
238 + spin_unlock_irqrestore(&new_base->lock, flags);
243 -int del_timer(struct timer_list * timer)
244 +int del_timer(timer_t * timer)
248 + tvec_base_t * base;
251 - spin_lock_irqsave(&timerlist_lock, flags);
252 + CHECK_BASE(timer->base);
256 + base = timer->base;
257 + spin_lock_irqsave(&base->lock, flags);
258 + if (base != timer->base) {
259 + spin_unlock_irqrestore(&base->lock, flags);
262 ret = detach_timer(timer);
263 timer->list.next = timer->list.prev = NULL;
264 - spin_unlock_irqrestore(&timerlist_lock, flags);
265 + spin_unlock_irqrestore(&base->lock, flags);
270 @@ -242,24 +256,34 @@
271 * (for reference counting).
274 -int del_timer_sync(struct timer_list * timer)
275 +int del_timer_sync(timer_t * timer)
277 + tvec_base_t * base;
280 + CHECK_BASE(timer->base);
287 - spin_lock_irqsave(&timerlist_lock, flags);
289 + base = timer->base;
290 + spin_lock_irqsave(&base->lock, flags);
291 + if (base != timer->base) {
292 + spin_unlock_irqrestore(&base->lock, flags);
295 ret += detach_timer(timer);
296 timer->list.next = timer->list.prev = 0;
297 - running = timer_is_running(timer);
298 - spin_unlock_irqrestore(&timerlist_lock, flags);
299 + running = timer_is_running(base, timer);
300 + spin_unlock_irqrestore(&base->lock, flags);
305 - timer_synchronize(timer);
306 + timer_synchronize(base, timer);
314 -static inline void cascade_timers(struct timer_vec *tv)
315 +static void cascade(tvec_base_t *base, tvec_t *tv)
317 /* cascade all the timers from tv up one level */
318 struct list_head *head, *curr, *next;
319 @@ -279,66 +303,68 @@
320 * detach them individually, just clear the list afterwards.
322 while (curr != head) {
323 - struct timer_list *tmp;
326 - tmp = list_entry(curr, struct timer_list, list);
327 + tmp = list_entry(curr, timer_t, list);
328 + CHECK_BASE(tmp->base);
329 + if (tmp->base != base)
332 list_del(curr); // not needed
333 - internal_add_timer(tmp);
334 + internal_add_timer(base, tmp);
337 INIT_LIST_HEAD(head);
338 tv->index = (tv->index + 1) & TVN_MASK;
341 -static inline void run_timer_list(void)
342 +static void __run_timers(tvec_base_t *base)
344 - spin_lock_irq(&timerlist_lock);
345 - while ((long)(jiffies - timer_jiffies) >= 0) {
347 + unsigned long flags;
349 + spin_lock_irqsave(&base->lock, flags);
350 + while ((long)(jiffies - base->timer_jiffies) >= 0) {
351 struct list_head *head, *curr;
355 - cascade_timers(tvecs[n]);
356 - } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
361 + if (!base->tv1.index) {
362 + cascade(base, &base->tv2);
363 + if (base->tv2.index == 1) {
364 + cascade(base, &base->tv3);
365 + if (base->tv3.index == 1) {
366 + cascade(base, &base->tv4);
367 + if (base->tv4.index == 1)
368 + cascade(base, &base->tv5);
372 - run_timer_list_running = &queued;
374 - head = tv1.vec + tv1.index;
375 + head = base->tv1.vec + base->tv1.index;
378 - struct timer_list *timer;
379 void (*fn)(unsigned long);
383 - timer = list_entry(curr, struct timer_list, list);
384 + timer = list_entry(curr, timer_t, list);
385 fn = timer->function;
387 + data = timer->data;
390 timer->list.next = timer->list.prev = NULL;
391 - timer_enter(timer);
392 - spin_unlock_irq(&timerlist_lock);
393 + timer_enter(base, timer);
394 + spin_unlock_irq(&base->lock);
396 - spin_lock_irq(&timerlist_lock);
398 + spin_lock_irq(&base->lock);
402 - run_timer_list_running = NULL;
404 - tv1.index = (tv1.index + 1) & TVR_MASK;
406 - curr = queued.next;
407 - while (curr != &queued) {
408 - struct timer_list *timer;
410 - timer = list_entry(curr, struct timer_list, list);
412 - internal_add_timer(timer);
414 + ++base->timer_jiffies;
415 + base->tv1.index = (base->tv1.index + 1) & TVR_MASK;
417 - spin_unlock_irq(&timerlist_lock);
418 + spin_unlock_irqrestore(&base->lock, flags);
421 spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;
422 @@ -632,42 +671,76 @@
426 -/* jiffies at the most recent update of wall time */
427 -unsigned long wall_jiffies;
428 +static void run_all_timers(void)
432 + for (i = 0; i < smp_num_cpus; i++) {
433 + tvec_base_t *base = tvec_bases + i;
434 + if ((long)(jiffies - base->timer_jiffies) >= 0)
435 + __run_timers(base);
440 - * This spinlock protect us from races in SMP while playing with xtime. -arca
441 + * Called by the local, per-CPU timer interrupt on SMP.
443 + * This function has to do all sorts of locking to make legacy
444 + * cli()-users and BH-disablers work. If locking doesnt succeed
445 + * now then we fall back to TIMER_BH.
447 -rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
448 +void run_local_timers(void)
450 + int cpu = smp_processor_id();
451 + tvec_base_t *base = tvec_bases + cpu;
453 + if (in_interrupt())
456 + local_bh_disable();
457 + local_irq_disable();
458 + if (!spin_trylock(&global_bh_lock))
459 + goto out_enable_mark;
461 + if (!hardirq_trylock(cpu))
462 + goto out_unlock_enable_mark;
464 -static inline void update_times(void)
465 + if ((long)(jiffies - base->timer_jiffies) >= 0)
466 + __run_timers(base);
468 + hardirq_endlock(cpu);
469 + spin_unlock(&global_bh_lock);
470 + local_irq_enable();
474 +out_unlock_enable_mark:
475 + spin_unlock(&global_bh_lock);
478 + local_irq_enable();
486 + * Called by the timer interrupt. xtime_lock must already be taken
487 + * by the timer IRQ!
489 +static void update_times(void)
494 - * update_times() is run from the raw timer_bh handler so we
495 - * just know that the irqs are locally enabled and so we don't
496 - * need to save/restore the flags of the local CPU here. -arca
498 - write_lock_irq(&xtime_lock);
501 ticks = jiffies - wall_jiffies;
503 wall_jiffies += ticks;
504 update_wall_time(ticks);
507 - write_unlock_irq(&xtime_lock);
517 void do_timer(struct pt_regs *regs)
519 (*(unsigned long *)&jiffies)++;
521 /* SMP process accounting uses the local APIC timer */
523 update_process_times(user_mode(regs));
529 + * Right now only x86-SMP calls run_local_timers() from a
530 + * per-CPU interrupt.
536 if (TQ_ACTIVE(tq_timer))
539 @@ -938,3 +1022,23 @@
544 +void __init init_timers(void)
548 + for (i = 0; i < NR_CPUS; i++) {
549 + tvec_base_t *base = tvec_bases + i;
551 + spin_lock_init(&base->lock);
552 + for (j = 0; j < TVN_SIZE; j++) {
553 + INIT_LIST_HEAD(base->tv5.vec + j);
554 + INIT_LIST_HEAD(base->tv4.vec + j);
555 + INIT_LIST_HEAD(base->tv3.vec + j);
556 + INIT_LIST_HEAD(base->tv2.vec + j);
558 + for (j = 0; j < TVR_SIZE; j++)
559 + INIT_LIST_HEAD(base->tv1.vec + j);
561 + init_bh(TIMER_BH, run_all_timers);
563 --- linux/kernel/sched.c.orig Tue Nov 27 13:29:39 2001
564 +++ linux/kernel/sched.c Tue Nov 27 13:30:37 2001
565 @@ -1749,7 +1749,6 @@
566 idle->preempt_count = (idle->lock_depth >= 0);
569 -extern void init_timervecs(void);
570 extern void timer_bh(void);
571 extern void tqueue_bh(void);
572 extern void immediate_bh(void);
573 @@ -1790,8 +1789,7 @@
574 current->cpu = smp_processor_id();
575 wake_up_process(current);
578 - init_bh(TIMER_BH, timer_bh);
580 init_bh(TQUEUE_BH, tqueue_bh);
581 init_bh(IMMEDIATE_BH, immediate_bh);
583 --- linux/kernel/ksyms.c.orig Tue Nov 27 13:29:39 2001
584 +++ linux/kernel/ksyms.c Tue Nov 27 13:30:37 2001
586 EXPORT_SYMBOL(del_timer_sync);
588 EXPORT_SYMBOL(mod_timer);
589 +EXPORT_SYMBOL(tvec_bases);
590 EXPORT_SYMBOL(tq_timer);
591 EXPORT_SYMBOL(tq_immediate);
593 --- linux/include/linux/timer.h.orig Tue Nov 27 13:29:37 2001
594 +++ linux/include/linux/timer.h Tue Nov 27 13:30:37 2001
596 #ifndef _LINUX_TIMER_H
597 #define _LINUX_TIMER_H
599 -#include <linux/config.h>
600 -#include <linux/list.h>
603 * In Linux 2.4, static timers have been removed from the kernel.
604 * Timers may be dynamically created and destroyed, and should be initialized
606 * timeouts. You can use this field to distinguish between the different
610 +#include <linux/config.h>
611 +#include <linux/smp.h>
612 +#include <linux/list.h>
613 +#include <linux/spinlock.h>
614 +#include <linux/threads.h>
621 +#define TVN_SIZE (1 << TVN_BITS)
622 +#define TVR_SIZE (1 << TVR_BITS)
623 +#define TVN_MASK (TVN_SIZE - 1)
624 +#define TVR_MASK (TVR_SIZE - 1)
626 +typedef struct tvec_s {
628 + struct list_head vec[TVN_SIZE];
631 +typedef struct tvec_root_s {
633 + struct list_head vec[TVR_SIZE];
636 +#define NOOF_TVECS 5
638 +typedef struct timer_list timer_t;
640 +typedef struct tvec_t_base_s {
642 + unsigned long timer_jiffies;
643 + volatile timer_t * volatile running_timer;
652 + * This is the new and improved way of handling timers.
654 + * The "data" field is in case you want to use the same
655 + * timeout function for several timeouts. You can use this
656 + * to distinguish between the different invocations.
659 struct list_head list;
660 unsigned long expires;
662 void (*function)(unsigned long);
666 -extern void add_timer(struct timer_list * timer);
667 -extern int del_timer(struct timer_list * timer);
668 +extern void add_timer(timer_t * timer);
669 +extern int del_timer(timer_t * timer);
672 -extern int del_timer_sync(struct timer_list * timer);
673 +extern int del_timer_sync(timer_t * timer);
674 extern void sync_timers(void);
675 +#define timer_enter(base, t) do { base->running_timer = t; mb(); } while (0)
676 +#define timer_exit(base) do { base->running_timer = NULL; } while (0)
677 +#define timer_is_running(base,t) (base->running_timer == t)
678 +#define timer_synchronize(base,t) while (timer_is_running(base,t)) barrier()
680 #define del_timer_sync(t) del_timer(t)
681 #define sync_timers() do { } while (0)
682 +#define timer_enter(base,t) do { } while (0)
683 +#define timer_exit(base) do { } while (0)
688 * If the timer is known to be not pending (ie, in the handler), mod_timer
689 * is less efficient than a->expires = b; add_timer(a).
691 -int mod_timer(struct timer_list *timer, unsigned long expires);
692 +int mod_timer(timer_t *timer, unsigned long expires);
694 extern void it_real_fn(unsigned long);
696 -static inline void init_timer(struct timer_list * timer)
697 +extern void init_timers(void);
698 +extern void run_local_timers(void);
700 +extern tvec_base_t tvec_bases[NR_CPUS];
702 +static inline void init_timer(timer_t * timer)
704 timer->list.next = timer->list.prev = NULL;
705 + timer->base = tvec_bases + 0;
708 -static inline int timer_pending (const struct timer_list * timer)
709 +#define TIMER_DEBUG 0
711 +# define CHECK_BASE(base) \
712 + if (base && ((base < tvec_bases) || (base >= tvec_bases + NR_CPUS))) \
715 +# define CHECK_BASE(base)
718 +static inline int timer_pending(const timer_t * timer)
720 + CHECK_BASE(timer->base);
721 return timer->list.next != NULL;
724 --- linux/include/linux/smp.h.orig Sun Dec 31 20:10:17 2000
725 +++ linux/include/linux/smp.h Tue Nov 27 13:30:37 2001
728 * These macros fold the SMP functionality into a single CPU system
733 #define smp_num_cpus 1
734 #define smp_processor_id() 0
735 #define hard_smp_processor_id() 0
736 --- linux/drivers/net/eepro100.c.orig Tue Nov 27 13:29:38 2001
737 +++ linux/drivers/net/eepro100.c Tue Nov 27 13:30:37 2001
738 @@ -1219,9 +1219,6 @@
739 /* We must continue to monitor the media. */
740 sp->timer.expires = RUN_AT(2*HZ); /* 2.0 sec. */
741 add_timer(&sp->timer);
742 -#if defined(timer_exit)
743 - timer_exit(&sp->timer);
747 static void speedo_show_state(struct net_device *dev)
748 --- linux/arch/i386/mm/fault.c.orig Tue Nov 27 13:29:31 2001
749 +++ linux/arch/i386/mm/fault.c Tue Nov 27 13:30:37 2001
750 @@ -104,16 +104,12 @@
754 -extern spinlock_t timerlist_lock;
757 * Unlock any spinlocks which will prevent us from getting the
758 - * message out (timerlist_lock is acquired through the
759 - * console unblank code)
762 void bust_spinlocks(int yes)
764 - spin_lock_init(&timerlist_lock);
766 oops_in_progress = 1;
768 --- linux/arch/i386/kernel/apic.c.orig Tue Nov 27 13:29:38 2001
769 +++ linux/arch/i386/kernel/apic.c Tue Nov 27 13:30:37 2001
770 @@ -1078,7 +1078,9 @@
772 smp_local_timer_interrupt(regs);
776 + run_local_timers();
778 if (softirq_pending(cpu))
781 --- linux-2.4.20/lib/bust_spinlocks.c.orig Mon Sep 17 06:22:40 2001
782 +++ linux-2.4.20/lib/bust_spinlocks.c Sun Mar 9 13:26:33 2003
784 #include <linux/wait.h>
785 #include <linux/vt_kern.h>
787 -extern spinlock_t timerlist_lock;
789 void bust_spinlocks(int yes)
791 - spin_lock_init(&timerlist_lock);
793 oops_in_progress = 1;