1 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/alpha/include/asm/spinlock_types.h linux-4.14/arch/alpha/include/asm/spinlock_types.h
2 --- linux-4.14.orig/arch/alpha/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
3 +++ linux-4.14/arch/alpha/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
5 #ifndef _ALPHA_SPINLOCK_TYPES_H
6 #define _ALPHA_SPINLOCK_TYPES_H
8 -#ifndef __LINUX_SPINLOCK_TYPES_H
9 -# error "please don't include this file directly"
13 volatile unsigned int lock;
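
This hunk drops the '#error "please don't include this file directly"' guard from the Alpha spinlock type header; the same removal is repeated for every architecture below (arm, arm64, blackfin, hexagon, ia64, m32r, metag, mn10300, powerpc, s390, sh, tile), because the RT series needs the raw arch_spinlock_t definition to be includable on its own, outside <linux/spinlock_types.h>. A minimal sketch of the resulting header shape, using the Alpha type shown above (illustrative only):

    /* asm/spinlock_types.h after the patch: no include-guard #error,
     * so RT's own spinlock headers can pull in the raw type directly. */
    typedef struct {
            volatile unsigned int lock;
    } arch_spinlock_t;
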
15 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/irq.h linux-4.14/arch/arm/include/asm/irq.h
16 --- linux-4.14.orig/arch/arm/include/asm/irq.h 2017-11-12 19:46:13.000000000 +0100
17 +++ linux-4.14/arch/arm/include/asm/irq.h 2018-09-05 11:05:07.000000000 +0200
22 +#include <linux/cpumask.h>
26 extern void migrate_irqs(void);
27 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/spinlock_types.h linux-4.14/arch/arm/include/asm/spinlock_types.h
28 --- linux-4.14.orig/arch/arm/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
29 +++ linux-4.14/arch/arm/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
31 #ifndef __ASM_SPINLOCK_TYPES_H
32 #define __ASM_SPINLOCK_TYPES_H
34 -#ifndef __LINUX_SPINLOCK_TYPES_H
35 -# error "please don't include this file directly"
38 #define TICKET_SHIFT 16
41 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/switch_to.h linux-4.14/arch/arm/include/asm/switch_to.h
42 --- linux-4.14.orig/arch/arm/include/asm/switch_to.h 2017-11-12 19:46:13.000000000 +0100
43 +++ linux-4.14/arch/arm/include/asm/switch_to.h 2018-09-05 11:05:07.000000000 +0200
46 #include <linux/thread_info.h>
48 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
49 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
52 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
56 * For v7 SMP cores running a preemptible kernel we may be pre-empted
57 * during a TLB maintenance operation, so execute an inner-shareable dsb
59 #define switch_to(prev,next,last) \
61 __complete_pending_tlbi(); \
62 + switch_kmaps(prev, next); \
63 last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
66 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/thread_info.h linux-4.14/arch/arm/include/asm/thread_info.h
67 --- linux-4.14.orig/arch/arm/include/asm/thread_info.h 2017-11-12 19:46:13.000000000 +0100
68 +++ linux-4.14/arch/arm/include/asm/thread_info.h 2018-09-05 11:05:07.000000000 +0200
71 unsigned long flags; /* low level flags */
72 int preempt_count; /* 0 => preemptable, <0 => bug */
73 + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
74 mm_segment_t addr_limit; /* address limit */
75 struct task_struct *task; /* main task structure */
78 #define TIF_SYSCALL_TRACE 4 /* syscall trace active */
79 #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
80 #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
81 -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
82 +#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
83 +#define TIF_NEED_RESCHED_LAZY 7
85 #define TIF_NOHZ 12 /* in adaptive nohz mode */
86 #define TIF_USING_IWMMXT 17
88 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
89 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
90 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
91 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
92 #define _TIF_UPROBE (1 << TIF_UPROBE)
93 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
94 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
96 * Change these and you break ASM code in entry-common.S
98 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
99 - _TIF_NOTIFY_RESUME | _TIF_UPROBE)
100 + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
101 + _TIF_NEED_RESCHED_LAZY)
103 #endif /* __KERNEL__ */
104 #endif /* __ASM_ARM_THREAD_INFO_H */
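
The new TIF_NEED_RESCHED_LAZY flag (bit 7, which is why TIF_SECCOMP moves to bit 8) carries a "lazy" reschedule request that only a fully preemptible context honours immediately, while TIF_NEED_RESCHED keeps forcing preemption as before; both bits are folded into _TIF_WORK_MASK so return-to-user still reschedules. A hedged C sketch of how exit code combines the two bits (the helper name is made up for illustration; the masks are the ones defined above):

    /* Illustrative only: a "should we reschedule?" test covering both the
     * hard and the lazy request, as the assembly paths below do. */
    static inline int need_resched_any(unsigned long ti_flags)
    {
            return ti_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY);
    }
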
105 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/Kconfig linux-4.14/arch/arm/Kconfig
106 --- linux-4.14.orig/arch/arm/Kconfig 2017-11-12 19:46:13.000000000 +0100
107 +++ linux-4.14/arch/arm/Kconfig 2018-09-05 11:05:07.000000000 +0200
109 select HARDIRQS_SW_RESEND
110 select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
111 select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
112 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
113 + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
114 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
115 select HAVE_ARCH_MMAP_RND_BITS if MMU
116 select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
118 select HAVE_PERF_EVENTS
119 select HAVE_PERF_REGS
120 select HAVE_PERF_USER_STACK_DUMP
121 + select HAVE_PREEMPT_LAZY
122 select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
123 select HAVE_REGS_AND_STACK_ACCESS_API
124 select HAVE_SYSCALL_TRACEPOINTS
125 @@ -2164,7 +2165,7 @@
127 config KERNEL_MODE_NEON
128 bool "Support for NEON in kernel mode"
129 - depends on NEON && AEABI
130 + depends on NEON && AEABI && !PREEMPT_RT_BASE
132 Say Y to include support for NEON in kernel mode.
134 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/asm-offsets.c linux-4.14/arch/arm/kernel/asm-offsets.c
135 --- linux-4.14.orig/arch/arm/kernel/asm-offsets.c 2017-11-12 19:46:13.000000000 +0100
136 +++ linux-4.14/arch/arm/kernel/asm-offsets.c 2018-09-05 11:05:07.000000000 +0200
139 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
140 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
141 + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
142 DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
143 DEFINE(TI_TASK, offsetof(struct thread_info, task));
144 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
145 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/entry-armv.S linux-4.14/arch/arm/kernel/entry-armv.S
146 --- linux-4.14.orig/arch/arm/kernel/entry-armv.S 2017-11-12 19:46:13.000000000 +0100
147 +++ linux-4.14/arch/arm/kernel/entry-armv.S 2018-09-05 11:05:07.000000000 +0200
148 @@ -220,11 +220,18 @@
150 #ifdef CONFIG_PREEMPT
151 ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
152 - ldr r0, [tsk, #TI_FLAGS] @ get flags
153 teq r8, #0 @ if preempt count != 0
154 + bne 1f @ return from exception
155 + ldr r0, [tsk, #TI_FLAGS] @ get flags
156 + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
157 + blne svc_preempt @ preempt!
159 + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
160 + teq r8, #0 @ if preempt lazy count != 0
161 movne r0, #0 @ force flags to 0
162 - tst r0, #_TIF_NEED_RESCHED
163 + tst r0, #_TIF_NEED_RESCHED_LAZY
168 svc_exit r5, irq = 1 @ return from exception
170 1: bl preempt_schedule_irq @ irq en/disable is done inside
171 ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
172 tst r0, #_TIF_NEED_RESCHED
174 + tst r0, #_TIF_NEED_RESCHED_LAZY
177 + ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
178 + teq r0, #0 @ if preempt lazy count != 0
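
In C terms, the patched CONFIG_PREEMPT block in the SVC IRQ return path now behaves roughly like the sketch below: a non-zero preempt count skips preemption entirely, a hard TIF_NEED_RESCHED always preempts, and a lazy request preempts only when the new per-task preempt_lazy_count is zero (a rough rendering of the assembly above, not kernel source):

    if (ti->preempt_count == 0) {
            if (ti->flags & _TIF_NEED_RESCHED)
                    svc_preempt();                  /* hard request */
            if (ti->preempt_lazy_count == 0 &&
                (ti->flags & _TIF_NEED_RESCHED_LAZY))
                    svc_preempt();                  /* lazy request */
    }
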
185 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/entry-common.S linux-4.14/arch/arm/kernel/entry-common.S
186 --- linux-4.14.orig/arch/arm/kernel/entry-common.S 2017-11-12 19:46:13.000000000 +0100
187 +++ linux-4.14/arch/arm/kernel/entry-common.S 2018-09-05 11:05:07.000000000 +0200
190 blne addr_limit_check_failed
191 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
192 - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
193 + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
194 + bne fast_work_pending
195 + tst r1, #_TIF_SECCOMP
196 bne fast_work_pending
201 blne addr_limit_check_failed
202 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
203 - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
204 + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
206 + tst r1, #_TIF_SECCOMP
210 ENDPROC(ret_fast_syscall)
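
The single "tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK" is split in two because, with TIF_SECCOMP moved up to bit 8 in the thread_info.h hunk above, the combined constant no longer fits a single rotated 8-bit ARM immediate; the logic itself is unchanged. Equivalent C, as a sketch:

    /* Same test as before, just performed in two encodable steps. */
    if (ti_flags & ((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP))
            goto fast_work_pending;
    if (ti_flags & _TIF_SECCOMP)
            goto fast_work_pending;
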
212 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/patch.c linux-4.14/arch/arm/kernel/patch.c
213 --- linux-4.14.orig/arch/arm/kernel/patch.c 2017-11-12 19:46:13.000000000 +0100
214 +++ linux-4.14/arch/arm/kernel/patch.c 2018-09-05 11:05:07.000000000 +0200
219 -static DEFINE_SPINLOCK(patch_lock);
220 +static DEFINE_RAW_SPINLOCK(patch_lock);
222 static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
223 __acquires(&patch_lock)
228 - spin_lock_irqsave(&patch_lock, *flags);
229 + raw_spin_lock_irqsave(&patch_lock, *flags);
231 __acquire(&patch_lock);
234 clear_fixmap(fixmap);
237 - spin_unlock_irqrestore(&patch_lock, *flags);
238 + raw_spin_unlock_irqrestore(&patch_lock, *flags);
240 __release(&patch_lock);
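
patch_lock protects text patching done with interrupts off, so it cannot become a sleeping lock; converting it to a raw spinlock keeps it a busy-waiting lock on PREEMPT_RT. The same spinlock_t-to-raw-spinlock conversion is applied to unwind_lock and to every SMP boot_lock in the platform files below. A minimal sketch of the pattern (the lock and function names here are illustrative):

    static DEFINE_RAW_SPINLOCK(example_lock);       /* was DEFINE_SPINLOCK() */

    static void example_atomic_section(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&example_lock, flags);
            /* short, non-sleeping critical section */
            raw_spin_unlock_irqrestore(&example_lock, flags);
    }
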
242 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/process.c linux-4.14/arch/arm/kernel/process.c
243 --- linux-4.14.orig/arch/arm/kernel/process.c 2017-11-12 19:46:13.000000000 +0100
244 +++ linux-4.14/arch/arm/kernel/process.c 2018-09-05 11:05:07.000000000 +0200
250 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
251 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
254 +static int __init vectors_user_mapping_init_page(void)
257 + unsigned long addr = 0xffff0000;
262 + pgd = pgd_offset_k(addr);
263 + pud = pud_offset(pgd, addr);
264 + pmd = pmd_offset(pud, addr);
265 + page = pmd_page(*(pmd));
267 + pgtable_page_ctor(page);
271 +late_initcall(vectors_user_mapping_init_page);
273 #ifdef CONFIG_KUSER_HELPERS
275 * The vectors page is always readable from user space for the
276 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/signal.c linux-4.14/arch/arm/kernel/signal.c
277 --- linux-4.14.orig/arch/arm/kernel/signal.c 2017-11-12 19:46:13.000000000 +0100
278 +++ linux-4.14/arch/arm/kernel/signal.c 2018-09-05 11:05:07.000000000 +0200
281 trace_hardirqs_off();
283 - if (likely(thread_flags & _TIF_NEED_RESCHED)) {
284 + if (likely(thread_flags & (_TIF_NEED_RESCHED |
285 + _TIF_NEED_RESCHED_LAZY))) {
288 if (unlikely(!user_mode(regs)))
289 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/smp.c linux-4.14/arch/arm/kernel/smp.c
290 --- linux-4.14.orig/arch/arm/kernel/smp.c 2017-11-12 19:46:13.000000000 +0100
291 +++ linux-4.14/arch/arm/kernel/smp.c 2018-09-05 11:05:07.000000000 +0200
294 local_flush_tlb_all();
296 - clear_tasks_mm_cpumask(cpu);
303 pr_debug("CPU%u: shutdown\n", cpu);
305 + clear_tasks_mm_cpumask(cpu);
307 * platform_cpu_kill() is generally expected to do the powering off
308 * and/or cutting of clocks to the dying CPU. Optionally, this may
309 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/unwind.c linux-4.14/arch/arm/kernel/unwind.c
310 --- linux-4.14.orig/arch/arm/kernel/unwind.c 2017-11-12 19:46:13.000000000 +0100
311 +++ linux-4.14/arch/arm/kernel/unwind.c 2018-09-05 11:05:07.000000000 +0200
313 static const struct unwind_idx *__origin_unwind_idx;
314 extern const struct unwind_idx __stop_unwind_idx[];
316 -static DEFINE_SPINLOCK(unwind_lock);
317 +static DEFINE_RAW_SPINLOCK(unwind_lock);
318 static LIST_HEAD(unwind_tables);
320 /* Convert a prel31 symbol to an absolute address */
322 /* module unwind tables */
323 struct unwind_table *table;
325 - spin_lock_irqsave(&unwind_lock, flags);
326 + raw_spin_lock_irqsave(&unwind_lock, flags);
327 list_for_each_entry(table, &unwind_tables, list) {
328 if (addr >= table->begin_addr &&
329 addr < table->end_addr) {
334 - spin_unlock_irqrestore(&unwind_lock, flags);
335 + raw_spin_unlock_irqrestore(&unwind_lock, flags);
338 pr_debug("%s: idx = %p\n", __func__, idx);
340 tab->begin_addr = text_addr;
341 tab->end_addr = text_addr + text_size;
343 - spin_lock_irqsave(&unwind_lock, flags);
344 + raw_spin_lock_irqsave(&unwind_lock, flags);
345 list_add_tail(&tab->list, &unwind_tables);
346 - spin_unlock_irqrestore(&unwind_lock, flags);
347 + raw_spin_unlock_irqrestore(&unwind_lock, flags);
355 - spin_lock_irqsave(&unwind_lock, flags);
356 + raw_spin_lock_irqsave(&unwind_lock, flags);
357 list_del(&tab->list);
358 - spin_unlock_irqrestore(&unwind_lock, flags);
359 + raw_spin_unlock_irqrestore(&unwind_lock, flags);
363 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-exynos/platsmp.c linux-4.14/arch/arm/mach-exynos/platsmp.c
364 --- linux-4.14.orig/arch/arm/mach-exynos/platsmp.c 2017-11-12 19:46:13.000000000 +0100
365 +++ linux-4.14/arch/arm/mach-exynos/platsmp.c 2018-09-05 11:05:07.000000000 +0200
367 return (void __iomem *)(S5P_VA_SCU);
370 -static DEFINE_SPINLOCK(boot_lock);
371 +static DEFINE_RAW_SPINLOCK(boot_lock);
373 static void exynos_secondary_init(unsigned int cpu)
377 * Synchronise with the boot thread.
379 - spin_lock(&boot_lock);
380 - spin_unlock(&boot_lock);
381 + raw_spin_lock(&boot_lock);
382 + raw_spin_unlock(&boot_lock);
385 int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
387 * Set synchronisation state between this boot processor
388 * and the secondary one
390 - spin_lock(&boot_lock);
391 + raw_spin_lock(&boot_lock);
394 * The secondary processor is waiting to be released from
398 printk(KERN_ERR "cpu1 power enable failed");
399 - spin_unlock(&boot_lock);
400 + raw_spin_unlock(&boot_lock);
405 * calibrations, then wait for it to finish
408 - spin_unlock(&boot_lock);
409 + raw_spin_unlock(&boot_lock);
411 return pen_release != -1 ? ret : 0;
413 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-hisi/platmcpm.c linux-4.14/arch/arm/mach-hisi/platmcpm.c
414 --- linux-4.14.orig/arch/arm/mach-hisi/platmcpm.c 2017-11-12 19:46:13.000000000 +0100
415 +++ linux-4.14/arch/arm/mach-hisi/platmcpm.c 2018-09-05 11:05:07.000000000 +0200
418 static void __iomem *sysctrl, *fabric;
419 static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
420 -static DEFINE_SPINLOCK(boot_lock);
421 +static DEFINE_RAW_SPINLOCK(boot_lock);
422 static u32 fabric_phys_addr;
424 * [0]: bootwrapper physical address
426 if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
429 - spin_lock_irq(&boot_lock);
430 + raw_spin_lock_irq(&boot_lock);
432 if (hip04_cpu_table[cluster][cpu])
437 hip04_cpu_table[cluster][cpu]++;
438 - spin_unlock_irq(&boot_lock);
439 + raw_spin_unlock_irq(&boot_lock);
443 @@ -162,11 +162,11 @@
444 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
445 cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
447 - spin_lock(&boot_lock);
448 + raw_spin_lock(&boot_lock);
449 hip04_cpu_table[cluster][cpu]--;
450 if (hip04_cpu_table[cluster][cpu] == 1) {
451 /* A power_up request went ahead of us. */
452 - spin_unlock(&boot_lock);
453 + raw_spin_unlock(&boot_lock);
455 } else if (hip04_cpu_table[cluster][cpu] > 1) {
456 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
460 last_man = hip04_cluster_is_down(cluster);
461 - spin_unlock(&boot_lock);
462 + raw_spin_unlock(&boot_lock);
464 /* Since it's Cortex A15, disable L2 prefetching. */
467 cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
469 count = TIMEOUT_MSEC / POLL_MSEC;
470 - spin_lock_irq(&boot_lock);
471 + raw_spin_lock_irq(&boot_lock);
472 for (tries = 0; tries < count; tries++) {
473 if (hip04_cpu_table[cluster][cpu])
475 @@ -211,10 +211,10 @@
476 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
477 if (data & CORE_WFI_STATUS(cpu))
479 - spin_unlock_irq(&boot_lock);
480 + raw_spin_unlock_irq(&boot_lock);
481 /* Wait for clean L2 when the whole cluster is down. */
483 - spin_lock_irq(&boot_lock);
484 + raw_spin_lock_irq(&boot_lock);
488 @@ -231,10 +231,10 @@
490 if (hip04_cluster_is_down(cluster))
491 hip04_set_snoop_filter(cluster, 0);
492 - spin_unlock_irq(&boot_lock);
493 + raw_spin_unlock_irq(&boot_lock);
496 - spin_unlock_irq(&boot_lock);
497 + raw_spin_unlock_irq(&boot_lock);
501 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-omap2/omap-smp.c linux-4.14/arch/arm/mach-omap2/omap-smp.c
502 --- linux-4.14.orig/arch/arm/mach-omap2/omap-smp.c 2018-09-05 11:03:20.000000000 +0200
503 +++ linux-4.14/arch/arm/mach-omap2/omap-smp.c 2018-09-05 11:05:07.000000000 +0200
505 .startup_addr = omap5_secondary_startup,
508 -static DEFINE_SPINLOCK(boot_lock);
509 +static DEFINE_RAW_SPINLOCK(boot_lock);
511 void __iomem *omap4_get_scu_base(void)
515 * Synchronise with the boot thread.
517 - spin_lock(&boot_lock);
518 - spin_unlock(&boot_lock);
519 + raw_spin_lock(&boot_lock);
520 + raw_spin_unlock(&boot_lock);
523 static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
525 * Set synchronisation state between this boot processor
526 * and the secondary one
528 - spin_lock(&boot_lock);
529 + raw_spin_lock(&boot_lock);
532 * Update the AuxCoreBoot0 with boot state for secondary core.
534 * Now the secondary core is starting up let it run its
535 * calibrations, then wait for it to finish
537 - spin_unlock(&boot_lock);
538 + raw_spin_unlock(&boot_lock);
542 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-prima2/platsmp.c linux-4.14/arch/arm/mach-prima2/platsmp.c
543 --- linux-4.14.orig/arch/arm/mach-prima2/platsmp.c 2017-11-12 19:46:13.000000000 +0100
544 +++ linux-4.14/arch/arm/mach-prima2/platsmp.c 2018-09-05 11:05:07.000000000 +0200
547 static void __iomem *clk_base;
549 -static DEFINE_SPINLOCK(boot_lock);
550 +static DEFINE_RAW_SPINLOCK(boot_lock);
552 static void sirfsoc_secondary_init(unsigned int cpu)
556 * Synchronise with the boot thread.
558 - spin_lock(&boot_lock);
559 - spin_unlock(&boot_lock);
560 + raw_spin_lock(&boot_lock);
561 + raw_spin_unlock(&boot_lock);
564 static const struct of_device_id clk_ids[] = {
566 /* make sure write buffer is drained */
569 - spin_lock(&boot_lock);
570 + raw_spin_lock(&boot_lock);
573 * The secondary processor is waiting to be released from
575 * now the secondary core is starting up let it run its
576 * calibrations, then wait for it to finish
578 - spin_unlock(&boot_lock);
579 + raw_spin_unlock(&boot_lock);
581 return pen_release != -1 ? -ENOSYS : 0;
583 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-qcom/platsmp.c linux-4.14/arch/arm/mach-qcom/platsmp.c
584 --- linux-4.14.orig/arch/arm/mach-qcom/platsmp.c 2017-11-12 19:46:13.000000000 +0100
585 +++ linux-4.14/arch/arm/mach-qcom/platsmp.c 2018-09-05 11:05:07.000000000 +0200
588 extern void secondary_startup_arm(void);
590 -static DEFINE_SPINLOCK(boot_lock);
591 +static DEFINE_RAW_SPINLOCK(boot_lock);
593 #ifdef CONFIG_HOTPLUG_CPU
594 static void qcom_cpu_die(unsigned int cpu)
597 * Synchronise with the boot thread.
599 - spin_lock(&boot_lock);
600 - spin_unlock(&boot_lock);
601 + raw_spin_lock(&boot_lock);
602 + raw_spin_unlock(&boot_lock);
605 static int scss_release_secondary(unsigned int cpu)
607 * set synchronisation state between this boot processor
608 * and the secondary one
610 - spin_lock(&boot_lock);
611 + raw_spin_lock(&boot_lock);
614 * Send the secondary CPU a soft interrupt, thereby causing
616 * now the secondary core is starting up let it run its
617 * calibrations, then wait for it to finish
619 - spin_unlock(&boot_lock);
620 + raw_spin_unlock(&boot_lock);
624 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-spear/platsmp.c linux-4.14/arch/arm/mach-spear/platsmp.c
625 --- linux-4.14.orig/arch/arm/mach-spear/platsmp.c 2017-11-12 19:46:13.000000000 +0100
626 +++ linux-4.14/arch/arm/mach-spear/platsmp.c 2018-09-05 11:05:07.000000000 +0200
628 sync_cache_w(&pen_release);
631 -static DEFINE_SPINLOCK(boot_lock);
632 +static DEFINE_RAW_SPINLOCK(boot_lock);
634 static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
638 * Synchronise with the boot thread.
640 - spin_lock(&boot_lock);
641 - spin_unlock(&boot_lock);
642 + raw_spin_lock(&boot_lock);
643 + raw_spin_unlock(&boot_lock);
646 static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
648 * set synchronisation state between this boot processor
649 * and the secondary one
651 - spin_lock(&boot_lock);
652 + raw_spin_lock(&boot_lock);
655 * The secondary processor is waiting to be released from
657 * now the secondary core is starting up let it run its
658 * calibrations, then wait for it to finish
660 - spin_unlock(&boot_lock);
661 + raw_spin_unlock(&boot_lock);
663 return pen_release != -1 ? -ENOSYS : 0;
665 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-sti/platsmp.c linux-4.14/arch/arm/mach-sti/platsmp.c
666 --- linux-4.14.orig/arch/arm/mach-sti/platsmp.c 2017-11-12 19:46:13.000000000 +0100
667 +++ linux-4.14/arch/arm/mach-sti/platsmp.c 2018-09-05 11:05:07.000000000 +0200
669 sync_cache_w(&pen_release);
672 -static DEFINE_SPINLOCK(boot_lock);
673 +static DEFINE_RAW_SPINLOCK(boot_lock);
675 static void sti_secondary_init(unsigned int cpu)
679 * Synchronise with the boot thread.
681 - spin_lock(&boot_lock);
682 - spin_unlock(&boot_lock);
683 + raw_spin_lock(&boot_lock);
684 + raw_spin_unlock(&boot_lock);
687 static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
689 * set synchronisation state between this boot processor
690 * and the secondary one
692 - spin_lock(&boot_lock);
693 + raw_spin_lock(&boot_lock);
696 * The secondary processor is waiting to be released from
698 * now the secondary core is starting up let it run its
699 * calibrations, then wait for it to finish
701 - spin_unlock(&boot_lock);
702 + raw_spin_unlock(&boot_lock);
704 return pen_release != -1 ? -ENOSYS : 0;
706 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mm/fault.c linux-4.14/arch/arm/mm/fault.c
707 --- linux-4.14.orig/arch/arm/mm/fault.c 2017-11-12 19:46:13.000000000 +0100
708 +++ linux-4.14/arch/arm/mm/fault.c 2018-09-05 11:05:07.000000000 +0200
710 if (addr < TASK_SIZE)
711 return do_page_fault(addr, fsr, regs);
713 + if (interrupts_enabled(regs))
714 + local_irq_enable();
721 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
723 + if (interrupts_enabled(regs))
724 + local_irq_enable();
726 do_bad_area(addr, fsr, regs);
729 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mm/highmem.c linux-4.14/arch/arm/mm/highmem.c
730 --- linux-4.14.orig/arch/arm/mm/highmem.c 2017-11-12 19:46:13.000000000 +0100
731 +++ linux-4.14/arch/arm/mm/highmem.c 2018-09-05 11:05:07.000000000 +0200
736 +static unsigned int fixmap_idx(int type)
738 + return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
741 void *kmap(struct page *page)
746 void *kmap_atomic(struct page *page)
748 + pte_t pte = mk_pte(page, kmap_prot);
755 + preempt_disable_nort();
757 if (!PageHighMem(page))
758 return page_address(page);
761 type = kmap_atomic_idx_push();
763 - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
764 + idx = fixmap_idx(type);
765 vaddr = __fix_to_virt(idx);
766 #ifdef CONFIG_DEBUG_HIGHMEM
769 * in place, so the contained TLB flush ensures the TLB is updated
770 * with the new mapping.
772 - set_fixmap_pte(idx, mk_pte(page, kmap_prot));
773 +#ifdef CONFIG_PREEMPT_RT_FULL
774 + current->kmap_pte[type] = pte;
776 + set_fixmap_pte(idx, pte);
778 return (void *)vaddr;
780 @@ -106,44 +115,75 @@
782 if (kvaddr >= (void *)FIXADDR_START) {
783 type = kmap_atomic_idx();
784 - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
785 + idx = fixmap_idx(type);
788 __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
789 +#ifdef CONFIG_PREEMPT_RT_FULL
790 + current->kmap_pte[type] = __pte(0);
792 #ifdef CONFIG_DEBUG_HIGHMEM
793 BUG_ON(vaddr != __fix_to_virt(idx));
794 - set_fixmap_pte(idx, __pte(0));
796 (void) idx; /* to kill a warning */
798 + set_fixmap_pte(idx, __pte(0));
799 kmap_atomic_idx_pop();
800 } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
801 /* this address was obtained through kmap_high_get() */
802 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
806 + preempt_enable_nort();
808 EXPORT_SYMBOL(__kunmap_atomic);
810 void *kmap_atomic_pfn(unsigned long pfn)
812 + pte_t pte = pfn_pte(pfn, kmap_prot);
815 struct page *page = pfn_to_page(pfn);
818 + preempt_disable_nort();
820 if (!PageHighMem(page))
821 return page_address(page);
823 type = kmap_atomic_idx_push();
824 - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
825 + idx = fixmap_idx(type);
826 vaddr = __fix_to_virt(idx);
827 #ifdef CONFIG_DEBUG_HIGHMEM
828 BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
830 - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
831 +#ifdef CONFIG_PREEMPT_RT_FULL
832 + current->kmap_pte[type] = pte;
834 + set_fixmap_pte(idx, pte);
836 return (void *)vaddr;
838 +#if defined CONFIG_PREEMPT_RT_FULL
839 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
844 + * Clear @prev's kmap_atomic mappings
846 + for (i = 0; i < prev_p->kmap_idx; i++) {
847 + int idx = fixmap_idx(i);
849 + set_fixmap_pte(idx, __pte(0));
852 + * Restore @next_p's kmap_atomic mappings
854 + for (i = 0; i < next_p->kmap_idx; i++) {
855 + int idx = fixmap_idx(i);
857 + if (!pte_none(next_p->kmap_pte[i]))
858 + set_fixmap_pte(idx, next_p->kmap_pte[i]);
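
On PREEMPT_RT_FULL, kmap_atomic() no longer disables preemption (preempt_disable_nort() is a no-op there), so a task may be scheduled out while holding atomic kmaps; the per-task kmap_pte[] copies above let switch_kmaps(), hooked into switch_to() earlier in this patch, clear the previous task's fixmap slots and re-install the next task's. From a caller's point of view nothing changes, as in this sketch (the function name is invented for illustration):

    static void copy_to_highmem_page(struct page *page, const void *buf)
    {
            void *va = kmap_atomic(page);

            /* On RT this section may be preempted; switch_kmaps() keeps the
             * fixmap slot consistent across the context switches. */
            memcpy(va, buf, PAGE_SIZE);
            kunmap_atomic(va);
    }
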
862 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/plat-versatile/platsmp.c linux-4.14/arch/arm/plat-versatile/platsmp.c
863 --- linux-4.14.orig/arch/arm/plat-versatile/platsmp.c 2017-11-12 19:46:13.000000000 +0100
864 +++ linux-4.14/arch/arm/plat-versatile/platsmp.c 2018-09-05 11:05:07.000000000 +0200
866 sync_cache_w(&pen_release);
869 -static DEFINE_SPINLOCK(boot_lock);
870 +static DEFINE_RAW_SPINLOCK(boot_lock);
872 void versatile_secondary_init(unsigned int cpu)
876 * Synchronise with the boot thread.
878 - spin_lock(&boot_lock);
879 - spin_unlock(&boot_lock);
880 + raw_spin_lock(&boot_lock);
881 + raw_spin_unlock(&boot_lock);
884 int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
886 * Set synchronisation state between this boot processor
887 * and the secondary one
889 - spin_lock(&boot_lock);
890 + raw_spin_lock(&boot_lock);
893 * This is really belt and braces; we hold unintended secondary
895 * now the secondary core is starting up let it run its
896 * calibrations, then wait for it to finish
898 - spin_unlock(&boot_lock);
899 + raw_spin_unlock(&boot_lock);
901 return pen_release != -1 ? -ENOSYS : 0;
903 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/crypto/crc32-ce-glue.c linux-4.14/arch/arm64/crypto/crc32-ce-glue.c
904 --- linux-4.14.orig/arch/arm64/crypto/crc32-ce-glue.c 2018-09-05 11:03:20.000000000 +0200
905 +++ linux-4.14/arch/arm64/crypto/crc32-ce-glue.c 2018-09-05 11:05:07.000000000 +0200
908 static int __init crc32_pmull_mod_init(void)
910 - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
911 + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
912 + !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
913 crc32_pmull_algs[0].update = crc32_pmull_update;
914 crc32_pmull_algs[1].update = crc32c_pmull_update;
916 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/crypto/Kconfig linux-4.14/arch/arm64/crypto/Kconfig
917 --- linux-4.14.orig/arch/arm64/crypto/Kconfig 2017-11-12 19:46:13.000000000 +0100
918 +++ linux-4.14/arch/arm64/crypto/Kconfig 2018-09-05 11:05:07.000000000 +0200
921 config CRYPTO_SHA1_ARM64_CE
922 tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
923 - depends on KERNEL_MODE_NEON
924 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
928 config CRYPTO_SHA2_ARM64_CE
929 tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
930 - depends on KERNEL_MODE_NEON
931 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
933 select CRYPTO_SHA256_ARM64
935 config CRYPTO_GHASH_ARM64_CE
936 tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
937 - depends on KERNEL_MODE_NEON
938 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
940 select CRYPTO_GF128MUL
944 config CRYPTO_CRCT10DIF_ARM64_CE
945 tristate "CRCT10DIF digest algorithm using PMULL instructions"
946 - depends on KERNEL_MODE_NEON && CRC_T10DIF
947 + depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
950 config CRYPTO_CRC32_ARM64_CE
953 config CRYPTO_AES_ARM64_CE
954 tristate "AES core cipher using ARMv8 Crypto Extensions"
955 - depends on ARM64 && KERNEL_MODE_NEON
956 + depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
958 select CRYPTO_AES_ARM64
960 config CRYPTO_AES_ARM64_CE_CCM
961 tristate "AES in CCM mode using ARMv8 Crypto Extensions"
962 - depends on ARM64 && KERNEL_MODE_NEON
963 + depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
965 select CRYPTO_AES_ARM64_CE
966 select CRYPTO_AES_ARM64
969 config CRYPTO_AES_ARM64_CE_BLK
970 tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
971 - depends on KERNEL_MODE_NEON
972 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
973 select CRYPTO_BLKCIPHER
974 select CRYPTO_AES_ARM64_CE
975 select CRYPTO_AES_ARM64
978 config CRYPTO_AES_ARM64_NEON_BLK
979 tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
980 - depends on KERNEL_MODE_NEON
981 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
982 select CRYPTO_BLKCIPHER
983 select CRYPTO_AES_ARM64
987 config CRYPTO_CHACHA20_NEON
988 tristate "NEON accelerated ChaCha20 symmetric cipher"
989 - depends on KERNEL_MODE_NEON
990 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
991 select CRYPTO_BLKCIPHER
992 select CRYPTO_CHACHA20
994 config CRYPTO_AES_ARM64_BS
995 tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
996 - depends on KERNEL_MODE_NEON
997 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
998 select CRYPTO_BLKCIPHER
999 select CRYPTO_AES_ARM64_NEON_BLK
1000 select CRYPTO_AES_ARM64
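
These ARMv8 Crypto Extensions and NEON ciphers are fenced off with !PREEMPT_RT_BASE for the same reason KERNEL_MODE_NEON was restricted on 32-bit ARM above: kernel-mode SIMD runs between kernel_neon_begin() and kernel_neon_end(), which keeps preemption off for the whole operation and so defeats RT latency guarantees. The pattern such drivers follow, as a sketch (not taken from any one driver):

    static void do_simd_block(void)
    {
            kernel_neon_begin();    /* preemption stays off until ..._end() */
            /* ... SIMD work over the whole request ... */
            kernel_neon_end();
    }
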
1001 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/include/asm/spinlock_types.h linux-4.14/arch/arm64/include/asm/spinlock_types.h
1002 --- linux-4.14.orig/arch/arm64/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1003 +++ linux-4.14/arch/arm64/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1005 #ifndef __ASM_SPINLOCK_TYPES_H
1006 #define __ASM_SPINLOCK_TYPES_H
1008 -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
1009 -# error "please don't include this file directly"
1012 #include <linux/types.h>
1014 #define TICKET_SHIFT 16
1015 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/include/asm/thread_info.h linux-4.14/arch/arm64/include/asm/thread_info.h
1016 --- linux-4.14.orig/arch/arm64/include/asm/thread_info.h 2018-09-05 11:03:20.000000000 +0200
1017 +++ linux-4.14/arch/arm64/include/asm/thread_info.h 2018-09-05 11:05:07.000000000 +0200
1019 u64 ttbr0; /* saved TTBR0_EL1 */
1021 int preempt_count; /* 0 => preemptable, <0 => bug */
1022 + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1025 #define INIT_THREAD_INFO(tsk) \
1027 #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
1028 #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
1029 #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */
1030 +#define TIF_NEED_RESCHED_LAZY 6
1032 #define TIF_SYSCALL_TRACE 8
1033 #define TIF_SYSCALL_AUDIT 9
1035 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
1036 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
1037 #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
1038 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1039 #define _TIF_NOHZ (1 << TIF_NOHZ)
1040 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
1041 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
1044 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1045 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1046 - _TIF_UPROBE | _TIF_FSCHECK)
1047 + _TIF_UPROBE | _TIF_FSCHECK | _TIF_NEED_RESCHED_LAZY)
1049 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1050 #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1051 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1053 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/Kconfig linux-4.14/arch/arm64/Kconfig
1054 --- linux-4.14.orig/arch/arm64/Kconfig 2018-09-05 11:03:20.000000000 +0200
1055 +++ linux-4.14/arch/arm64/Kconfig 2018-09-05 11:05:07.000000000 +0200
1057 select HAVE_PERF_EVENTS
1058 select HAVE_PERF_REGS
1059 select HAVE_PERF_USER_STACK_DUMP
1060 + select HAVE_PREEMPT_LAZY
1061 select HAVE_REGS_AND_STACK_ACCESS_API
1062 select HAVE_RCU_TABLE_FREE
1063 select HAVE_SYSCALL_TRACEPOINTS
1067 bool "Xen guest support on ARM64"
1068 - depends on ARM64 && OF
1069 + depends on ARM64 && OF && !PREEMPT_RT_FULL
1073 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/kernel/asm-offsets.c linux-4.14/arch/arm64/kernel/asm-offsets.c
1074 --- linux-4.14.orig/arch/arm64/kernel/asm-offsets.c 2018-09-05 11:03:20.000000000 +0200
1075 +++ linux-4.14/arch/arm64/kernel/asm-offsets.c 2018-09-05 11:05:07.000000000 +0200
1078 DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
1079 DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
1080 + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count));
1081 DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit));
1082 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
1083 DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
1084 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/kernel/entry.S linux-4.14/arch/arm64/kernel/entry.S
1085 --- linux-4.14.orig/arch/arm64/kernel/entry.S 2018-09-05 11:03:20.000000000 +0200
1086 +++ linux-4.14/arch/arm64/kernel/entry.S 2018-09-05 11:05:07.000000000 +0200
1087 @@ -637,11 +637,16 @@
1089 #ifdef CONFIG_PREEMPT
1090 ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count
1091 - cbnz w24, 1f // preempt count != 0
1092 + cbnz w24, 2f // preempt count != 0
1093 ldr x0, [tsk, #TSK_TI_FLAGS] // get flags
1094 - tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
1096 + tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
1098 + ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count
1099 + cbnz w24, 2f // preempt lazy count != 0
1100 + tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling?
1105 #ifdef CONFIG_TRACE_IRQFLAGS
1106 bl trace_hardirqs_on
1108 1: bl preempt_schedule_irq // irq en/disable is done inside
1109 ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS
1110 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
1111 + tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling?
1115 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/kernel/signal.c linux-4.14/arch/arm64/kernel/signal.c
1116 --- linux-4.14.orig/arch/arm64/kernel/signal.c 2018-09-05 11:03:20.000000000 +0200
1117 +++ linux-4.14/arch/arm64/kernel/signal.c 2018-09-05 11:05:07.000000000 +0200
1119 /* Check valid user FS if needed */
1120 addr_limit_user_check();
1122 - if (thread_flags & _TIF_NEED_RESCHED) {
1123 + if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1127 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/blackfin/include/asm/spinlock_types.h linux-4.14/arch/blackfin/include/asm/spinlock_types.h
1128 --- linux-4.14.orig/arch/blackfin/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1129 +++ linux-4.14/arch/blackfin/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1131 #ifndef __ASM_SPINLOCK_TYPES_H
1132 #define __ASM_SPINLOCK_TYPES_H
1134 -#ifndef __LINUX_SPINLOCK_TYPES_H
1135 -# error "please don't include this file directly"
1138 #include <asm/rwlock.h>
1141 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/hexagon/include/asm/spinlock_types.h linux-4.14/arch/hexagon/include/asm/spinlock_types.h
1142 --- linux-4.14.orig/arch/hexagon/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1143 +++ linux-4.14/arch/hexagon/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1145 #ifndef _ASM_SPINLOCK_TYPES_H
1146 #define _ASM_SPINLOCK_TYPES_H
1148 -#ifndef __LINUX_SPINLOCK_TYPES_H
1149 -# error "please don't include this file directly"
1153 volatile unsigned int lock;
1155 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/ia64/include/asm/spinlock_types.h linux-4.14/arch/ia64/include/asm/spinlock_types.h
1156 --- linux-4.14.orig/arch/ia64/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1157 +++ linux-4.14/arch/ia64/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1159 #ifndef _ASM_IA64_SPINLOCK_TYPES_H
1160 #define _ASM_IA64_SPINLOCK_TYPES_H
1162 -#ifndef __LINUX_SPINLOCK_TYPES_H
1163 -# error "please don't include this file directly"
1167 volatile unsigned int lock;
1169 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/ia64/kernel/mca.c linux-4.14/arch/ia64/kernel/mca.c
1170 --- linux-4.14.orig/arch/ia64/kernel/mca.c 2017-11-12 19:46:13.000000000 +0100
1171 +++ linux-4.14/arch/ia64/kernel/mca.c 2018-09-05 11:05:07.000000000 +0200
1172 @@ -1824,7 +1824,7 @@
1175 p->state = TASK_UNINTERRUPTIBLE;
1176 - cpumask_set_cpu(cpu, &p->cpus_allowed);
1177 + cpumask_set_cpu(cpu, &p->cpus_mask);
1178 INIT_LIST_HEAD(&p->tasks);
1179 p->parent = p->real_parent = p->group_leader = p;
1180 INIT_LIST_HEAD(&p->children);
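
This is the first of several hunks that stop touching p->cpus_allowed directly: writers now use the renamed p->cpus_mask, and readers (the mips, spufs and tile hunks below) go through p->cpus_ptr or p->nr_cpus_allowed, so that RT's migrate_disable() can temporarily point a task at a one-CPU mask without altering its real affinity. The reader-side shape, as a hedged sketch (the helper name is invented):

    /* Read affinity through the pointer, not the embedded mask. */
    static bool task_allowed_on(struct task_struct *p, int cpu)
    {
            return cpumask_test_cpu(cpu, p->cpus_ptr);
    }
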
1181 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/Kconfig linux-4.14/arch/Kconfig
1182 --- linux-4.14.orig/arch/Kconfig 2018-09-05 11:03:20.000000000 +0200
1183 +++ linux-4.14/arch/Kconfig 2018-09-05 11:05:07.000000000 +0200
1185 tristate "OProfile system profiling"
1186 depends on PROFILING
1187 depends on HAVE_OPROFILE
1188 + depends on !PREEMPT_RT_FULL
1190 select RING_BUFFER_ALLOW_SWAP
1192 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/m32r/include/asm/spinlock_types.h linux-4.14/arch/m32r/include/asm/spinlock_types.h
1193 --- linux-4.14.orig/arch/m32r/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1194 +++ linux-4.14/arch/m32r/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1196 #ifndef _ASM_M32R_SPINLOCK_TYPES_H
1197 #define _ASM_M32R_SPINLOCK_TYPES_H
1199 -#ifndef __LINUX_SPINLOCK_TYPES_H
1200 -# error "please don't include this file directly"
1206 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/metag/include/asm/spinlock_types.h linux-4.14/arch/metag/include/asm/spinlock_types.h
1207 --- linux-4.14.orig/arch/metag/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1208 +++ linux-4.14/arch/metag/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1210 #ifndef _ASM_METAG_SPINLOCK_TYPES_H
1211 #define _ASM_METAG_SPINLOCK_TYPES_H
1213 -#ifndef __LINUX_SPINLOCK_TYPES_H
1214 -# error "please don't include this file directly"
1218 volatile unsigned int lock;
1220 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/include/asm/switch_to.h linux-4.14/arch/mips/include/asm/switch_to.h
1221 --- linux-4.14.orig/arch/mips/include/asm/switch_to.h 2017-11-12 19:46:13.000000000 +0100
1222 +++ linux-4.14/arch/mips/include/asm/switch_to.h 2018-09-05 11:05:07.000000000 +0200
1224 * inline to try to keep the overhead down. If we have been forced to run on
1225 * a "CPU" with an FPU because of a previous high level of FP computation,
1226 * but did not actually use the FPU during the most recent time-slice (CU1
1227 - * isn't set), we undo the restriction on cpus_allowed.
1228 + * isn't set), we undo the restriction on cpus_mask.
1230 * We're not calling set_cpus_allowed() here, because we have no need to
1231 * force prompt migration - we're already switching the current CPU to a
1233 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \
1234 (!(KSTK_STATUS(prev) & ST0_CU1))) { \
1235 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \
1236 - prev->cpus_allowed = prev->thread.user_cpus_allowed; \
1237 + prev->cpus_mask = prev->thread.user_cpus_allowed; \
1239 next->thread.emulated_fp = 0; \
1241 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/Kconfig linux-4.14/arch/mips/Kconfig
1242 --- linux-4.14.orig/arch/mips/Kconfig 2018-09-05 11:03:20.000000000 +0200
1243 +++ linux-4.14/arch/mips/Kconfig 2018-09-05 11:05:07.000000000 +0200
1244 @@ -2519,7 +2519,7 @@
1247 bool "High Memory Support"
1248 - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1249 + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1251 config CPU_SUPPORTS_HIGHMEM
1253 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/kernel/mips-mt-fpaff.c linux-4.14/arch/mips/kernel/mips-mt-fpaff.c
1254 --- linux-4.14.orig/arch/mips/kernel/mips-mt-fpaff.c 2017-11-12 19:46:13.000000000 +0100
1255 +++ linux-4.14/arch/mips/kernel/mips-mt-fpaff.c 2018-09-05 11:05:07.000000000 +0200
1260 - cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
1261 + cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
1262 cpumask_and(&mask, &allowed, cpu_active_mask);
1265 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/kernel/traps.c linux-4.14/arch/mips/kernel/traps.c
1266 --- linux-4.14.orig/arch/mips/kernel/traps.c 2018-09-05 11:03:20.000000000 +0200
1267 +++ linux-4.14/arch/mips/kernel/traps.c 2018-09-05 11:05:07.000000000 +0200
1268 @@ -1193,12 +1193,12 @@
1269 * restricted the allowed set to exclude any CPUs with FPUs,
1270 * we'll skip the procedure.
1272 - if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
1273 + if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
1276 current->thread.user_cpus_allowed
1277 - = current->cpus_allowed;
1278 - cpumask_and(&tmask, &current->cpus_allowed,
1279 + = current->cpus_mask;
1280 + cpumask_and(&tmask, &current->cpus_mask,
1282 set_cpus_allowed_ptr(current, &tmask);
1283 set_thread_flag(TIF_FPUBOUND);
1284 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mn10300/include/asm/spinlock_types.h linux-4.14/arch/mn10300/include/asm/spinlock_types.h
1285 --- linux-4.14.orig/arch/mn10300/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1286 +++ linux-4.14/arch/mn10300/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1288 #ifndef _ASM_SPINLOCK_TYPES_H
1289 #define _ASM_SPINLOCK_TYPES_H
1291 -#ifndef __LINUX_SPINLOCK_TYPES_H
1292 -# error "please don't include this file directly"
1295 typedef struct arch_spinlock {
1298 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/include/asm/spinlock_types.h linux-4.14/arch/powerpc/include/asm/spinlock_types.h
1299 --- linux-4.14.orig/arch/powerpc/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1300 +++ linux-4.14/arch/powerpc/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1302 #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H
1303 #define _ASM_POWERPC_SPINLOCK_TYPES_H
1305 -#ifndef __LINUX_SPINLOCK_TYPES_H
1306 -# error "please don't include this file directly"
1310 volatile unsigned int slock;
1312 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/include/asm/thread_info.h linux-4.14/arch/powerpc/include/asm/thread_info.h
1313 --- linux-4.14.orig/arch/powerpc/include/asm/thread_info.h 2017-11-12 19:46:13.000000000 +0100
1314 +++ linux-4.14/arch/powerpc/include/asm/thread_info.h 2018-09-05 11:05:07.000000000 +0200
1316 int cpu; /* cpu we're on */
1317 int preempt_count; /* 0 => preemptable,
1319 + int preempt_lazy_count; /* 0 => preemptable,
1321 unsigned long local_flags; /* private flags for thread */
1322 #ifdef CONFIG_LIVEPATCH
1323 unsigned long *livepatch_sp;
1325 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
1326 #define TIF_SIGPENDING 1 /* signal pending */
1327 #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
1328 -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
1329 - TIF_NEED_RESCHED */
1330 +#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */
1331 #define TIF_32BIT 4 /* 32 bit binary */
1332 #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
1333 #define TIF_PATCH_PENDING 6 /* pending live patching update */
1335 #if defined(CONFIG_PPC64)
1336 #define TIF_ELF2ABI 18 /* function descriptors must die! */
1338 +#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling
1339 + TIF_NEED_RESCHED */
1341 /* as above, but as bit values */
1342 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
1343 @@ -120,14 +123,16 @@
1344 #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
1345 #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE)
1346 #define _TIF_NOHZ (1<<TIF_NOHZ)
1347 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1348 #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1349 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1352 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1353 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1354 - _TIF_RESTORE_TM | _TIF_PATCH_PENDING)
1355 + _TIF_RESTORE_TM | _TIF_PATCH_PENDING | _TIF_NEED_RESCHED_LAZY)
1356 #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
1357 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1359 /* Bits in local_flags */
1360 /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1361 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/Kconfig linux-4.14/arch/powerpc/Kconfig
1362 --- linux-4.14.orig/arch/powerpc/Kconfig 2018-09-05 11:03:20.000000000 +0200
1363 +++ linux-4.14/arch/powerpc/Kconfig 2018-09-05 11:05:07.000000000 +0200
1364 @@ -111,10 +111,11 @@
1366 config RWSEM_GENERIC_SPINLOCK
1368 + default y if PREEMPT_RT_FULL
1370 config RWSEM_XCHGADD_ALGORITHM
1373 + default y if !PREEMPT_RT_FULL
1375 config GENERIC_LOCKBREAK
1378 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
1379 select HAVE_PERF_REGS
1380 select HAVE_PERF_USER_STACK_DUMP
1381 + select HAVE_PREEMPT_LAZY
1382 select HAVE_RCU_TABLE_FREE if SMP
1383 select HAVE_REGS_AND_STACK_ACCESS_API
1384 select HAVE_SYSCALL_TRACEPOINTS
1388 bool "High memory support"
1390 + depends on PPC32 && !PREEMPT_RT_FULL
1392 source kernel/Kconfig.hz
1393 source kernel/Kconfig.preempt
1394 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/asm-offsets.c linux-4.14/arch/powerpc/kernel/asm-offsets.c
1395 --- linux-4.14.orig/arch/powerpc/kernel/asm-offsets.c 2018-09-05 11:03:20.000000000 +0200
1396 +++ linux-4.14/arch/powerpc/kernel/asm-offsets.c 2018-09-05 11:05:07.000000000 +0200
1398 OFFSET(TI_FLAGS, thread_info, flags);
1399 OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
1400 OFFSET(TI_PREEMPT, thread_info, preempt_count);
1401 + OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count);
1402 OFFSET(TI_TASK, thread_info, task);
1403 OFFSET(TI_CPU, thread_info, cpu);
1405 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/entry_32.S linux-4.14/arch/powerpc/kernel/entry_32.S
1406 --- linux-4.14.orig/arch/powerpc/kernel/entry_32.S 2017-11-12 19:46:13.000000000 +0100
1407 +++ linux-4.14/arch/powerpc/kernel/entry_32.S 2018-09-05 11:05:07.000000000 +0200
1408 @@ -866,7 +866,14 @@
1409 cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
1411 andi. r8,r8,_TIF_NEED_RESCHED
1413 + lwz r0,TI_PREEMPT_LAZY(r9)
1414 + cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
1416 + lwz r0,TI_FLAGS(r9)
1417 + andi. r0,r0,_TIF_NEED_RESCHED_LAZY
1421 andi. r0,r3,MSR_EE /* interrupts off? */
1422 beq restore /* don't schedule if so */
1423 @@ -877,11 +884,11 @@
1425 bl trace_hardirqs_off
1427 -1: bl preempt_schedule_irq
1428 +2: bl preempt_schedule_irq
1429 CURRENT_THREAD_INFO(r9, r1)
1431 - andi. r0,r3,_TIF_NEED_RESCHED
1433 + andi. r0,r3,_TIF_NEED_RESCHED_MASK
1435 #ifdef CONFIG_TRACE_IRQFLAGS
1436 /* And now, to properly rebalance the above, we tell lockdep they
1437 * are being turned back on, which will happen when we return
1438 @@ -1204,7 +1211,7 @@
1439 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1441 do_work: /* r10 contains MSR_KERNEL here */
1442 - andi. r0,r9,_TIF_NEED_RESCHED
1443 + andi. r0,r9,_TIF_NEED_RESCHED_MASK
1446 do_resched: /* r10 contains MSR_KERNEL here */
1447 @@ -1225,7 +1232,7 @@
1448 MTMSRD(r10) /* disable interrupts */
1449 CURRENT_THREAD_INFO(r9, r1)
1451 - andi. r0,r9,_TIF_NEED_RESCHED
1452 + andi. r0,r9,_TIF_NEED_RESCHED_MASK
1454 andi. r0,r9,_TIF_USER_WORK_MASK
1456 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/entry_64.S linux-4.14/arch/powerpc/kernel/entry_64.S
1457 --- linux-4.14.orig/arch/powerpc/kernel/entry_64.S 2018-09-05 11:03:20.000000000 +0200
1458 +++ linux-4.14/arch/powerpc/kernel/entry_64.S 2018-09-05 11:05:07.000000000 +0200
1463 -1: andi. r0,r4,_TIF_NEED_RESCHED
1464 +1: andi. r0,r4,_TIF_NEED_RESCHED_MASK
1466 bl restore_interrupts
1468 @@ -752,10 +752,18 @@
1470 #ifdef CONFIG_PREEMPT
1471 /* Check if we need to preempt */
1472 + lwz r8,TI_PREEMPT(r9)
1473 + cmpwi 0,r8,0 /* if non-zero, just restore regs and return */
1475 andi. r0,r4,_TIF_NEED_RESCHED
1478 + andi. r0,r4,_TIF_NEED_RESCHED_LAZY
1480 + lwz r8,TI_PREEMPT_LAZY(r9)
1482 /* Check that preempt_count() == 0 and interrupts are enabled */
1483 - lwz r8,TI_PREEMPT(r9)
1489 /* Re-test flags and eventually loop */
1490 CURRENT_THREAD_INFO(r9, r1)
1492 - andi. r0,r4,_TIF_NEED_RESCHED
1493 + andi. r0,r4,_TIF_NEED_RESCHED_MASK
1497 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/irq.c linux-4.14/arch/powerpc/kernel/irq.c
1498 --- linux-4.14.orig/arch/powerpc/kernel/irq.c 2018-09-05 11:03:20.000000000 +0200
1499 +++ linux-4.14/arch/powerpc/kernel/irq.c 2018-09-05 11:05:07.000000000 +0200
1504 +#ifndef CONFIG_PREEMPT_RT_FULL
1505 void do_softirq_own_stack(void)
1507 struct thread_info *curtp, *irqtp;
1510 set_bits(irqtp->flags, &curtp->flags);
1514 irq_hw_number_t virq_to_hw(unsigned int virq)
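
do_softirq_own_stack() and its assembly helpers (call_do_softirq in misc_32.S/misc_64.S below, plus the sh and sparc variants further down) are compiled out on PREEMPT_RT_FULL: there, softirqs are processed in thread context, so nothing ever switches to a dedicated softirq stack and these entry points would be dead code. The guard pattern, in isolation:

    #ifndef CONFIG_PREEMPT_RT_FULL
    void do_softirq_own_stack(void)
    {
            /* switch to the per-CPU softirq stack and run __do_softirq() */
    }
    #endif
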
1516 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/misc_32.S linux-4.14/arch/powerpc/kernel/misc_32.S
1517 --- linux-4.14.orig/arch/powerpc/kernel/misc_32.S 2017-11-12 19:46:13.000000000 +0100
1518 +++ linux-4.14/arch/powerpc/kernel/misc_32.S 2018-09-05 11:05:07.000000000 +0200
1520 * We store the saved ksp_limit in the unused part
1521 * of the STACK_FRAME_OVERHEAD
1523 +#ifndef CONFIG_PREEMPT_RT_FULL
1524 _GLOBAL(call_do_softirq)
1528 stw r10,THREAD+KSP_LIMIT(r2)
1534 * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1535 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/misc_64.S linux-4.14/arch/powerpc/kernel/misc_64.S
1536 --- linux-4.14.orig/arch/powerpc/kernel/misc_64.S 2018-09-05 11:03:20.000000000 +0200
1537 +++ linux-4.14/arch/powerpc/kernel/misc_64.S 2018-09-05 11:05:07.000000000 +0200
1542 +#ifndef CONFIG_PREEMPT_RT_FULL
1543 _GLOBAL(call_do_softirq)
1552 _GLOBAL(call_do_irq)
1554 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kvm/Kconfig linux-4.14/arch/powerpc/kvm/Kconfig
1555 --- linux-4.14.orig/arch/powerpc/kvm/Kconfig 2018-09-05 11:03:20.000000000 +0200
1556 +++ linux-4.14/arch/powerpc/kvm/Kconfig 2018-09-05 11:05:07.000000000 +0200
1559 bool "KVM in-kernel MPIC emulation"
1560 depends on KVM && E500
1561 + depends on !PREEMPT_RT_FULL
1562 select HAVE_KVM_IRQCHIP
1563 select HAVE_KVM_IRQFD
1564 select HAVE_KVM_IRQ_ROUTING
1565 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/platforms/cell/spufs/sched.c linux-4.14/arch/powerpc/platforms/cell/spufs/sched.c
1566 --- linux-4.14.orig/arch/powerpc/platforms/cell/spufs/sched.c 2017-11-12 19:46:13.000000000 +0100
1567 +++ linux-4.14/arch/powerpc/platforms/cell/spufs/sched.c 2018-09-05 11:05:07.000000000 +0200
1569 * runqueue. The context will be rescheduled on the proper node
1570 * if it is timesliced or preempted.
1572 - cpumask_copy(&ctx->cpus_allowed, ¤t->cpus_allowed);
1573 + cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
1575 /* Save the current cpu id for spu interrupt routing. */
1576 ctx->last_ran = raw_smp_processor_id();
1577 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/platforms/ps3/device-init.c linux-4.14/arch/powerpc/platforms/ps3/device-init.c
1578 --- linux-4.14.orig/arch/powerpc/platforms/ps3/device-init.c 2017-11-12 19:46:13.000000000 +0100
1579 +++ linux-4.14/arch/powerpc/platforms/ps3/device-init.c 2018-09-05 11:05:07.000000000 +0200
1582 pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1584 - res = wait_event_interruptible(dev->done.wait,
1585 + res = swait_event_interruptible(dev->done.wait,
1586 dev->done.done || kthread_should_stop());
1587 if (kthread_should_stop())
1589 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/s390/include/asm/spinlock_types.h linux-4.14/arch/s390/include/asm/spinlock_types.h
1590 --- linux-4.14.orig/arch/s390/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1591 +++ linux-4.14/arch/s390/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1593 #ifndef __ASM_SPINLOCK_TYPES_H
1594 #define __ASM_SPINLOCK_TYPES_H
1596 -#ifndef __LINUX_SPINLOCK_TYPES_H
1597 -# error "please don't include this file directly"
1602 } __attribute__ ((aligned (4))) arch_spinlock_t;
1603 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sh/include/asm/spinlock_types.h linux-4.14/arch/sh/include/asm/spinlock_types.h
1604 --- linux-4.14.orig/arch/sh/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1605 +++ linux-4.14/arch/sh/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1607 #ifndef __ASM_SH_SPINLOCK_TYPES_H
1608 #define __ASM_SH_SPINLOCK_TYPES_H
1610 -#ifndef __LINUX_SPINLOCK_TYPES_H
1611 -# error "please don't include this file directly"
1615 volatile unsigned int lock;
1617 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sh/kernel/irq.c linux-4.14/arch/sh/kernel/irq.c
1618 --- linux-4.14.orig/arch/sh/kernel/irq.c 2017-11-12 19:46:13.000000000 +0100
1619 +++ linux-4.14/arch/sh/kernel/irq.c 2018-09-05 11:05:07.000000000 +0200
1621 hardirq_ctx[cpu] = NULL;
1624 +#ifndef CONFIG_PREEMPT_RT_FULL
1625 void do_softirq_own_stack(void)
1627 struct thread_info *curctx;
1629 "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1634 static inline void handle_one_irq(unsigned int irq)
1636 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sparc/Kconfig linux-4.14/arch/sparc/Kconfig
1637 --- linux-4.14.orig/arch/sparc/Kconfig 2017-11-12 19:46:13.000000000 +0100
1638 +++ linux-4.14/arch/sparc/Kconfig 2018-09-05 11:05:07.000000000 +0200
1639 @@ -206,12 +206,10 @@
1640 source kernel/Kconfig.hz
1642 config RWSEM_GENERIC_SPINLOCK
1644 - default y if SPARC32
1645 + def_bool PREEMPT_RT_FULL
1647 config RWSEM_XCHGADD_ALGORITHM
1649 - default y if SPARC64
1650 + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1652 config GENERIC_HWEIGHT
1654 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sparc/kernel/irq_64.c linux-4.14/arch/sparc/kernel/irq_64.c
1655 --- linux-4.14.orig/arch/sparc/kernel/irq_64.c 2017-11-12 19:46:13.000000000 +0100
1656 +++ linux-4.14/arch/sparc/kernel/irq_64.c 2018-09-05 11:05:07.000000000 +0200
1658 set_irq_regs(old_regs);
1661 +#ifndef CONFIG_PREEMPT_RT_FULL
1662 void do_softirq_own_stack(void)
1664 void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1666 __asm__ __volatile__("mov %0, %%sp"
1671 #ifdef CONFIG_HOTPLUG_CPU
1672 void fixup_irqs(void)
1673 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/tile/include/asm/setup.h linux-4.14/arch/tile/include/asm/setup.h
1674 --- linux-4.14.orig/arch/tile/include/asm/setup.h 2017-11-12 19:46:13.000000000 +0100
1675 +++ linux-4.14/arch/tile/include/asm/setup.h 2018-09-05 11:05:07.000000000 +0200
1678 /* Hook hardwall code into changes in affinity. */
1679 #define arch_set_cpus_allowed(p, new_mask) do { \
1680 - if (!cpumask_equal(&p->cpus_allowed, new_mask)) \
1681 + if (!cpumask_equal(p->cpus_ptr, new_mask)) \
1682 hardwall_deactivate_all(p); \
1685 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/tile/include/asm/spinlock_types.h linux-4.14/arch/tile/include/asm/spinlock_types.h
1686 --- linux-4.14.orig/arch/tile/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1687 +++ linux-4.14/arch/tile/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1689 #ifndef _ASM_TILE_SPINLOCK_TYPES_H
1690 #define _ASM_TILE_SPINLOCK_TYPES_H
1692 -#ifndef __LINUX_SPINLOCK_TYPES_H
1693 -# error "please don't include this file directly"
1698 /* Low 15 bits are "next"; high 15 bits are "current". */
1699 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/tile/kernel/hardwall.c linux-4.14/arch/tile/kernel/hardwall.c
1700 --- linux-4.14.orig/arch/tile/kernel/hardwall.c 2017-11-12 19:46:13.000000000 +0100
1701 +++ linux-4.14/arch/tile/kernel/hardwall.c 2018-09-05 11:05:07.000000000 +0200
1702 @@ -590,12 +590,12 @@
1703 * Get our affinity; if we're not bound to this tile uniquely,
1704 * we can't access the network registers.
1706 - if (cpumask_weight(&p->cpus_allowed) != 1)
1707 + if (p->nr_cpus_allowed != 1)
1710 /* Make sure we are bound to a cpu assigned to this resource. */
1711 cpu = smp_processor_id();
1712 - BUG_ON(cpumask_first(&p->cpus_allowed) != cpu);
1713 + BUG_ON(cpumask_first(p->cpus_ptr) != cpu);
1714 if (!cpumask_test_cpu(cpu, &info->cpumask))
1717 @@ -621,17 +621,17 @@
1718 * Deactivate a task's hardwall. Must hold lock for hardwall_type.
1719 * This method may be called from exit_thread(), so we don't want to
1720 * rely on too many fields of struct task_struct still being valid.
1721 - * We assume the cpus_allowed, pid, and comm fields are still valid.
1722 + * We assume the nr_cpus_allowed, pid, and comm fields are still valid.
1724 static void _hardwall_deactivate(struct hardwall_type *hwt,
1725 struct task_struct *task)
1727 struct thread_struct *ts = &task->thread;
1729 - if (cpumask_weight(&task->cpus_allowed) != 1) {
1730 + if (task->nr_cpus_allowed != 1) {
1731 pr_err("pid %d (%s) releasing %s hardwall with an affinity mask containing %d cpus!\n",
1732 task->pid, task->comm, hwt->name,
1733 - cpumask_weight(&task->cpus_allowed));
1734 + task->nr_cpus_allowed);
1738 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/aesni-intel_glue.c linux-4.14/arch/x86/crypto/aesni-intel_glue.c
1739 --- linux-4.14.orig/arch/x86/crypto/aesni-intel_glue.c 2018-09-05 11:03:20.000000000 +0200
1740 +++ linux-4.14/arch/x86/crypto/aesni-intel_glue.c 2018-09-05 11:05:07.000000000 +0200
1741 @@ -387,14 +387,14 @@
1743 err = skcipher_walk_virt(&walk, req, true);
1745 - kernel_fpu_begin();
1746 while ((nbytes = walk.nbytes)) {
1747 + kernel_fpu_begin();
1748 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1749 nbytes & AES_BLOCK_MASK);
1751 nbytes &= AES_BLOCK_SIZE - 1;
1752 err = skcipher_walk_done(&walk, nbytes);
1758 @@ -409,14 +409,14 @@
1760 err = skcipher_walk_virt(&walk, req, true);
1762 - kernel_fpu_begin();
1763 while ((nbytes = walk.nbytes)) {
1764 + kernel_fpu_begin();
1765 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1766 nbytes & AES_BLOCK_MASK);
1768 nbytes &= AES_BLOCK_SIZE - 1;
1769 err = skcipher_walk_done(&walk, nbytes);
1775 @@ -431,14 +431,14 @@
1777 err = skcipher_walk_virt(&walk, req, true);
1779 - kernel_fpu_begin();
1780 while ((nbytes = walk.nbytes)) {
1781 + kernel_fpu_begin();
1782 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1783 nbytes & AES_BLOCK_MASK, walk.iv);
1785 nbytes &= AES_BLOCK_SIZE - 1;
1786 err = skcipher_walk_done(&walk, nbytes);
1792 @@ -453,14 +453,14 @@
1794 err = skcipher_walk_virt(&walk, req, true);
1796 - kernel_fpu_begin();
1797 while ((nbytes = walk.nbytes)) {
1798 + kernel_fpu_begin();
1799 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1800 nbytes & AES_BLOCK_MASK, walk.iv);
1802 nbytes &= AES_BLOCK_SIZE - 1;
1803 err = skcipher_walk_done(&walk, nbytes);
1809 @@ -510,18 +510,20 @@
1811 err = skcipher_walk_virt(&walk, req, true);
1813 - kernel_fpu_begin();
1814 while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1815 + kernel_fpu_begin();
1816 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1817 nbytes & AES_BLOCK_MASK, walk.iv);
1819 nbytes &= AES_BLOCK_SIZE - 1;
1820 err = skcipher_walk_done(&walk, nbytes);
1823 + kernel_fpu_begin();
1824 ctr_crypt_final(ctx, &walk);
1826 err = skcipher_walk_done(&walk, 0);
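All of the aesni hunks above apply the same transformation: kernel_fpu_begin() moves from in front of the skcipher walk loop to the top of each iteration, and the matching kernel_fpu_end() (in the elided context lines) moves to just after each cipher call, so the FPU, and with it preemption on PREEMPT_RT, is only held per chunk. A minimal sketch of the per-chunk shape the ECB encrypt path converges on; this is an abridged illustration, not a verbatim quote of the patched file:

	err = skcipher_walk_virt(&walk, req, true);

	while ((nbytes = walk.nbytes)) {
		kernel_fpu_begin();	/* FPU section covers one chunk only */
		aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			      nbytes & AES_BLOCK_MASK);
		kernel_fpu_end();	/* reschedule is possible before the next chunk */
		nbytes &= AES_BLOCK_SIZE - 1;
		err = skcipher_walk_done(&walk, nbytes);
	}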
1832 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx2_glue.c linux-4.14/arch/x86/crypto/camellia_aesni_avx2_glue.c
1833 --- linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx2_glue.c 2017-11-12 19:46:13.000000000 +0100
1834 +++ linux-4.14/arch/x86/crypto/camellia_aesni_avx2_glue.c 2018-09-05 11:05:07.000000000 +0200
1835 @@ -206,6 +206,20 @@
1839 +#ifdef CONFIG_PREEMPT_RT_FULL
1840 +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
1842 + bool fpu_enabled = ctx->fpu_enabled;
1846 + camellia_fpu_end(fpu_enabled);
1847 + ctx->fpu_enabled = false;
1850 +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
1853 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1855 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1856 @@ -221,16 +235,19 @@
1859 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
1860 + kernel_fpu_resched();
1861 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
1862 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1863 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1866 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1867 + kernel_fpu_resched();
1868 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
1869 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1870 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1872 + camellia_fpu_end_rt(ctx);
1874 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1875 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
1876 @@ -251,16 +268,19 @@
1879 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
1880 + kernel_fpu_resched();
1881 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
1882 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1883 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1886 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1887 + kernel_fpu_resched();
1888 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
1889 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1890 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1892 + camellia_fpu_end_rt(ctx);
1894 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1895 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
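The camellia_fpu_end_rt() bodies above (and the analogous cast6/serpent/twofish helpers further down) are shown with a couple of context lines elided; as a sketch, the complete helper is just an early return when the FPU section was never opened, plus the RT-only bookkeeping:

#ifdef CONFIG_PREEMPT_RT_FULL
static void camellia_fpu_end_rt(struct crypt_priv *ctx)
{
	bool fpu_enabled = ctx->fpu_enabled;

	if (!fpu_enabled)		/* nothing to undo */
		return;
	camellia_fpu_end(fpu_enabled);
	ctx->fpu_enabled = false;
}
#else
static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
#endif

On !RT kernels the helper is a no-op and the FPU section is closed by the caller, as before.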
1896 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx_glue.c linux-4.14/arch/x86/crypto/camellia_aesni_avx_glue.c
1897 --- linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx_glue.c 2017-11-12 19:46:13.000000000 +0100
1898 +++ linux-4.14/arch/x86/crypto/camellia_aesni_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
1899 @@ -210,6 +210,21 @@
1903 +#ifdef CONFIG_PREEMPT_RT_FULL
1904 +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
1906 + bool fpu_enabled = ctx->fpu_enabled;
1910 + camellia_fpu_end(fpu_enabled);
1911 + ctx->fpu_enabled = false;
1915 +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
1918 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1920 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1921 @@ -225,10 +240,12 @@
1924 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1925 + kernel_fpu_resched();
1926 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
1927 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1928 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1930 + camellia_fpu_end_rt(ctx);
1932 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1933 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
1934 @@ -249,10 +266,12 @@
1937 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1938 + kernel_fpu_resched();
1939 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
1940 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1941 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1943 + camellia_fpu_end_rt(ctx);
1945 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1946 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
1947 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/cast5_avx_glue.c linux-4.14/arch/x86/crypto/cast5_avx_glue.c
1948 --- linux-4.14.orig/arch/x86/crypto/cast5_avx_glue.c 2018-09-05 11:03:20.000000000 +0200
1949 +++ linux-4.14/arch/x86/crypto/cast5_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
1951 static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1954 - bool fpu_enabled = false;
1956 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1957 const unsigned int bsize = CAST5_BLOCK_SIZE;
1958 unsigned int nbytes;
1960 u8 *wsrc = walk->src.virt.addr;
1961 u8 *wdst = walk->dst.virt.addr;
1963 - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1964 + fpu_enabled = cast5_fpu_begin(false, nbytes);
1966 /* Process multi-block batch */
1967 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1968 @@ -102,10 +102,9 @@
1969 } while (nbytes >= bsize);
1972 + cast5_fpu_end(fpu_enabled);
1973 err = blkcipher_walk_done(desc, walk, nbytes);
1976 - cast5_fpu_end(fpu_enabled);
1981 static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1982 struct scatterlist *src, unsigned int nbytes)
1984 - bool fpu_enabled = false;
1986 struct blkcipher_walk walk;
1989 @@ -235,12 +234,11 @@
1990 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1992 while ((nbytes = walk.nbytes)) {
1993 - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1994 + fpu_enabled = cast5_fpu_begin(false, nbytes);
1995 nbytes = __cbc_decrypt(desc, &walk);
1996 + cast5_fpu_end(fpu_enabled);
1997 err = blkcipher_walk_done(desc, &walk, nbytes);
2000 - cast5_fpu_end(fpu_enabled);
2005 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2006 struct scatterlist *src, unsigned int nbytes)
2008 - bool fpu_enabled = false;
2010 struct blkcipher_walk walk;
2013 @@ -318,13 +316,12 @@
2014 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2016 while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
2017 - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2018 + fpu_enabled = cast5_fpu_begin(false, nbytes);
2019 nbytes = __ctr_crypt(desc, &walk);
2020 + cast5_fpu_end(fpu_enabled);
2021 err = blkcipher_walk_done(desc, &walk, nbytes);
2024 - cast5_fpu_end(fpu_enabled);
2027 ctr_crypt_final(desc, &walk);
2028 err = blkcipher_walk_done(desc, &walk, 0);
2029 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/cast6_avx_glue.c linux-4.14/arch/x86/crypto/cast6_avx_glue.c
2030 --- linux-4.14.orig/arch/x86/crypto/cast6_avx_glue.c 2017-11-12 19:46:13.000000000 +0100
2031 +++ linux-4.14/arch/x86/crypto/cast6_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
2032 @@ -205,19 +205,33 @@
2036 +#ifdef CONFIG_PREEMPT_RT_FULL
2037 +static void cast6_fpu_end_rt(struct crypt_priv *ctx)
2039 + bool fpu_enabled = ctx->fpu_enabled;
2043 + cast6_fpu_end(fpu_enabled);
2044 + ctx->fpu_enabled = false;
2048 +static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
2051 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2053 const unsigned int bsize = CAST6_BLOCK_SIZE;
2054 struct crypt_priv *ctx = priv;
2057 - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2059 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
2060 + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2061 cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
2062 + cast6_fpu_end_rt(ctx);
2066 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2067 __cast6_encrypt(ctx->ctx, srcdst, srcdst);
2069 @@ -228,10 +242,10 @@
2070 struct crypt_priv *ctx = priv;
2073 - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2075 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
2076 + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2077 cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
2078 + cast6_fpu_end_rt(ctx);
2082 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/chacha20_glue.c linux-4.14/arch/x86/crypto/chacha20_glue.c
2083 --- linux-4.14.orig/arch/x86/crypto/chacha20_glue.c 2017-11-12 19:46:13.000000000 +0100
2084 +++ linux-4.14/arch/x86/crypto/chacha20_glue.c 2018-09-05 11:05:07.000000000 +0200
2087 crypto_chacha20_init(state, ctx, walk.iv);
2089 - kernel_fpu_begin();
2091 while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
2092 + kernel_fpu_begin();
2094 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
2095 rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
2097 err = skcipher_walk_done(&walk,
2098 walk.nbytes % CHACHA20_BLOCK_SIZE);
2102 + kernel_fpu_begin();
2103 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
2106 err = skcipher_walk_done(&walk, 0);
2114 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/glue_helper.c linux-4.14/arch/x86/crypto/glue_helper.c
2115 --- linux-4.14.orig/arch/x86/crypto/glue_helper.c 2017-11-12 19:46:13.000000000 +0100
2116 +++ linux-4.14/arch/x86/crypto/glue_helper.c 2018-09-05 11:05:07.000000000 +0200
2118 void *ctx = crypto_blkcipher_ctx(desc->tfm);
2119 const unsigned int bsize = 128 / 8;
2120 unsigned int nbytes, i, func_bytes;
2121 - bool fpu_enabled = false;
2125 err = blkcipher_walk_virt(desc, walk);
2127 u8 *wdst = walk->dst.virt.addr;
2129 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2130 - desc, fpu_enabled, nbytes);
2131 + desc, false, nbytes);
2133 for (i = 0; i < gctx->num_funcs; i++) {
2134 func_bytes = bsize * gctx->funcs[i].num_blocks;
2139 + glue_fpu_end(fpu_enabled);
2140 err = blkcipher_walk_done(desc, walk, nbytes);
2143 - glue_fpu_end(fpu_enabled);
2148 struct scatterlist *src, unsigned int nbytes)
2150 const unsigned int bsize = 128 / 8;
2151 - bool fpu_enabled = false;
2153 struct blkcipher_walk walk;
2156 @@ -201,12 +201,12 @@
2158 while ((nbytes = walk.nbytes)) {
2159 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2160 - desc, fpu_enabled, nbytes);
2161 + desc, false, nbytes);
2162 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
2163 + glue_fpu_end(fpu_enabled);
2164 err = blkcipher_walk_done(desc, &walk, nbytes);
2167 - glue_fpu_end(fpu_enabled);
2170 EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
2172 struct scatterlist *src, unsigned int nbytes)
2174 const unsigned int bsize = 128 / 8;
2175 - bool fpu_enabled = false;
2177 struct blkcipher_walk walk;
2180 @@ -284,13 +284,12 @@
2182 while ((nbytes = walk.nbytes) >= bsize) {
2183 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2184 - desc, fpu_enabled, nbytes);
2185 + desc, false, nbytes);
2186 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
2187 + glue_fpu_end(fpu_enabled);
2188 err = blkcipher_walk_done(desc, &walk, nbytes);
2191 - glue_fpu_end(fpu_enabled);
2194 glue_ctr_crypt_final_128bit(
2195 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
2197 void *tweak_ctx, void *crypt_ctx)
2199 const unsigned int bsize = 128 / 8;
2200 - bool fpu_enabled = false;
2202 struct blkcipher_walk walk;
2205 @@ -393,21 +392,21 @@
2207 /* set minimum length to bsize, for tweak_fn */
2208 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2209 - desc, fpu_enabled,
2211 nbytes < bsize ? bsize : nbytes);
2213 /* calculate first value of T */
2214 tweak_fn(tweak_ctx, walk.iv, walk.iv);
2215 + glue_fpu_end(fpu_enabled);
2218 + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2219 + desc, false, nbytes);
2220 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
2222 + glue_fpu_end(fpu_enabled);
2223 err = blkcipher_walk_done(desc, &walk, nbytes);
2224 nbytes = walk.nbytes;
2227 - glue_fpu_end(fpu_enabled);
2231 EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
2232 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/serpent_avx2_glue.c linux-4.14/arch/x86/crypto/serpent_avx2_glue.c
2233 --- linux-4.14.orig/arch/x86/crypto/serpent_avx2_glue.c 2017-11-12 19:46:13.000000000 +0100
2234 +++ linux-4.14/arch/x86/crypto/serpent_avx2_glue.c 2018-09-05 11:05:07.000000000 +0200
2235 @@ -184,6 +184,21 @@
2239 +#ifdef CONFIG_PREEMPT_RT_FULL
2240 +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
2242 + bool fpu_enabled = ctx->fpu_enabled;
2246 + serpent_fpu_end(fpu_enabled);
2247 + ctx->fpu_enabled = false;
2251 +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
2254 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2256 const unsigned int bsize = SERPENT_BLOCK_SIZE;
2257 @@ -199,10 +214,12 @@
2260 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
2261 + kernel_fpu_resched();
2262 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
2263 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
2264 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
2266 + serpent_fpu_end_rt(ctx);
2268 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2269 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
2270 @@ -223,10 +240,12 @@
2273 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
2274 + kernel_fpu_resched();
2275 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
2276 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
2277 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
2279 + serpent_fpu_end_rt(ctx);
2281 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2282 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
2283 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/serpent_avx_glue.c linux-4.14/arch/x86/crypto/serpent_avx_glue.c
2284 --- linux-4.14.orig/arch/x86/crypto/serpent_avx_glue.c 2017-11-12 19:46:13.000000000 +0100
2285 +++ linux-4.14/arch/x86/crypto/serpent_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
2286 @@ -218,16 +218,31 @@
2290 +#ifdef CONFIG_PREEMPT_RT_FULL
2291 +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
2293 + bool fpu_enabled = ctx->fpu_enabled;
2297 + serpent_fpu_end(fpu_enabled);
2298 + ctx->fpu_enabled = false;
2302 +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
2305 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2307 const unsigned int bsize = SERPENT_BLOCK_SIZE;
2308 struct crypt_priv *ctx = priv;
2311 - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2313 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2314 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2315 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
2316 + serpent_fpu_end_rt(ctx);
2320 @@ -241,10 +256,10 @@
2321 struct crypt_priv *ctx = priv;
2324 - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2326 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2327 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2328 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
2329 + serpent_fpu_end_rt(ctx);
2333 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/serpent_sse2_glue.c linux-4.14/arch/x86/crypto/serpent_sse2_glue.c
2334 --- linux-4.14.orig/arch/x86/crypto/serpent_sse2_glue.c 2017-11-12 19:46:13.000000000 +0100
2335 +++ linux-4.14/arch/x86/crypto/serpent_sse2_glue.c 2018-09-05 11:05:07.000000000 +0200
2336 @@ -187,16 +187,31 @@
2340 +#ifdef CONFIG_PREEMPT_RT_FULL
2341 +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
2343 + bool fpu_enabled = ctx->fpu_enabled;
2347 + serpent_fpu_end(fpu_enabled);
2348 + ctx->fpu_enabled = false;
2352 +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
2355 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2357 const unsigned int bsize = SERPENT_BLOCK_SIZE;
2358 struct crypt_priv *ctx = priv;
2361 - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2363 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2364 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2365 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
2366 + serpent_fpu_end_rt(ctx);
2370 @@ -210,10 +225,10 @@
2371 struct crypt_priv *ctx = priv;
2374 - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2376 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2377 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2378 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
2379 + serpent_fpu_end_rt(ctx);
2383 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/twofish_avx_glue.c linux-4.14/arch/x86/crypto/twofish_avx_glue.c
2384 --- linux-4.14.orig/arch/x86/crypto/twofish_avx_glue.c 2017-11-12 19:46:13.000000000 +0100
2385 +++ linux-4.14/arch/x86/crypto/twofish_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
2386 @@ -218,6 +218,21 @@
2390 +#ifdef CONFIG_PREEMPT_RT_FULL
2391 +static void twofish_fpu_end_rt(struct crypt_priv *ctx)
2393 + bool fpu_enabled = ctx->fpu_enabled;
2397 + twofish_fpu_end(fpu_enabled);
2398 + ctx->fpu_enabled = false;
2402 +static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
2405 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2407 const unsigned int bsize = TF_BLOCK_SIZE;
2408 @@ -228,12 +243,16 @@
2410 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
2411 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
2412 + twofish_fpu_end_rt(ctx);
2416 - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
2417 + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
2418 + kernel_fpu_resched();
2419 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
2422 + twofish_fpu_end_rt(ctx);
2423 nbytes %= bsize * 3;
2425 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2426 @@ -250,11 +269,15 @@
2428 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
2429 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
2430 + twofish_fpu_end_rt(ctx);
2434 - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
2435 + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
2436 + kernel_fpu_resched();
2437 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
2439 + twofish_fpu_end_rt(ctx);
2441 nbytes %= bsize * 3;
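The serpent/camellia/twofish callbacks above take a slightly different approach from the aesni loops earlier: the FPU section opened by the (elided) *_fpu_begin() call stays open across the callback, but kernel_fpu_resched() is invoked at each parallel-block boundary so an RT kernel gets a chance to reschedule, and *_fpu_end_rt() closes the section before the scalar tail loop. A sketch of the twofish encrypt callback with the elided braces and return filled in; treat it as illustrative rather than a verbatim quote:

	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
		twofish_fpu_end_rt(ctx);
		return;
	}

	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
		kernel_fpu_resched();	/* drop and re-take the FPU if a resched is due */
		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
	}
	twofish_fpu_end_rt(ctx);

	nbytes %= bsize * 3;

	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
		twofish_enc_blk(ctx->ctx, srcdst, srcdst);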
2443 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/entry/common.c linux-4.14/arch/x86/entry/common.c
2444 --- linux-4.14.orig/arch/x86/entry/common.c 2018-09-05 11:03:20.000000000 +0200
2445 +++ linux-4.14/arch/x86/entry/common.c 2018-09-05 11:05:07.000000000 +0200
2448 #define EXIT_TO_USERMODE_LOOP_FLAGS \
2449 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
2450 - _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
2451 + _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
2453 static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2455 @@ -148,9 +148,16 @@
2456 /* We have work to do. */
2459 - if (cached_flags & _TIF_NEED_RESCHED)
2460 + if (cached_flags & _TIF_NEED_RESCHED_MASK)
2463 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2464 + if (unlikely(current->forced_info.si_signo)) {
2465 + struct task_struct *t = current;
2466 + force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2467 + t->forced_info.si_signo = 0;
2470 if (cached_flags & _TIF_UPROBE)
2471 uprobe_notify_resume(regs);
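Reading the common.c hunk with its elided lines filled in, the work loop now reacts to either reschedule flag and, on RT, delivers a signal that force_sig_info() had to postpone (see the ARCH_RT_DELAYS_SIGNAL_SEND comment added to asm/signal.h below) while it is safe to take sleeping locks. A sketch of the affected part of exit_to_usermode_loop():

	if (cached_flags & _TIF_NEED_RESCHED_MASK)
		schedule();

#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
	if (unlikely(current->forced_info.si_signo)) {
		struct task_struct *t = current;

		force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
		t->forced_info.si_signo = 0;
	}
#endif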
2473 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/entry/entry_32.S linux-4.14/arch/x86/entry/entry_32.S
2474 --- linux-4.14.orig/arch/x86/entry/entry_32.S 2018-09-05 11:03:20.000000000 +0200
2475 +++ linux-4.14/arch/x86/entry/entry_32.S 2018-09-05 11:05:07.000000000 +0200
2476 @@ -350,8 +350,25 @@
2477 ENTRY(resume_kernel)
2478 DISABLE_INTERRUPTS(CLBR_ANY)
2480 + # preempt count == 0 + NEED_RESCHED set?
2481 cmpl $0, PER_CPU_VAR(__preempt_count)
2482 +#ifndef CONFIG_PREEMPT_LAZY
2487 + # at least preempt count == 0 ?
2488 + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2491 + movl PER_CPU_VAR(current_task), %ebp
2492 + cmpl $0,TASK_TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
2495 + testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
2499 testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2501 call preempt_schedule_irq
2502 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/entry/entry_64.S linux-4.14/arch/x86/entry/entry_64.S
2503 --- linux-4.14.orig/arch/x86/entry/entry_64.S 2018-09-05 11:03:20.000000000 +0200
2504 +++ linux-4.14/arch/x86/entry/entry_64.S 2018-09-05 11:05:07.000000000 +0200
2505 @@ -633,7 +633,23 @@
2506 bt $9, EFLAGS(%rsp) /* were interrupts off? */
2508 0: cmpl $0, PER_CPU_VAR(__preempt_count)
2509 +#ifndef CONFIG_PREEMPT_LAZY
2512 + jz do_preempt_schedule_irq
2514 + # at least preempt count == 0 ?
2515 + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2518 + movq PER_CPU_VAR(current_task), %rcx
2519 + cmpl $0, TASK_TI_preempt_lazy_count(%rcx)
2522 + bt $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
2524 +do_preempt_schedule_irq:
2526 call preempt_schedule_irq
2529 @@ -988,6 +1004,7 @@
2533 +#ifndef CONFIG_PREEMPT_RT_FULL
2534 /* Call softirq on interrupt stack. Interrupts are off. */
2535 ENTRY(do_softirq_own_stack)
2537 @@ -998,6 +1015,7 @@
2540 ENDPROC(do_softirq_own_stack)
2544 idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2545 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/fpu/api.h linux-4.14/arch/x86/include/asm/fpu/api.h
2546 --- linux-4.14.orig/arch/x86/include/asm/fpu/api.h 2017-11-12 19:46:13.000000000 +0100
2547 +++ linux-4.14/arch/x86/include/asm/fpu/api.h 2018-09-05 11:05:07.000000000 +0200
2549 extern void __kernel_fpu_end(void);
2550 extern void kernel_fpu_begin(void);
2551 extern void kernel_fpu_end(void);
2552 +extern void kernel_fpu_resched(void);
2553 extern bool irq_fpu_usable(void);
2556 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/preempt.h linux-4.14/arch/x86/include/asm/preempt.h
2557 --- linux-4.14.orig/arch/x86/include/asm/preempt.h 2017-11-12 19:46:13.000000000 +0100
2558 +++ linux-4.14/arch/x86/include/asm/preempt.h 2018-09-05 11:05:07.000000000 +0200
2560 * a decrement which hits zero means we have no preempt_count and should
2563 -static __always_inline bool __preempt_count_dec_and_test(void)
2564 +static __always_inline bool ____preempt_count_dec_and_test(void)
2566 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2569 +static __always_inline bool __preempt_count_dec_and_test(void)
2571 + if (____preempt_count_dec_and_test())
2573 +#ifdef CONFIG_PREEMPT_LAZY
2574 + if (current_thread_info()->preempt_lazy_count)
2576 + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2583 * Returns true when we need to resched and can (barring IRQ state).
2585 static __always_inline bool should_resched(int preempt_offset)
2587 +#ifdef CONFIG_PREEMPT_LAZY
2590 + tmp = raw_cpu_read_4(__preempt_count);
2591 + if (tmp == preempt_offset)
2594 + /* preempt count == 0 ? */
2595 + tmp &= ~PREEMPT_NEED_RESCHED;
2598 + if (current_thread_info()->preempt_lazy_count)
2600 + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2602 return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2606 #ifdef CONFIG_PREEMPT
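The two preempt.h hunks are shown with several context lines elided; written out, the intent is a two-step test, the same one the entry_32.S/entry_64.S changes above perform in assembly: preempt only if the regular preempt count says so, or, under PREEMPT_LAZY, if no lazy-preempt section is active and TIF_NEED_RESCHED_LAZY is set. A sketch of both helpers under that reading, not a verbatim quote of the patched header:

static __always_inline bool __preempt_count_dec_and_test(void)
{
	if (____preempt_count_dec_and_test())
		return true;
#ifdef CONFIG_PREEMPT_LAZY
	if (current_thread_info()->preempt_lazy_count)
		return false;
	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
#else
	return false;
#endif
}

static __always_inline bool should_resched(int preempt_offset)
{
#ifdef CONFIG_PREEMPT_LAZY
	u32 tmp = raw_cpu_read_4(__preempt_count);

	if (tmp == preempt_offset)
		return true;

	/* preempt count == 0 ? */
	tmp &= ~PREEMPT_NEED_RESCHED;
	if (tmp)
		return false;
	if (current_thread_info()->preempt_lazy_count)
		return false;
	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
#else
	return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
#endif
}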
2607 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/signal.h linux-4.14/arch/x86/include/asm/signal.h
2608 --- linux-4.14.orig/arch/x86/include/asm/signal.h 2017-11-12 19:46:13.000000000 +0100
2609 +++ linux-4.14/arch/x86/include/asm/signal.h 2018-09-05 11:05:07.000000000 +0200
2611 #define SA_IA32_ABI 0x02000000u
2612 #define SA_X32_ABI 0x01000000u
2615 + * Because some traps use the IST stack, we must keep preemption
2616 + * disabled while calling do_trap(), but do_trap() may call
2617 + * force_sig_info() which will grab the signal spin_locks for the
2618 + * task, which in PREEMPT_RT_FULL are mutexes. By defining
2619 + * ARCH_RT_DELAYS_SIGNAL_SEND, force_sig_info() will set
2620 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2623 +#if defined(CONFIG_PREEMPT_RT_FULL)
2624 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2627 #ifndef CONFIG_COMPAT
2628 typedef sigset_t compat_sigset_t;
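The comment only describes the sending side in prose; the code that implements it lives in kernel/signal.c and is not quoted in this excerpt. The idea it describes is roughly this: when force_sig_info() runs in a context that must not take the (on RT mutex-based) sighand locks, it stashes the siginfo on the task and lets the exit_to_usermode_loop() hunk shown earlier deliver it. The fragment below is only an illustration of that idea; the in_atomic() test and the exact placement are assumptions, only forced_info and TIF_NOTIFY_RESUME come from the comment itself:

	/* sketch: inside force_sig_info(), with info and t in scope */
#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
	if (in_atomic()) {
		/* cannot take sleeping locks here; defer to return-to-user */
		t->forced_info = *info;
		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
		return 0;
	}
#endif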
2630 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/stackprotector.h linux-4.14/arch/x86/include/asm/stackprotector.h
2631 --- linux-4.14.orig/arch/x86/include/asm/stackprotector.h 2017-11-12 19:46:13.000000000 +0100
2632 +++ linux-4.14/arch/x86/include/asm/stackprotector.h 2018-09-05 11:05:07.000000000 +0200
2635 static __always_inline void boot_init_stack_canary(void)
2638 + u64 uninitialized_var(canary);
2641 #ifdef CONFIG_X86_64
2643 * of randomness. The TSC only matters for very early init,
2644 * there it already has some randomness on most systems. Later
2645 * on during the bootup the random pool has true entropy too.
2646 + * For preempt-rt we need to weaken the randomness a bit, as
2647 + * we can't call into the random generator from atomic context
2648 + * due to locking constraints. We just leave the canary
2649 + * uninitialized and use the TSC based randomness on top of it.
2651 +#ifndef CONFIG_PREEMPT_RT_FULL
2652 get_random_bytes(&canary, sizeof(canary));
2655 canary += tsc + (tsc << 32UL);
2656 canary &= CANARY_MASK;
2657 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/thread_info.h linux-4.14/arch/x86/include/asm/thread_info.h
2658 --- linux-4.14.orig/arch/x86/include/asm/thread_info.h 2018-09-05 11:03:20.000000000 +0200
2659 +++ linux-4.14/arch/x86/include/asm/thread_info.h 2018-09-05 11:05:07.000000000 +0200
2661 struct thread_info {
2662 unsigned long flags; /* low level flags */
2663 u32 status; /* thread synchronous flags */
2664 + int preempt_lazy_count; /* 0 => lazy preemptable
2668 #define INIT_THREAD_INFO(tsk) \
2671 + .preempt_lazy_count = 0, \
2674 #define init_stack (init_thread_union.stack)
2677 #include <asm/asm-offsets.h>
2679 +#define GET_THREAD_INFO(reg) \
2680 + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2681 + _ASM_SUB $(THREAD_SIZE),reg ;
2687 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
2688 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
2689 #define TIF_SECCOMP 8 /* secure computing */
2690 +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
2691 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
2692 #define TIF_UPROBE 12 /* breakpointed or singlestepping */
2693 #define TIF_PATCH_PENDING 13 /* pending live patching update */
2695 #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
2696 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
2697 #define _TIF_SECCOMP (1 << TIF_SECCOMP)
2698 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2699 #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
2700 #define _TIF_UPROBE (1 << TIF_UPROBE)
2701 #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
2703 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2704 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2706 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2708 #define STACK_WARN (THREAD_SIZE/8)
2711 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/uv/uv_bau.h linux-4.14/arch/x86/include/asm/uv/uv_bau.h
2712 --- linux-4.14.orig/arch/x86/include/asm/uv/uv_bau.h 2017-11-12 19:46:13.000000000 +0100
2713 +++ linux-4.14/arch/x86/include/asm/uv/uv_bau.h 2018-09-05 11:05:07.000000000 +0200
2715 cycles_t send_message;
2716 cycles_t period_end;
2717 cycles_t period_time;
2718 - spinlock_t uvhub_lock;
2719 - spinlock_t queue_lock;
2720 - spinlock_t disable_lock;
2721 + raw_spinlock_t uvhub_lock;
2722 + raw_spinlock_t queue_lock;
2723 + raw_spinlock_t disable_lock;
2726 int max_concurr_const;
2727 @@ -847,15 +847,15 @@
2728 * to be lowered below the current 'v'. atomic_add_unless can only stop
2731 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2732 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2735 + raw_spin_lock(lock);
2736 if (atomic_read(v) >= u) {
2737 - spin_unlock(lock);
2738 + raw_spin_unlock(lock);
2742 - spin_unlock(lock);
2743 + raw_spin_unlock(lock);
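With the elided lines restored, the converted helper is the old one with the raw lock API substituted one-for-one; a sketch:

static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
{
	raw_spin_lock(lock);
	if (atomic_read(v) >= u) {
		raw_spin_unlock(lock);
		return 0;
	}
	atomic_inc(v);
	raw_spin_unlock(lock);
	return 1;
}

A raw_spinlock_t stays a real spinning lock on PREEMPT_RT_FULL, which is what these short, non-sleeping critical sections need.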
2747 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/Kconfig linux-4.14/arch/x86/Kconfig
2748 --- linux-4.14.orig/arch/x86/Kconfig 2018-09-05 11:03:20.000000000 +0200
2749 +++ linux-4.14/arch/x86/Kconfig 2018-09-05 11:05:07.000000000 +0200
2751 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
2752 select HAVE_PERF_REGS
2753 select HAVE_PERF_USER_STACK_DUMP
2754 + select HAVE_PREEMPT_LAZY
2755 select HAVE_RCU_TABLE_FREE
2756 select HAVE_REGS_AND_STACK_ACCESS_API
2757 select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
2758 @@ -256,8 +257,11 @@
2760 depends on ISA_DMA_API
2762 +config RWSEM_GENERIC_SPINLOCK
2763 + def_bool PREEMPT_RT_FULL
2765 config RWSEM_XCHGADD_ALGORITHM
2767 + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2769 config GENERIC_CALIBRATE_DELAY
2773 bool "Enable Maximum number of SMP Processors and NUMA Nodes"
2774 depends on X86_64 && SMP && DEBUG_KERNEL
2775 - select CPUMASK_OFFSTACK
2776 + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
2778 Enable maximum number of CPUS and NUMA Nodes for this architecture.
2780 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/apic/io_apic.c linux-4.14/arch/x86/kernel/apic/io_apic.c
2781 --- linux-4.14.orig/arch/x86/kernel/apic/io_apic.c 2018-09-05 11:03:20.000000000 +0200
2782 +++ linux-4.14/arch/x86/kernel/apic/io_apic.c 2018-09-05 11:05:07.000000000 +0200
2783 @@ -1691,7 +1691,8 @@
2784 static inline bool ioapic_irqd_mask(struct irq_data *data)
2786 /* If we are moving the irq we need to mask it */
2787 - if (unlikely(irqd_is_setaffinity_pending(data))) {
2788 + if (unlikely(irqd_is_setaffinity_pending(data) &&
2789 + !irqd_irq_inprogress(data))) {
2790 mask_ioapic_irq(data);
2793 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/asm-offsets.c linux-4.14/arch/x86/kernel/asm-offsets.c
2794 --- linux-4.14.orig/arch/x86/kernel/asm-offsets.c 2018-09-05 11:03:20.000000000 +0200
2795 +++ linux-4.14/arch/x86/kernel/asm-offsets.c 2018-09-05 11:05:07.000000000 +0200
2799 OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2800 + OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2801 OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2807 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2808 + DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2810 /* TLB state for the entry code */
2811 OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
2812 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/cpu/mcheck/dev-mcelog.c linux-4.14/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
2813 --- linux-4.14.orig/arch/x86/kernel/cpu/mcheck/dev-mcelog.c 2017-11-12 19:46:13.000000000 +0100
2814 +++ linux-4.14/arch/x86/kernel/cpu/mcheck/dev-mcelog.c 2018-09-05 11:05:07.000000000 +0200
2816 #include <linux/slab.h>
2817 #include <linux/kmod.h>
2818 #include <linux/poll.h>
2819 +#include <linux/swork.h>
2821 #include "mce-internal.h"
2825 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2828 -void mce_work_trigger(void)
2829 +static void __mce_work_trigger(struct swork_event *event)
2832 schedule_work(&mce_trigger_work);
2835 +#ifdef CONFIG_PREEMPT_RT_FULL
2836 +static bool notify_work_ready __read_mostly;
2837 +static struct swork_event notify_work;
2839 +static int mce_notify_work_init(void)
2843 + err = swork_get();
2847 + INIT_SWORK(&notify_work, __mce_work_trigger);
2848 + notify_work_ready = true;
2852 +void mce_work_trigger(void)
2854 + if (notify_work_ready)
2855 + swork_queue(&notify_work);
2859 +void mce_work_trigger(void)
2861 + __mce_work_trigger(NULL);
2863 +static inline int mce_notify_work_init(void) { return 0; }
2867 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2874 + mce_notify_work_init();
2875 mce_register_decode_chain(&dev_mcelog_nb);
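Pieced together with the elided lines, the dev-mcelog change splits the trigger in two: the schedule_work() call moves into __mce_work_trigger(), and on PREEMPT_RT_FULL mce_work_trigger() only queues a swork item (the simple-work mechanism added elsewhere in this patch), which later runs __mce_work_trigger() from process context. A sketch of the two build variants, assuming the elided error path simply propagates swork_get()'s return value:

#ifdef CONFIG_PREEMPT_RT_FULL
static bool notify_work_ready __read_mostly;
static struct swork_event notify_work;

static int mce_notify_work_init(void)
{
	int err = swork_get();

	if (err)
		return err;

	INIT_SWORK(&notify_work, __mce_work_trigger);
	notify_work_ready = true;
	return 0;
}

void mce_work_trigger(void)
{
	if (notify_work_ready)
		swork_queue(&notify_work);
}
#else
void mce_work_trigger(void)
{
	__mce_work_trigger(NULL);
}
static inline int mce_notify_work_init(void) { return 0; }
#endif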
2878 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/cpu/mcheck/mce.c linux-4.14/arch/x86/kernel/cpu/mcheck/mce.c
2879 --- linux-4.14.orig/arch/x86/kernel/cpu/mcheck/mce.c 2018-09-05 11:03:20.000000000 +0200
2880 +++ linux-4.14/arch/x86/kernel/cpu/mcheck/mce.c 2018-09-05 11:05:07.000000000 +0200
2882 #include <linux/debugfs.h>
2883 #include <linux/irq_work.h>
2884 #include <linux/export.h>
2885 +#include <linux/jiffies.h>
2886 #include <linux/jump_label.h>
2888 #include <asm/intel-family.h>
2889 @@ -1365,7 +1366,7 @@
2890 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2892 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2893 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2894 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2896 static unsigned long mce_adjust_timer_default(unsigned long interval)
2898 @@ -1374,27 +1375,19 @@
2900 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2902 -static void __start_timer(struct timer_list *t, unsigned long interval)
2903 +static void __start_timer(struct hrtimer *t, unsigned long iv)
2905 - unsigned long when = jiffies + interval;
2906 - unsigned long flags;
2908 - local_irq_save(flags);
2910 - if (!timer_pending(t) || time_before(when, t->expires))
2911 - mod_timer(t, round_jiffies(when));
2915 - local_irq_restore(flags);
2916 + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2917 + 0, HRTIMER_MODE_REL_PINNED);
2920 -static void mce_timer_fn(unsigned long data)
2921 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2923 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2924 - int cpu = smp_processor_id();
2927 - WARN_ON(cpu != data);
2929 iv = __this_cpu_read(mce_next_interval);
2931 if (mce_available(this_cpu_ptr(&cpu_info))) {
2932 @@ -1417,7 +1410,11 @@
2935 __this_cpu_write(mce_next_interval, iv);
2936 - __start_timer(t, iv);
2938 + return HRTIMER_NORESTART;
2940 + hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(iv)));
2941 + return HRTIMER_RESTART;
2945 @@ -1425,7 +1422,7 @@
2947 void mce_timer_kick(unsigned long interval)
2949 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2950 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
2951 unsigned long iv = __this_cpu_read(mce_next_interval);
2953 __start_timer(t, interval);
2954 @@ -1440,7 +1437,7 @@
2957 for_each_online_cpu(cpu)
2958 - del_timer_sync(&per_cpu(mce_timer, cpu));
2959 + hrtimer_cancel(&per_cpu(mce_timer, cpu));
2963 @@ -1769,7 +1766,7 @@
2967 -static void mce_start_timer(struct timer_list *t)
2968 +static void mce_start_timer(struct hrtimer *t)
2970 unsigned long iv = check_interval * HZ;
2972 @@ -1782,18 +1779,19 @@
2974 static void __mcheck_cpu_setup_timer(void)
2976 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2977 - unsigned int cpu = smp_processor_id();
2978 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
2980 - setup_pinned_timer(t, mce_timer_fn, cpu);
2981 + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2982 + t->function = mce_timer_fn;
2985 static void __mcheck_cpu_init_timer(void)
2987 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2988 - unsigned int cpu = smp_processor_id();
2989 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
2991 + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2992 + t->function = mce_timer_fn;
2994 - setup_pinned_timer(t, mce_timer_fn, cpu);
2998 @@ -2309,7 +2307,7 @@
3000 static int mce_cpu_online(unsigned int cpu)
3002 - struct timer_list *t = this_cpu_ptr(&mce_timer);
3003 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
3006 mce_device_create(cpu);
3007 @@ -2326,10 +2324,10 @@
3009 static int mce_cpu_pre_down(unsigned int cpu)
3011 - struct timer_list *t = this_cpu_ptr(&mce_timer);
3012 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
3015 - del_timer_sync(t);
3016 + hrtimer_cancel(t);
3017 mce_threshold_remove_device(cpu);
3018 mce_device_remove(cpu);
3020 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/fpu/core.c linux-4.14/arch/x86/kernel/fpu/core.c
3021 --- linux-4.14.orig/arch/x86/kernel/fpu/core.c 2018-09-05 11:03:20.000000000 +0200
3022 +++ linux-4.14/arch/x86/kernel/fpu/core.c 2018-09-05 11:05:07.000000000 +0200
3023 @@ -138,6 +138,18 @@
3025 EXPORT_SYMBOL_GPL(kernel_fpu_end);
3027 +void kernel_fpu_resched(void)
3029 + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
3031 + if (should_resched(PREEMPT_OFFSET)) {
3034 + kernel_fpu_begin();
3037 +EXPORT_SYMBOL_GPL(kernel_fpu_resched);
3040 * Save the FPU state (mark it for reload if necessary):
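kernel_fpu_resched() is shown with its middle elided; what the crypto glue code above relies on is that it drops the FPU, gives the scheduler a chance to run, and takes the FPU again, so callers can keep one logical FPU section while still offering preemption points. A sketch of the complete function under that assumption:

void kernel_fpu_resched(void)
{
	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

	if (should_resched(PREEMPT_OFFSET)) {
		kernel_fpu_end();
		cond_resched();
		kernel_fpu_begin();
	}
}
EXPORT_SYMBOL_GPL(kernel_fpu_resched);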
3042 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/irq_32.c linux-4.14/arch/x86/kernel/irq_32.c
3043 --- linux-4.14.orig/arch/x86/kernel/irq_32.c 2018-09-05 11:03:20.000000000 +0200
3044 +++ linux-4.14/arch/x86/kernel/irq_32.c 2018-09-05 11:05:07.000000000 +0200
3046 cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
3049 +#ifndef CONFIG_PREEMPT_RT_FULL
3050 void do_softirq_own_stack(void)
3052 struct irq_stack *irqstk;
3055 call_on_stack(__do_softirq, isp);
3059 bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
3061 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/process_32.c linux-4.14/arch/x86/kernel/process_32.c
3062 --- linux-4.14.orig/arch/x86/kernel/process_32.c 2018-09-05 11:03:20.000000000 +0200
3063 +++ linux-4.14/arch/x86/kernel/process_32.c 2018-09-05 11:05:07.000000000 +0200
3065 #include <linux/io.h>
3066 #include <linux/kdebug.h>
3067 #include <linux/syscalls.h>
3068 +#include <linux/highmem.h>
3070 #include <asm/pgtable.h>
3071 #include <asm/ldt.h>
3072 @@ -198,6 +199,35 @@
3074 EXPORT_SYMBOL_GPL(start_thread);
3076 +#ifdef CONFIG_PREEMPT_RT_FULL
3077 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
3082 + * Clear @prev's kmap_atomic mappings
3084 + for (i = 0; i < prev_p->kmap_idx; i++) {
3085 + int idx = i + KM_TYPE_NR * smp_processor_id();
3086 + pte_t *ptep = kmap_pte - idx;
3088 + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
3091 + * Restore @next_p's kmap_atomic mappings
3093 + for (i = 0; i < next_p->kmap_idx; i++) {
3094 + int idx = i + KM_TYPE_NR * smp_processor_id();
3096 + if (!pte_none(next_p->kmap_pte[i]))
3097 + set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
3102 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3107 * switch_to(x,y) should switch tasks from x to y.
3109 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
3110 __switch_to_xtra(prev_p, next_p, tss);
3112 + switch_kmaps(prev_p, next_p);
3115 * Leave lazy mode, flushing any hypercalls made here.
3116 * This must be done before restoring TLS segments so
3117 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kvm/lapic.c linux-4.14/arch/x86/kvm/lapic.c
3118 --- linux-4.14.orig/arch/x86/kvm/lapic.c 2018-09-05 11:03:20.000000000 +0200
3119 +++ linux-4.14/arch/x86/kvm/lapic.c 2018-09-05 11:05:07.000000000 +0200
3120 @@ -2120,7 +2120,7 @@
3123 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
3124 - HRTIMER_MODE_ABS_PINNED);
3125 + HRTIMER_MODE_ABS_PINNED_HARD);
3126 apic->lapic_timer.timer.function = apic_timer_fn;
3129 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kvm/x86.c linux-4.14/arch/x86/kvm/x86.c
3130 --- linux-4.14.orig/arch/x86/kvm/x86.c 2018-09-05 11:03:20.000000000 +0200
3131 +++ linux-4.14/arch/x86/kvm/x86.c 2018-09-05 11:05:07.000000000 +0200
3132 @@ -6285,6 +6285,13 @@
3136 +#ifdef CONFIG_PREEMPT_RT_FULL
3137 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3138 + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
3139 + return -EOPNOTSUPP;
3143 r = kvm_mmu_module_init();
3145 goto out_free_percpu;
3146 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/mm/highmem_32.c linux-4.14/arch/x86/mm/highmem_32.c
3147 --- linux-4.14.orig/arch/x86/mm/highmem_32.c 2017-11-12 19:46:13.000000000 +0100
3148 +++ linux-4.14/arch/x86/mm/highmem_32.c 2018-09-05 11:05:07.000000000 +0200
3151 void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3153 + pte_t pte = mk_pte(page, prot);
3154 unsigned long vaddr;
3157 - preempt_disable();
3158 + preempt_disable_nort();
3159 pagefault_disable();
3161 if (!PageHighMem(page))
3163 idx = type + KM_TYPE_NR*smp_processor_id();
3164 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3165 BUG_ON(!pte_none(*(kmap_pte-idx)));
3166 - set_pte(kmap_pte-idx, mk_pte(page, prot));
3167 +#ifdef CONFIG_PREEMPT_RT_FULL
3168 + current->kmap_pte[type] = pte;
3170 + set_pte(kmap_pte-idx, pte);
3171 arch_flush_lazy_mmu_mode();
3173 return (void *)vaddr;
3175 * is a bad idea also, in case the page changes cacheability
3176 * attributes or becomes a protected page in a hypervisor.
3178 +#ifdef CONFIG_PREEMPT_RT_FULL
3179 + current->kmap_pte[type] = __pte(0);
3181 kpte_clear_flush(kmap_pte-idx, vaddr);
3182 kmap_atomic_idx_pop();
3183 arch_flush_lazy_mmu_mode();
3189 + preempt_enable_nort();
3191 EXPORT_SYMBOL(__kunmap_atomic);
3193 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/mm/iomap_32.c linux-4.14/arch/x86/mm/iomap_32.c
3194 --- linux-4.14.orig/arch/x86/mm/iomap_32.c 2017-11-12 19:46:13.000000000 +0100
3195 +++ linux-4.14/arch/x86/mm/iomap_32.c 2018-09-05 11:05:07.000000000 +0200
3198 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3200 + pte_t pte = pfn_pte(pfn, prot);
3201 unsigned long vaddr;
3205 type = kmap_atomic_idx_push();
3206 idx = type + KM_TYPE_NR * smp_processor_id();
3207 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3208 - set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
3209 + WARN_ON(!pte_none(*(kmap_pte - idx)));
3211 +#ifdef CONFIG_PREEMPT_RT_FULL
3212 + current->kmap_pte[type] = pte;
3214 + set_pte(kmap_pte - idx, pte);
3215 arch_flush_lazy_mmu_mode();
3217 return (void *)vaddr;
3219 * is a bad idea also, in case the page changes cacheability
3220 * attributes or becomes a protected page in a hypervisor.
3222 +#ifdef CONFIG_PREEMPT_RT_FULL
3223 + current->kmap_pte[type] = __pte(0);
3225 kpte_clear_flush(kmap_pte-idx, vaddr);
3226 kmap_atomic_idx_pop();
3228 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/platform/uv/tlb_uv.c linux-4.14/arch/x86/platform/uv/tlb_uv.c
3229 --- linux-4.14.orig/arch/x86/platform/uv/tlb_uv.c 2018-09-05 11:03:20.000000000 +0200
3230 +++ linux-4.14/arch/x86/platform/uv/tlb_uv.c 2018-09-05 11:05:07.000000000 +0200
3233 quiesce_local_uvhub(hmaster);
3235 - spin_lock(&hmaster->queue_lock);
3236 + raw_spin_lock(&hmaster->queue_lock);
3237 reset_with_ipi(&bau_desc->distribution, bcp);
3238 - spin_unlock(&hmaster->queue_lock);
3239 + raw_spin_unlock(&hmaster->queue_lock);
3241 end_uvhub_quiesce(hmaster);
3245 quiesce_local_uvhub(hmaster);
3247 - spin_lock(&hmaster->queue_lock);
3248 + raw_spin_lock(&hmaster->queue_lock);
3249 reset_with_ipi(&bau_desc->distribution, bcp);
3250 - spin_unlock(&hmaster->queue_lock);
3251 + raw_spin_unlock(&hmaster->queue_lock);
3253 end_uvhub_quiesce(hmaster);
3258 hmaster = bcp->uvhub_master;
3259 - spin_lock(&hmaster->disable_lock);
3260 + raw_spin_lock(&hmaster->disable_lock);
3261 if (!bcp->baudisabled) {
3262 stat->s_bau_disabled++;
3268 - spin_unlock(&hmaster->disable_lock);
3269 + raw_spin_unlock(&hmaster->disable_lock);
3272 static void count_max_concurr(int stat, struct bau_control *bcp,
3275 static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
3277 - spinlock_t *lock = &hmaster->uvhub_lock;
3278 + raw_spinlock_t *lock = &hmaster->uvhub_lock;
3281 v = &hmaster->active_descriptor_count;
3283 struct bau_control *hmaster;
3285 hmaster = bcp->uvhub_master;
3286 - spin_lock(&hmaster->disable_lock);
3287 + raw_spin_lock(&hmaster->disable_lock);
3288 if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
3289 stat->s_bau_reenabled++;
3290 for_each_present_cpu(tcpu) {
3291 @@ -1007,10 +1007,10 @@
3292 tbcp->period_giveups = 0;
3295 - spin_unlock(&hmaster->disable_lock);
3296 + raw_spin_unlock(&hmaster->disable_lock);
3299 - spin_unlock(&hmaster->disable_lock);
3300 + raw_spin_unlock(&hmaster->disable_lock);
3304 @@ -1942,9 +1942,9 @@
3305 bcp->cong_reps = congested_reps;
3306 bcp->disabled_period = sec_2_cycles(disabled_period);
3307 bcp->giveup_limit = giveup_limit;
3308 - spin_lock_init(&bcp->queue_lock);
3309 - spin_lock_init(&bcp->uvhub_lock);
3310 - spin_lock_init(&bcp->disable_lock);
3311 + raw_spin_lock_init(&bcp->queue_lock);
3312 + raw_spin_lock_init(&bcp->uvhub_lock);
3313 + raw_spin_lock_init(&bcp->disable_lock);
3317 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/platform/uv/uv_time.c linux-4.14/arch/x86/platform/uv/uv_time.c
3318 --- linux-4.14.orig/arch/x86/platform/uv/uv_time.c 2017-11-12 19:46:13.000000000 +0100
3319 +++ linux-4.14/arch/x86/platform/uv/uv_time.c 2018-09-05 11:05:07.000000000 +0200
3322 /* There is one of these allocated per node */
3323 struct uv_rtc_timer_head {
3325 + raw_spinlock_t lock;
3326 /* next cpu waiting for timer, local node relative: */
3328 /* number of cpus on this node: */
3330 uv_rtc_deallocate_timers();
3333 - spin_lock_init(&head->lock);
3334 + raw_spin_lock_init(&head->lock);
3335 head->ncpus = uv_blade_nr_possible_cpus(bid);
3336 head->next_cpu = -1;
3337 blade_info[bid] = head;
3339 unsigned long flags;
3342 - spin_lock_irqsave(&head->lock, flags);
3343 + raw_spin_lock_irqsave(&head->lock, flags);
3345 next_cpu = head->next_cpu;
3347 @@ -243,12 +243,12 @@
3348 if (uv_setup_intr(cpu, expires)) {
3350 uv_rtc_find_next_timer(head, pnode);
3351 - spin_unlock_irqrestore(&head->lock, flags);
3352 + raw_spin_unlock_irqrestore(&head->lock, flags);
3357 - spin_unlock_irqrestore(&head->lock, flags);
3358 + raw_spin_unlock_irqrestore(&head->lock, flags);
3363 unsigned long flags;
3366 - spin_lock_irqsave(&head->lock, flags);
3367 + raw_spin_lock_irqsave(&head->lock, flags);
3369 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
3372 uv_rtc_find_next_timer(head, pnode);
3375 - spin_unlock_irqrestore(&head->lock, flags);
3376 + raw_spin_unlock_irqrestore(&head->lock, flags);
3380 @@ -299,13 +299,17 @@
3381 static u64 uv_read_rtc(struct clocksource *cs)
3383 unsigned long offset;
3386 + preempt_disable();
3387 if (uv_get_min_hub_revision_id() == 1)
3390 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
3392 - return (u64)uv_read_local_mmr(UVH_RTC | offset);
3393 + cycles = (u64)uv_read_local_mmr(UVH_RTC | offset);
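With the elided lines restored, uv_read_rtc() now computes the per-blade offset and performs the MMR read inside a single preempt-disabled section, so the task cannot migrate between choosing the offset and doing the read; a sketch:

static u64 uv_read_rtc(struct clocksource *cs)
{
	unsigned long offset;
	u64 cycles;

	preempt_disable();
	if (uv_get_min_hub_revision_id() == 1)
		offset = 0;
	else
		offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;

	cycles = (u64)uv_read_local_mmr(UVH_RTC | offset);
	preempt_enable();

	return cycles;
}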
3399 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/xtensa/include/asm/spinlock_types.h linux-4.14/arch/xtensa/include/asm/spinlock_types.h
3400 --- linux-4.14.orig/arch/xtensa/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
3401 +++ linux-4.14/arch/xtensa/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
3403 #ifndef __ASM_SPINLOCK_TYPES_H
3404 #define __ASM_SPINLOCK_TYPES_H
3406 -#ifndef __LINUX_SPINLOCK_TYPES_H
3407 -# error "please don't include this file directly"
3411 volatile unsigned int slock;
3413 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-core.c linux-4.14/block/blk-core.c
3414 --- linux-4.14.orig/block/blk-core.c 2018-09-05 11:03:20.000000000 +0200
3415 +++ linux-4.14/block/blk-core.c 2018-09-05 11:05:07.000000000 +0200
3418 INIT_LIST_HEAD(&rq->queuelist);
3419 INIT_LIST_HEAD(&rq->timeout_list);
3420 +#ifdef CONFIG_PREEMPT_RT_FULL
3421 + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3425 rq->__sector = (sector_t) -1;
3427 void blk_start_queue(struct request_queue *q)
3429 lockdep_assert_held(q->queue_lock);
3430 - WARN_ON(!in_interrupt() && !irqs_disabled());
3431 + WARN_ON_NONRT(!in_interrupt() && !irqs_disabled());
3432 WARN_ON_ONCE(q->mq_ops);
3434 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
3435 @@ -808,12 +811,21 @@
3436 percpu_ref_put(&q->q_usage_counter);
3439 +static void blk_queue_usage_counter_release_swork(struct swork_event *sev)
3441 + struct request_queue *q =
3442 + container_of(sev, struct request_queue, mq_pcpu_wake);
3444 + wake_up_all(&q->mq_freeze_wq);
3447 static void blk_queue_usage_counter_release(struct percpu_ref *ref)
3449 struct request_queue *q =
3450 container_of(ref, struct request_queue, q_usage_counter);
3452 - wake_up_all(&q->mq_freeze_wq);
3453 + if (wq_has_sleeper(&q->mq_freeze_wq))
3454 + swork_queue(&q->mq_pcpu_wake);
3457 static void blk_rq_timed_out_timer(unsigned long data)
3459 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3461 init_waitqueue_head(&q->mq_freeze_wq);
3462 + INIT_SWORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_swork);
3465 * Init percpu_ref in atomic mode so that it's faster to shutdown.
3466 @@ -3308,7 +3321,7 @@
3467 blk_run_queue_async(q);
3470 - spin_unlock(q->queue_lock);
3471 + spin_unlock_irq(q->queue_lock);
3474 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3475 @@ -3356,7 +3369,6 @@
3476 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3478 struct request_queue *q;
3479 - unsigned long flags;
3483 @@ -3376,11 +3388,6 @@
3488 - * Save and disable interrupts here, to avoid doing it for every
3489 - * queue lock we have to take.
3491 - local_irq_save(flags);
3492 while (!list_empty(&list)) {
3493 rq = list_entry_rq(list.next);
3494 list_del_init(&rq->queuelist);
3495 @@ -3393,7 +3400,7 @@
3496 queue_unplugged(q, depth, from_schedule);
3499 - spin_lock(q->queue_lock);
3500 + spin_lock_irq(q->queue_lock);
3504 @@ -3420,8 +3427,6 @@
3507 queue_unplugged(q, depth, from_schedule);
3509 - local_irq_restore(flags);
3512 void blk_finish_plug(struct blk_plug *plug)
3513 @@ -3631,6 +3636,8 @@
3514 if (!kblockd_workqueue)
3515 panic("Failed to create kblockd\n");
3517 + BUG_ON(swork_get());
3519 request_cachep = kmem_cache_create("blkdev_requests",
3520 sizeof(struct request), 0, SLAB_PANIC, NULL);
3522 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-ioc.c linux-4.14/block/blk-ioc.c
3523 --- linux-4.14.orig/block/blk-ioc.c 2017-11-12 19:46:13.000000000 +0100
3524 +++ linux-4.14/block/blk-ioc.c 2018-09-05 11:05:07.000000000 +0200
3526 #include <linux/blkdev.h>
3527 #include <linux/slab.h>
3528 #include <linux/sched/task.h>
3529 +#include <linux/delay.h>
3534 spin_unlock(q->queue_lock);
3536 spin_unlock_irqrestore(&ioc->lock, flags);
3539 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3543 spin_unlock(icq->q->queue_lock);
3545 spin_unlock_irqrestore(&ioc->lock, flags);
3551 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-mq.c linux-4.14/block/blk-mq.c
3552 --- linux-4.14.orig/block/blk-mq.c 2018-09-05 11:03:20.000000000 +0200
3553 +++ linux-4.14/block/blk-mq.c 2018-09-05 11:05:07.000000000 +0200
3555 /* tag was already set */
3558 +#ifdef CONFIG_PREEMPT_RT_FULL
3559 + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3561 INIT_LIST_HEAD(&rq->timeout_list);
3564 @@ -533,12 +536,24 @@
3566 EXPORT_SYMBOL(blk_mq_end_request);
3568 +#ifdef CONFIG_PREEMPT_RT_FULL
3570 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3572 + struct request *rq = container_of(work, struct request, work);
3574 + rq->q->softirq_done_fn(rq);
3579 static void __blk_mq_complete_request_remote(void *data)
3581 struct request *rq = data;
3583 rq->q->softirq_done_fn(rq);
3587 static void __blk_mq_complete_request(struct request *rq)
3589 @@ -558,19 +573,27 @@
3594 + cpu = get_cpu_light();
3595 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3596 shared = cpus_share_cache(cpu, ctx->cpu);
3598 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3599 +#ifdef CONFIG_PREEMPT_RT_FULL
3601 + * We could force QUEUE_FLAG_SAME_FORCE, then we would not get in
3602 + * here. But we could try to invoke it on the CPU like this.
3604 + schedule_work_on(ctx->cpu, &rq->work);
3606 rq->csd.func = __blk_mq_complete_request_remote;
3609 smp_call_function_single_async(ctx->cpu, &rq->csd);
3612 rq->q->softirq_done_fn(rq);
3619 @@ -1238,14 +1261,14 @@
3622 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
3623 - int cpu = get_cpu();
3624 + int cpu = get_cpu_light();
3625 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3626 __blk_mq_run_hw_queue(hctx);
3636 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
3637 @@ -2863,10 +2886,9 @@
3640 mode = HRTIMER_MODE_REL;
3641 - hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3642 + hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
3643 hrtimer_set_expires(&hs.timer, kt);
3645 - hrtimer_init_sleeper(&hs, current);
3647 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
3649 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-mq.h linux-4.14/block/blk-mq.h
3650 --- linux-4.14.orig/block/blk-mq.h 2018-09-05 11:03:20.000000000 +0200
3651 +++ linux-4.14/block/blk-mq.h 2018-09-05 11:05:07.000000000 +0200
3654 static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3656 - return __blk_mq_get_ctx(q, get_cpu());
3657 + return __blk_mq_get_ctx(q, get_cpu_light());
3660 static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3666 struct blk_mq_alloc_data {
3667 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-softirq.c linux-4.14/block/blk-softirq.c
3668 --- linux-4.14.orig/block/blk-softirq.c 2017-11-12 19:46:13.000000000 +0100
3669 +++ linux-4.14/block/blk-softirq.c 2018-09-05 11:05:07.000000000 +0200
3671 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3673 local_irq_restore(flags);
3674 + preempt_check_resched_rt();
3679 this_cpu_ptr(&blk_cpu_done));
3680 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3682 + preempt_check_resched_rt();
3689 local_irq_restore(flags);
3690 + preempt_check_resched_rt();
3694 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/bounce.c linux-4.14/block/bounce.c
3695 --- linux-4.14.orig/block/bounce.c 2018-09-05 11:03:20.000000000 +0200
3696 +++ linux-4.14/block/bounce.c 2018-09-05 11:05:07.000000000 +0200
3698 unsigned long flags;
3701 - local_irq_save(flags);
3702 + local_irq_save_nort(flags);
3703 vto = kmap_atomic(to->bv_page);
3704 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3706 - local_irq_restore(flags);
3707 + local_irq_restore_nort(flags);
3710 #else /* CONFIG_HIGHMEM */
3711 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/crypto/algapi.c linux-4.14/crypto/algapi.c
3712 --- linux-4.14.orig/crypto/algapi.c 2018-09-05 11:03:20.000000000 +0200
3713 +++ linux-4.14/crypto/algapi.c 2018-09-05 11:05:07.000000000 +0200
3714 @@ -731,13 +731,13 @@
3716 int crypto_register_notifier(struct notifier_block *nb)
3718 - return blocking_notifier_chain_register(&crypto_chain, nb);
3719 + return srcu_notifier_chain_register(&crypto_chain, nb);
3721 EXPORT_SYMBOL_GPL(crypto_register_notifier);
3723 int crypto_unregister_notifier(struct notifier_block *nb)
3725 - return blocking_notifier_chain_unregister(&crypto_chain, nb);
3726 + return srcu_notifier_chain_unregister(&crypto_chain, nb);
3728 EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3730 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/crypto/api.c linux-4.14/crypto/api.c
3731 --- linux-4.14.orig/crypto/api.c 2017-11-12 19:46:13.000000000 +0100
3732 +++ linux-4.14/crypto/api.c 2018-09-05 11:05:07.000000000 +0200
3734 DECLARE_RWSEM(crypto_alg_sem);
3735 EXPORT_SYMBOL_GPL(crypto_alg_sem);
3737 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3738 +SRCU_NOTIFIER_HEAD(crypto_chain);
3739 EXPORT_SYMBOL_GPL(crypto_chain);
3741 static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3742 @@ -236,10 +236,10 @@
3746 - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3747 + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3748 if (ok == NOTIFY_DONE) {
3749 request_module("cryptomgr");
3750 - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3751 + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3755 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/crypto/internal.h linux-4.14/crypto/internal.h
3756 --- linux-4.14.orig/crypto/internal.h 2017-11-12 19:46:13.000000000 +0100
3757 +++ linux-4.14/crypto/internal.h 2018-09-05 11:05:07.000000000 +0200
3760 extern struct list_head crypto_alg_list;
3761 extern struct rw_semaphore crypto_alg_sem;
3762 -extern struct blocking_notifier_head crypto_chain;
3763 +extern struct srcu_notifier_head crypto_chain;
3765 #ifdef CONFIG_PROC_FS
3766 void __init crypto_init_proc(void);
3769 static inline void crypto_notify(unsigned long val, void *v)
3771 - blocking_notifier_call_chain(&crypto_chain, val, v);
3772 + srcu_notifier_call_chain(&crypto_chain, val, v);
3775 #endif /* _CRYPTO_INTERNAL_H */
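The three crypto hunks above convert crypto_chain from a blocking notifier,
whose call path takes an rwsem, to an SRCU notifier chain, whose call path
only enters an SRCU read-side critical section. The SRCU notifier API mirrors
the blocking notifier API one-to-one, so the conversion is a plain rename. A
small, self-contained usage sketch of that API (the names example_chain,
example_cb and example_nb are made up for illustration):

    #include <linux/init.h>
    #include <linux/notifier.h>

    SRCU_NOTIFIER_HEAD(example_chain);

    static int example_cb(struct notifier_block *nb, unsigned long val, void *data)
    {
        /* Invoked from srcu_notifier_call_chain(); SRCU read sections may
         * sleep, so a callback here can do what a blocking-notifier callback
         * could do. */
        return NOTIFY_OK;
    }

    static struct notifier_block example_nb = {
        .notifier_call = example_cb,
    };

    static int __init example_init(void)
    {
        srcu_notifier_chain_register(&example_chain, &example_nb);
        srcu_notifier_call_chain(&example_chain, 0, NULL);
        return 0;
    }
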
3776 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/Documentation/trace/events.txt linux-4.14/Documentation/trace/events.txt
3777 --- linux-4.14.orig/Documentation/trace/events.txt 2017-11-12 19:46:13.000000000 +0100
3778 +++ linux-4.14/Documentation/trace/events.txt 2018-09-05 11:05:07.000000000 +0200
3779 @@ -517,1550 +517,4 @@
3780 totals derived from one or more trace event format fields and/or
3781 event counts (hitcount).
3783 - The format of a hist trigger is as follows:
3785 - hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
3786 - [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
3787 - [:clear][:name=histname1] [if <filter>]
3789 - When a matching event is hit, an entry is added to a hash table
3790 - using the key(s) and value(s) named. Keys and values correspond to
3791 - fields in the event's format description. Values must correspond to
3792 - numeric fields - on an event hit, the value(s) will be added to a
3793 - sum kept for that field. The special string 'hitcount' can be used
3794 - in place of an explicit value field - this is simply a count of
3795 - event hits. If 'values' isn't specified, an implicit 'hitcount'
3796 - value will be automatically created and used as the only value.
3797 - Keys can be any field, or the special string 'stacktrace', which
3798 - will use the event's kernel stacktrace as the key. The keywords
3799 - 'keys' or 'key' can be used to specify keys, and the keywords
3800 - 'values', 'vals', or 'val' can be used to specify values. Compound
3801 - keys consisting of up to two fields can be specified by the 'keys'
3802 - keyword. Hashing a compound key produces a unique entry in the
3803 - table for each unique combination of component keys, and can be
3804 - useful for providing more fine-grained summaries of event data.
3805 - Additionally, sort keys consisting of up to two fields can be
3806 - specified by the 'sort' keyword. If more than one field is
3807 - specified, the result will be a 'sort within a sort': the first key
3808 - is taken to be the primary sort key and the second the secondary
3809 - key. If a hist trigger is given a name using the 'name' parameter,
3810 - its histogram data will be shared with other triggers of the same
3811 - name, and trigger hits will update this common data. Only triggers
3812 - with 'compatible' fields can be combined in this way; triggers are
3813 - 'compatible' if the fields named in the trigger share the same
3814 - number and type of fields and those fields also have the same names.
3815 - Note that any two events always share the compatible 'hitcount' and
3816 - 'stacktrace' fields and can therefore be combined using those
3817 - fields, however pointless that may be.
3819 - 'hist' triggers add a 'hist' file to each event's subdirectory.
3820 - Reading the 'hist' file for the event will dump the hash table in
3821 - its entirety to stdout. If there are multiple hist triggers
3822 - attached to an event, there will be a table for each trigger in the
3823 - output. The table displayed for a named trigger will be the same as
3824 - any other instance having the same name. Each printed hash table
3825 - entry is a simple list of the keys and values comprising the entry;
3826 - keys are printed first and are delineated by curly braces, and are
3827 - followed by the set of value fields for the entry. By default,
3828 - numeric fields are displayed as base-10 integers. This can be
3829 - modified by appending any of the following modifiers to the field
3832 - .hex display a number as a hex value
3833 - .sym display an address as a symbol
3834 - .sym-offset display an address as a symbol and offset
3835 - .syscall display a syscall id as a system call name
3836 - .execname display a common_pid as a program name
3838 - Note that in general the semantics of a given field aren't
3839 - interpreted when applying a modifier to it, but there are some
3840 - restrictions to be aware of in this regard:
3842 - - only the 'hex' modifier can be used for values (because values
3843 - are essentially sums, and the other modifiers don't make sense
3845 - - the 'execname' modifier can only be used on a 'common_pid'. The
3846 - reason for this is that the execname is simply the 'comm' value
3847 - saved for the 'current' process when an event was triggered,
3848 - which is the same as the common_pid value saved by the event
3849 - tracing code. Trying to apply that comm value to other pid
3850 - values wouldn't be correct, and typically events that care save
3851 - pid-specific comm fields in the event itself.
3853 - A typical usage scenario would be the following to enable a hist
3854 - trigger, read its current contents, and then turn it off:
3856 - # echo 'hist:keys=skbaddr.hex:vals=len' > \
3857 - /sys/kernel/debug/tracing/events/net/netif_rx/trigger
3859 - # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
3861 - # echo '!hist:keys=skbaddr.hex:vals=len' > \
3862 - /sys/kernel/debug/tracing/events/net/netif_rx/trigger
3864 - The trigger file itself can be read to show the details of the
3865 - currently attached hist trigger. This information is also displayed
3866 - at the top of the 'hist' file when read.
3868 - By default, the size of the hash table is 2048 entries. The 'size'
3869 - parameter can be used to specify more or fewer than that. The units
3870 - are in terms of hashtable entries - if a run uses more entries than
3871 - specified, the results will show the number of 'drops', the number
3872 - of hits that were ignored. The size should be a power of 2 between
3873 -  128 and 131072 (any non-power-of-2 number specified will be rounded
3876 - The 'sort' parameter can be used to specify a value field to sort
3877 - on. The default if unspecified is 'hitcount' and the default sort
3878 - order is 'ascending'. To sort in the opposite direction, append
3879 -  '.descending' to the sort key.
3881 - The 'pause' parameter can be used to pause an existing hist trigger
3882 - or to start a hist trigger but not log any events until told to do
3883 - so. 'continue' or 'cont' can be used to start or restart a paused
3886 - The 'clear' parameter will clear the contents of a running hist
3887 - trigger and leave its current paused/active state.
3889 - Note that the 'pause', 'cont', and 'clear' parameters should be
3890 - applied using 'append' shell operator ('>>') if applied to an
3891 - existing trigger, rather than via the '>' operator, which will cause
3892 - the trigger to be removed through truncation.
3894 -- enable_hist/disable_hist
3896 - The enable_hist and disable_hist triggers can be used to have one
3897 - event conditionally start and stop another event's already-attached
3898 - hist trigger. Any number of enable_hist and disable_hist triggers
3899 - can be attached to a given event, allowing that event to kick off
3900 - and stop aggregations on a host of other events.
3902 - The format is very similar to the enable/disable_event triggers:
3904 - enable_hist:<system>:<event>[:count]
3905 - disable_hist:<system>:<event>[:count]
3907 - Instead of enabling or disabling the tracing of the target event
3908 - into the trace buffer as the enable/disable_event triggers do, the
3909 - enable/disable_hist triggers enable or disable the aggregation of
3910 - the target event into a hash table.
3912 - A typical usage scenario for the enable_hist/disable_hist triggers
3913 - would be to first set up a paused hist trigger on some event,
3914 - followed by an enable_hist/disable_hist pair that turns the hist
3915 - aggregation on and off when conditions of interest are hit:
3917 - # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
3918 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
3920 - # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
3921 - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
3923 - # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
3924 - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
3926 - The above sets up an initially paused hist trigger which is unpaused
3927 - and starts aggregating events when a given program is executed, and
3928 - which stops aggregating when the process exits and the hist trigger
3931 - The examples below provide a more concrete illustration of the
3932 - concepts and typical usage patterns discussed above.
3935 -6.2 'hist' trigger examples
3936 ----------------------------
3938 - The first set of examples creates aggregations using the kmalloc
3939 - event. The fields that can be used for the hist trigger are listed
3940 - in the kmalloc event's format file:
3942 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
3946 - field:unsigned short common_type; offset:0; size:2; signed:0;
3947 - field:unsigned char common_flags; offset:2; size:1; signed:0;
3948 - field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
3949 - field:int common_pid; offset:4; size:4; signed:1;
3951 - field:unsigned long call_site; offset:8; size:8; signed:0;
3952 - field:const void * ptr; offset:16; size:8; signed:0;
3953 - field:size_t bytes_req; offset:24; size:8; signed:0;
3954 - field:size_t bytes_alloc; offset:32; size:8; signed:0;
3955 - field:gfp_t gfp_flags; offset:40; size:4; signed:0;
3957 - We'll start by creating a hist trigger that generates a simple table
3958 - that lists the total number of bytes requested for each function in
3959 - the kernel that made one or more calls to kmalloc:
3961 - # echo 'hist:key=call_site:val=bytes_req' > \
3962 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
3964 - This tells the tracing system to create a 'hist' trigger using the
3965 - call_site field of the kmalloc event as the key for the table, which
3966 - just means that each unique call_site address will have an entry
3967 - created for it in the table. The 'val=bytes_req' parameter tells
3968 - the hist trigger that for each unique entry (call_site) in the
3969 - table, it should keep a running total of the number of bytes
3970 - requested by that call_site.
3972 - We'll let it run for awhile and then dump the contents of the 'hist'
3973 - file in the kmalloc event's subdirectory (for readability, a number
3974 - of entries have been omitted):
3976 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
3977 - # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
3979 - { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
3980 - { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
3981 - { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
3982 - { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
3983 - { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
3984 - { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
3985 - { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
3986 - { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
3987 - { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
3988 - { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
3992 - { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
3993 - { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
3994 - { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
3995 - { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
3996 - { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
3997 - { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
3998 - { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
3999 - { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
4000 - { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
4001 - { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
4002 - { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
4003 - { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
4010 - The output displays a line for each entry, beginning with the key
4011 - specified in the trigger, followed by the value(s) also specified in
4012 - the trigger. At the beginning of the output is a line that displays
4013 - the trigger info, which can also be displayed by reading the
4016 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4017 - hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
4019 - At the end of the output are a few lines that display the overall
4020 - totals for the run. The 'Hits' field shows the total number of
4021 - times the event trigger was hit, the 'Entries' field shows the total
4022 - number of used entries in the hash table, and the 'Dropped' field
4023 - shows the number of hits that were dropped because the number of
4024 - used entries for the run exceeded the maximum number of entries
4025 -  allowed for the table (normally 0, but if not, a hint that you may
4026 - want to increase the size of the table using the 'size' parameter).
4028 - Notice in the above output that there's an extra field, 'hitcount',
4029 - which wasn't specified in the trigger. Also notice that in the
4030 - trigger info output, there's a parameter, 'sort=hitcount', which
4031 - wasn't specified in the trigger either. The reason for that is that
4032 - every trigger implicitly keeps a count of the total number of hits
4033 - attributed to a given entry, called the 'hitcount'. That hitcount
4034 - information is explicitly displayed in the output, and in the
4035 - absence of a user-specified sort parameter, is used as the default
4038 - The value 'hitcount' can be used in place of an explicit value in
4039 - the 'values' parameter if you don't really need to have any
4040 - particular field summed and are mainly interested in hit
4043 - To turn the hist trigger off, simply call up the trigger in the
4044 - command history and re-execute it with a '!' prepended:
4046 - # echo '!hist:key=call_site:val=bytes_req' > \
4047 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4049 - Finally, notice that the call_site as displayed in the output above
4050 - isn't really very useful. It's an address, but normally addresses
4051 - are displayed in hex. To have a numeric field displayed as a hex
4052 - value, simply append '.hex' to the field name in the trigger:
4054 - # echo 'hist:key=call_site.hex:val=bytes_req' > \
4055 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4057 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4058 - # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
4060 - { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
4061 - { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
4062 - { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
4063 - { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
4064 - { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
4065 - { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
4066 - { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
4067 - { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
4068 - { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
4069 - { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
4070 - { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
4071 - { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
4075 - { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
4076 - { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
4077 - { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
4078 - { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
4079 - { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
4080 - { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
4081 - { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
4082 - { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
4083 - { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
4084 - { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
4085 - { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
4092 - Even that's only marginally more useful - while hex values do look
4093 - more like addresses, what users are typically more interested in
4094 - when looking at text addresses are the corresponding symbols
4095 - instead. To have an address displayed as symbolic value instead,
4096 - simply append '.sym' or '.sym-offset' to the field name in the
4099 - # echo 'hist:key=call_site.sym:val=bytes_req' > \
4100 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4102 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4103 - # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
4105 - { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
4106 - { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
4107 - { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
4108 - { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
4109 - { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
4110 - { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
4111 - { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
4112 - { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
4113 - { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
4114 - { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
4115 - { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
4116 - { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
4117 - { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
4118 - { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
4122 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
4123 - { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
4124 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
4125 - { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
4126 - { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
4127 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
4128 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
4129 - { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
4130 - { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
4131 - { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
4132 - { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
4133 - { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
4134 - { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
4141 -  Because the default sort key above is 'hitcount', the above shows
4142 - the list of call_sites by increasing hitcount, so that at the bottom
4143 - we see the functions that made the most kmalloc calls during the
4144 -  run. If instead we wanted to see the top kmalloc callers in
4145 - terms of the number of bytes requested rather than the number of
4146 - calls, and we wanted the top caller to appear at the top, we can use
4147 - the 'sort' parameter, along with the 'descending' modifier:
4149 - # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
4150 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4152 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4153 - # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
4155 - { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
4156 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
4157 - { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
4158 - { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
4159 - { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
4160 - { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
4161 - { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
4162 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
4163 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
4164 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
4165 - { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
4166 - { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
4167 - { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
4171 - { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
4172 - { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
4173 - { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
4174 - { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
4175 - { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
4176 - { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
4177 - { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
4178 - { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
4179 - { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
4180 - { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
4187 - To display the offset and size information in addition to the symbol
4188 - name, just use 'sym-offset' instead:
4190 - # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
4191 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4193 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4194 - # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
4196 - { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
4197 - { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
4198 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
4199 - { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
4200 - { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
4201 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
4202 - { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
4203 - { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
4204 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
4205 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
4206 - { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
4207 - { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
4211 - { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
4212 - { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
4213 - { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
4214 - { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
4215 - { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
4216 - { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
4217 - { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
4224 - We can also add multiple fields to the 'values' parameter. For
4225 - example, we might want to see the total number of bytes allocated
4226 - alongside bytes requested, and display the result sorted by bytes
4227 - allocated in a descending order:
4229 - # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
4230 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4232 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4233 - # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
4235 - { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
4236 - { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
4237 - { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
4238 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
4239 - { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
4240 - { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
4241 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
4242 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
4243 - { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
4244 - { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
4245 - { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
4246 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
4250 - { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
4251 - { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
4252 - { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
4253 - { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
4254 - { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
4255 - { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
4256 - { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
4257 - { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
4258 - { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
4259 - { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
4266 - Finally, to finish off our kmalloc example, instead of simply having
4267 - the hist trigger display symbolic call_sites, we can have the hist
4268 - trigger additionally display the complete set of kernel stack traces
4269 - that led to each call_site. To do that, we simply use the special
4270 - value 'stacktrace' for the key parameter:
4272 - # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
4273 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4275 - The above trigger will use the kernel stack trace in effect when an
4276 - event is triggered as the key for the hash table. This allows the
4277 - enumeration of every kernel callpath that led up to a particular
4278 - event, along with a running total of any of the event fields for
4279 - that event. Here we tally bytes requested and bytes allocated for
4280 - every callpath in the system that led up to a kmalloc (in this case
4281 - every callpath to a kmalloc for a kernel compile):
4283 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4284 - # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
4287 - __kmalloc_track_caller+0x10b/0x1a0
4289 - hidraw_report_event+0x8a/0x120 [hid]
4290 - hid_report_raw_event+0x3ea/0x440 [hid]
4291 - hid_input_report+0x112/0x190 [hid]
4292 - hid_irq_in+0xc2/0x260 [usbhid]
4293 - __usb_hcd_giveback_urb+0x72/0x120
4294 - usb_giveback_urb_bh+0x9e/0xe0
4295 - tasklet_hi_action+0xf8/0x100
4296 - __do_softirq+0x114/0x2c0
4297 - irq_exit+0xa5/0xb0
4299 - ret_from_intr+0x0/0x30
4300 - cpuidle_enter+0x17/0x20
4301 - cpu_startup_entry+0x315/0x3e0
4302 - rest_init+0x7c/0x80
4303 - } hitcount: 3 bytes_req: 21 bytes_alloc: 24
4305 - __kmalloc_track_caller+0x10b/0x1a0
4307 - hidraw_report_event+0x8a/0x120 [hid]
4308 - hid_report_raw_event+0x3ea/0x440 [hid]
4309 - hid_input_report+0x112/0x190 [hid]
4310 - hid_irq_in+0xc2/0x260 [usbhid]
4311 - __usb_hcd_giveback_urb+0x72/0x120
4312 - usb_giveback_urb_bh+0x9e/0xe0
4313 - tasklet_hi_action+0xf8/0x100
4314 - __do_softirq+0x114/0x2c0
4315 - irq_exit+0xa5/0xb0
4317 - ret_from_intr+0x0/0x30
4318 - } hitcount: 3 bytes_req: 21 bytes_alloc: 24
4320 - kmem_cache_alloc_trace+0xeb/0x150
4321 - aa_alloc_task_context+0x27/0x40
4322 - apparmor_cred_prepare+0x1f/0x50
4323 - security_prepare_creds+0x16/0x20
4324 - prepare_creds+0xdf/0x1a0
4325 - SyS_capset+0xb5/0x200
4326 - system_call_fastpath+0x12/0x6a
4327 - } hitcount: 1 bytes_req: 32 bytes_alloc: 32
4332 - __kmalloc+0x11b/0x1b0
4333 - i915_gem_execbuffer2+0x6c/0x2c0 [i915]
4334 - drm_ioctl+0x349/0x670 [drm]
4335 - do_vfs_ioctl+0x2f0/0x4f0
4336 - SyS_ioctl+0x81/0xa0
4337 - system_call_fastpath+0x12/0x6a
4338 - } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
4340 - __kmalloc+0x11b/0x1b0
4341 - load_elf_phdrs+0x76/0xa0
4342 - load_elf_binary+0x102/0x1650
4343 - search_binary_handler+0x97/0x1d0
4344 - do_execveat_common.isra.34+0x551/0x6e0
4345 - SyS_execve+0x3a/0x50
4346 - return_from_execve+0x0/0x23
4347 - } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
4349 - kmem_cache_alloc_trace+0xeb/0x150
4350 - apparmor_file_alloc_security+0x27/0x40
4351 - security_file_alloc+0x16/0x20
4352 - get_empty_filp+0x93/0x1c0
4353 - path_openat+0x31/0x5f0
4354 - do_filp_open+0x3a/0x90
4355 - do_sys_open+0x128/0x220
4356 - SyS_open+0x1e/0x20
4357 - system_call_fastpath+0x12/0x6a
4358 - } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
4360 - __kmalloc+0x11b/0x1b0
4361 - seq_buf_alloc+0x1b/0x50
4362 - seq_read+0x2cc/0x370
4363 - proc_reg_read+0x3d/0x80
4364 - __vfs_read+0x28/0xe0
4365 - vfs_read+0x86/0x140
4366 - SyS_read+0x46/0xb0
4367 - system_call_fastpath+0x12/0x6a
4368 - } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
4375 - If you key a hist trigger on common_pid, in order for example to
4376 - gather and display sorted totals for each process, you can use the
4377 - special .execname modifier to display the executable names for the
4378 - processes in the table rather than raw pids. The example below
4379 - keeps a per-process sum of total bytes read:
4381 - # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
4382 - /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
4384 - # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
4385 - # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
4387 - { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
4388 - { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
4389 - { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
4390 - { common_pid: bash [ 8710] } hitcount: 3 count: 66369
4391 - { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
4392 - { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
4393 - { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
4394 - { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
4395 - { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
4396 - { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
4397 - { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
4401 - { common_pid: postgres [ 1892] } hitcount: 2 count: 32
4402 - { common_pid: postgres [ 1891] } hitcount: 2 count: 32
4403 - { common_pid: gmain [ 8704] } hitcount: 2 count: 32
4404 - { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
4405 - { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
4406 - { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
4407 - { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
4408 - { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
4409 - { common_pid: init [ 1] } hitcount: 2 count: 2
4416 - Similarly, if you key a hist trigger on syscall id, for example to
4417 - gather and display a list of systemwide syscall hits, you can use
4418 - the special .syscall modifier to display the syscall names rather
4419 - than raw ids. The example below keeps a running total of syscall
4420 - counts for the system during the run:
4422 - # echo 'hist:key=id.syscall:val=hitcount' > \
4423 - /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
4425 - # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
4426 - # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
4428 - { id: sys_fsync [ 74] } hitcount: 1
4429 - { id: sys_newuname [ 63] } hitcount: 1
4430 - { id: sys_prctl [157] } hitcount: 1
4431 - { id: sys_statfs [137] } hitcount: 1
4432 - { id: sys_symlink [ 88] } hitcount: 1
4433 - { id: sys_sendmmsg [307] } hitcount: 1
4434 - { id: sys_semctl [ 66] } hitcount: 1
4435 - { id: sys_readlink [ 89] } hitcount: 3
4436 - { id: sys_bind [ 49] } hitcount: 3
4437 - { id: sys_getsockname [ 51] } hitcount: 3
4438 - { id: sys_unlink [ 87] } hitcount: 3
4439 - { id: sys_rename [ 82] } hitcount: 4
4440 - { id: unknown_syscall [ 58] } hitcount: 4
4441 - { id: sys_connect [ 42] } hitcount: 4
4442 - { id: sys_getpid [ 39] } hitcount: 4
4446 - { id: sys_rt_sigprocmask [ 14] } hitcount: 952
4447 - { id: sys_futex [202] } hitcount: 1534
4448 - { id: sys_write [ 1] } hitcount: 2689
4449 - { id: sys_setitimer [ 38] } hitcount: 2797
4450 - { id: sys_read [ 0] } hitcount: 3202
4451 - { id: sys_select [ 23] } hitcount: 3773
4452 - { id: sys_writev [ 20] } hitcount: 4531
4453 - { id: sys_poll [ 7] } hitcount: 8314
4454 - { id: sys_recvmsg [ 47] } hitcount: 13738
4455 - { id: sys_ioctl [ 16] } hitcount: 21843
4462 - The syscall counts above provide a rough overall picture of system
4463 - call activity on the system; we can see for example that the most
4464 - popular system call on this system was the 'sys_ioctl' system call.
4466 - We can use 'compound' keys to refine that number and provide some
4467 - further insight as to which processes exactly contribute to the
4468 - overall ioctl count.
4470 - The command below keeps a hitcount for every unique combination of
4471 - system call id and pid - the end result is essentially a table
4472 - that keeps a per-pid sum of system call hits. The results are
4473 - sorted using the system call id as the primary key, and the
4474 - hitcount sum as the secondary key:
4476 - # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
4477 - /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
4479 - # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
4480 - # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
4482 - { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
4483 - { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
4484 - { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
4485 - { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
4486 - { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
4487 - { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
4488 - { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
4489 - { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
4490 - { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
4491 - { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
4495 - { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
4496 - { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
4497 - { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
4498 - { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
4499 - { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
4503 - { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
4504 - { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
4505 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
4506 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
4507 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
4508 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
4509 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
4510 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
4511 - { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
4512 - { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
4513 - { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
4520 - The above list does give us a breakdown of the ioctl syscall by
4521 - pid, but it also gives us quite a bit more than that, which we
4522 - don't really care about at the moment. Since we know the syscall
4523 - id for sys_ioctl (16, displayed next to the sys_ioctl name), we
4524 - can use that to filter out all the other syscalls:
4526 - # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
4527 - /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
4529 - # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
4530 - # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
4532 - { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
4533 - { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
4534 - { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
4535 - { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
4536 - { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
4537 - { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
4538 - { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
4539 - { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
4540 - { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
4544 - { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
4545 - { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
4546 - { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
4547 - { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
4548 - { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
4549 - { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
4556 - The above output shows that 'compiz' and 'Xorg' are far and away
4557 - the heaviest ioctl callers (which might lead to questions about
4558 - whether they really need to be making all those calls and to
4559 - possible avenues for further investigation.)
4561 - The compound key examples used a key and a sum value (hitcount) to
4562 - sort the output, but we can just as easily use two keys instead.
4563 -  Here's an example where we use a compound key composed of the
4564 - common_pid and size event fields. Sorting with pid as the primary
4565 - key and 'size' as the secondary key allows us to display an
4566 - ordered summary of the recvfrom sizes, with counts, received by
4569 - # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
4570 - /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
4572 - # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
4573 - # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
4575 - { common_pid: smbd [ 784], size: 4 } hitcount: 1
4576 - { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
4577 - { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
4578 - { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
4579 - { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
4580 - { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
4581 - { common_pid: compiz [ 2994], size: 8 } hitcount: 1
4582 - { common_pid: compiz [ 2994], size: 20 } hitcount: 11
4583 - { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
4584 - { common_pid: firefox [ 8817], size: 4 } hitcount: 1
4585 - { common_pid: firefox [ 8817], size: 8 } hitcount: 5
4586 - { common_pid: firefox [ 8817], size: 588 } hitcount: 2
4587 - { common_pid: firefox [ 8817], size: 628 } hitcount: 1
4588 - { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
4589 - { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
4590 - { common_pid: firefox [ 8822], size: 8 } hitcount: 2
4591 - { common_pid: firefox [ 8822], size: 160 } hitcount: 2
4592 - { common_pid: firefox [ 8822], size: 320 } hitcount: 2
4593 - { common_pid: firefox [ 8822], size: 352 } hitcount: 1
4597 - { common_pid: pool [ 8923], size: 1960 } hitcount: 10
4598 - { common_pid: pool [ 8923], size: 2048 } hitcount: 10
4599 - { common_pid: pool [ 8924], size: 1960 } hitcount: 10
4600 - { common_pid: pool [ 8924], size: 2048 } hitcount: 10
4601 - { common_pid: pool [ 8928], size: 1964 } hitcount: 4
4602 - { common_pid: pool [ 8928], size: 1965 } hitcount: 2
4603 - { common_pid: pool [ 8928], size: 2048 } hitcount: 6
4604 - { common_pid: pool [ 8929], size: 1982 } hitcount: 1
4605 - { common_pid: pool [ 8929], size: 2048 } hitcount: 1
4612 - The above example also illustrates the fact that although a compound
4613 - key is treated as a single entity for hashing purposes, the sub-keys
4614 - it's composed of can be accessed independently.
4616 - The next example uses a string field as the hash key and
4617 - demonstrates how you can manually pause and continue a hist trigger.
4618 - In this example, we'll aggregate fork counts and don't expect a
4619 - large number of entries in the hash table, so we'll drop it to a
4620 - much smaller number, say 256:
4622 - # echo 'hist:key=child_comm:val=hitcount:size=256' > \
4623 - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
4625 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
4626 - # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
4628 - { child_comm: dconf worker } hitcount: 1
4629 - { child_comm: ibus-daemon } hitcount: 1
4630 - { child_comm: whoopsie } hitcount: 1
4631 - { child_comm: smbd } hitcount: 1
4632 - { child_comm: gdbus } hitcount: 1
4633 - { child_comm: kthreadd } hitcount: 1
4634 - { child_comm: dconf worker } hitcount: 1
4635 - { child_comm: evolution-alarm } hitcount: 2
4636 - { child_comm: Socket Thread } hitcount: 2
4637 - { child_comm: postgres } hitcount: 2
4638 - { child_comm: bash } hitcount: 3
4639 - { child_comm: compiz } hitcount: 3
4640 - { child_comm: evolution-sourc } hitcount: 4
4641 - { child_comm: dhclient } hitcount: 4
4642 - { child_comm: pool } hitcount: 5
4643 - { child_comm: nm-dispatcher.a } hitcount: 8
4644 - { child_comm: firefox } hitcount: 8
4645 - { child_comm: dbus-daemon } hitcount: 8
4646 - { child_comm: glib-pacrunner } hitcount: 10
4647 - { child_comm: evolution } hitcount: 23
4654 - If we want to pause the hist trigger, we can simply append :pause to
4655 - the command that started the trigger. Notice that the trigger info
4656 - displays as [paused]:
4658 - # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
4659 - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
4661 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
4662 - # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
4664 - { child_comm: dconf worker } hitcount: 1
4665 - { child_comm: kthreadd } hitcount: 1
4666 - { child_comm: dconf worker } hitcount: 1
4667 - { child_comm: gdbus } hitcount: 1
4668 - { child_comm: ibus-daemon } hitcount: 1
4669 - { child_comm: Socket Thread } hitcount: 2
4670 - { child_comm: evolution-alarm } hitcount: 2
4671 - { child_comm: smbd } hitcount: 2
4672 - { child_comm: bash } hitcount: 3
4673 - { child_comm: whoopsie } hitcount: 3
4674 - { child_comm: compiz } hitcount: 3
4675 - { child_comm: evolution-sourc } hitcount: 4
4676 - { child_comm: pool } hitcount: 5
4677 - { child_comm: postgres } hitcount: 6
4678 - { child_comm: firefox } hitcount: 8
4679 - { child_comm: dhclient } hitcount: 10
4680 - { child_comm: emacs } hitcount: 12
4681 - { child_comm: dbus-daemon } hitcount: 20
4682 - { child_comm: nm-dispatcher.a } hitcount: 20
4683 - { child_comm: evolution } hitcount: 35
4684 - { child_comm: glib-pacrunner } hitcount: 59
4691 - To manually continue having the trigger aggregate events, append
4692 - :cont instead. Notice that the trigger info displays as [active]
4693 - again, and the data has changed:
4695 - # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
4696 - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
4698 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
4699 - # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
4701 - { child_comm: dconf worker } hitcount: 1
4702 - { child_comm: dconf worker } hitcount: 1
4703 - { child_comm: kthreadd } hitcount: 1
4704 - { child_comm: gdbus } hitcount: 1
4705 - { child_comm: ibus-daemon } hitcount: 1
4706 - { child_comm: Socket Thread } hitcount: 2
4707 - { child_comm: evolution-alarm } hitcount: 2
4708 - { child_comm: smbd } hitcount: 2
4709 - { child_comm: whoopsie } hitcount: 3
4710 - { child_comm: compiz } hitcount: 3
4711 - { child_comm: evolution-sourc } hitcount: 4
4712 - { child_comm: bash } hitcount: 5
4713 - { child_comm: pool } hitcount: 5
4714 - { child_comm: postgres } hitcount: 6
4715 - { child_comm: firefox } hitcount: 8
4716 - { child_comm: dhclient } hitcount: 11
4717 - { child_comm: emacs } hitcount: 12
4718 - { child_comm: dbus-daemon } hitcount: 22
4719 - { child_comm: nm-dispatcher.a } hitcount: 22
4720 - { child_comm: evolution } hitcount: 35
4721 - { child_comm: glib-pacrunner } hitcount: 59
4728 - The previous example showed how to start and stop a hist trigger by
4729 - appending 'pause' and 'continue' to the hist trigger command. A
4730 - hist trigger can also be started in a paused state by initially
4731 - starting the trigger with ':pause' appended. This allows you to
4732 - start the trigger only when you're ready to start collecting data
4733 - and not before. For example, you could start the trigger in a
4734 - paused state, then unpause it and do something you want to measure,
4735 - then pause the trigger again when done.
4737 - Of course, doing this manually can be difficult and error-prone, but
4738 - it is possible to automatically start and stop a hist trigger based
4739 - on some condition, via the enable_hist and disable_hist triggers.
4741 - For example, suppose we wanted to take a look at the relative
4742 - weights in terms of skb length for each callpath that leads to a
4743 -  netif_receive_skb event when downloading a decent-sized file using
4746 - First we set up an initially paused stacktrace trigger on the
4747 - netif_receive_skb event:
4749 - # echo 'hist:key=stacktrace:vals=len:pause' > \
4750 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4752 - Next, we set up an 'enable_hist' trigger on the sched_process_exec
4753 - event, with an 'if filename==/usr/bin/wget' filter. The effect of
4754 - this new trigger is that it will 'unpause' the hist trigger we just
4755 - set up on netif_receive_skb if and only if it sees a
4756 - sched_process_exec event with a filename of '/usr/bin/wget'. When
4757 - that happens, all netif_receive_skb events are aggregated into a
4758 - hash table keyed on stacktrace:
4760 - # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
4761 - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
4763 -  The aggregation continues until the netif_receive_skb hist trigger is paused
4764 - again, which is what the following disable_hist event does by
4765 - creating a similar setup on the sched_process_exit event, using the
4766 - filter 'comm==wget':
4768 - # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
4769 - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
4771 - Whenever a process exits and the comm field of the disable_hist
4772 - trigger filter matches 'comm==wget', the netif_receive_skb hist
4773 - trigger is disabled.
4775 - The overall effect is that netif_receive_skb events are aggregated
4776 - into the hash table for only the duration of the wget. Executing a
4777 - wget command and then listing the 'hist' file will display the
4778 - output generated by the wget command:
4780 - $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
4782 - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
4783 - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
4786 - __netif_receive_skb_core+0x46d/0x990
4787 - __netif_receive_skb+0x18/0x60
4788 - netif_receive_skb_internal+0x23/0x90
4789 - napi_gro_receive+0xc8/0x100
4790 - ieee80211_deliver_skb+0xd6/0x270 [mac80211]
4791 - ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
4792 - ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
4793 - ieee80211_rx+0x31d/0x900 [mac80211]
4794 - iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
4795 - iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
4796 - iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
4797 - irq_thread_fn+0x20/0x50
4798 - irq_thread+0x11f/0x150
4800 - ret_from_fork+0x42/0x70
4801 - } hitcount: 85 len: 28884
4803 - __netif_receive_skb_core+0x46d/0x990
4804 - __netif_receive_skb+0x18/0x60
4805 - netif_receive_skb_internal+0x23/0x90
4806 - napi_gro_complete+0xa4/0xe0
4807 - dev_gro_receive+0x23a/0x360
4808 - napi_gro_receive+0x30/0x100
4809 - ieee80211_deliver_skb+0xd6/0x270 [mac80211]
4810 - ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
4811 - ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
4812 - ieee80211_rx+0x31d/0x900 [mac80211]
4813 - iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
4814 - iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
4815 - iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
4816 - irq_thread_fn+0x20/0x50
4817 - irq_thread+0x11f/0x150
4819 - } hitcount: 98 len: 664329
4821 - __netif_receive_skb_core+0x46d/0x990
4822 - __netif_receive_skb+0x18/0x60
4823 - process_backlog+0xa8/0x150
4824 - net_rx_action+0x15d/0x340
4825 - __do_softirq+0x114/0x2c0
4826 - do_softirq_own_stack+0x1c/0x30
4827 - do_softirq+0x65/0x70
4828 - __local_bh_enable_ip+0xb5/0xc0
4829 - ip_finish_output+0x1f4/0x840
4830 - ip_output+0x6b/0xc0
4831 - ip_local_out_sk+0x31/0x40
4832 - ip_send_skb+0x1a/0x50
4833 - udp_send_skb+0x173/0x2a0
4834 - udp_sendmsg+0x2bf/0x9f0
4835 - inet_sendmsg+0x64/0xa0
4836 - sock_sendmsg+0x3d/0x50
4837 - } hitcount: 115 len: 13030
4839 - __netif_receive_skb_core+0x46d/0x990
4840 - __netif_receive_skb+0x18/0x60
4841 - netif_receive_skb_internal+0x23/0x90
4842 - napi_gro_complete+0xa4/0xe0
4843 - napi_gro_flush+0x6d/0x90
4844 - iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
4845 - irq_thread_fn+0x20/0x50
4846 - irq_thread+0x11f/0x150
4848 - ret_from_fork+0x42/0x70
4849 - } hitcount: 934 len: 5512212
4856 - The above shows all the netif_receive_skb callpaths and their total
4857 - lengths for the duration of the wget command.
4859 - The 'clear' hist trigger param can be used to clear the hash table.
4860 - Suppose we wanted to try another run of the previous example but
4861 - this time also wanted to see the complete list of events that went
4862 - into the histogram. In order to avoid having to set everything up
4863 - again, we can just clear the histogram first:
4865 - # echo 'hist:key=stacktrace:vals=len:clear' >> \
4866 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4868 - Just to verify that it is in fact cleared, here's what we now see in
4871 - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
4872 - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
4879 -  Since we want to see the detailed list of the netif_receive_skb
4880 -  events occurring during the new run, which are in fact the same
4881 - events being aggregated into the hash table, we add some additional
4882 - 'enable_event' events to the triggering sched_process_exec and
4883 - sched_process_exit events as such:
4885 - # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
4886 - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
4888 - # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
4889 - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
4891 - If you read the trigger files for the sched_process_exec and
4892 - sched_process_exit triggers, you should see two triggers for each:
4893 - one enabling/disabling the hist aggregation and the other
4894 - enabling/disabling the logging of events:
4896 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
4897 - enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
4898 - enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
4900 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
4901 - enable_event:net:netif_receive_skb:unlimited if comm==wget
4902 - disable_hist:net:netif_receive_skb:unlimited if comm==wget
4904 - In other words, whenever either of the sched_process_exec or
4905 - sched_process_exit events is hit and matches 'wget', it enables or
4906 - disables both the histogram and the event log, and what you end up
4907 - with is a hash table and set of events just covering the specified
4908 - duration. Run the wget command again:
4910 - $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
4912 - Displaying the 'hist' file should show something similar to what you
4913 - saw in the last run, but this time you should also see the
4914 - individual events in the trace file:
4916 - # cat /sys/kernel/debug/tracing/trace
4920 - # entries-in-buffer/entries-written: 183/1426 #P:4
4922 - # _-----=> irqs-off
4923 - # / _----=> need-resched
4924 - # | / _---=> hardirq/softirq
4925 - # || / _--=> preempt-depth
4927 - # TASK-PID CPU# |||| TIMESTAMP FUNCTION
4929 - wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
4930 - wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
4931 - dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
4932 - dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
4933 - ##### CPU 2 buffer started ####
4934 - irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
4935 - irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
4936 - irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
4937 - irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
4938 - irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
4943 - The following example demonstrates how multiple hist triggers can be
4944 - attached to a given event. This capability can be useful for
4945 - creating a set of different summaries derived from the same set of
4946 - events, or for comparing the effects of different filters, among other things.
4949 - # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
4950 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4951 - # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
4952 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4953 - # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
4954 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4955 - # echo 'hist:keys=skbaddr.hex:vals=len' >> \
4956 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4957 - # echo 'hist:keys=len:vals=common_preempt_count' >> \
4958 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4960 - The above set of commands creates four triggers differing only in
4961 - their filters, along with a completely different though fairly
4962 - nonsensical trigger. Note that in order to append multiple hist
4963 - triggers to the same file, you should use the '>>' operator to
4964 - append them ('>' will also add the new hist trigger, but will remove
4965 - any existing hist triggers beforehand).
4967 - Displaying the contents of the 'hist' file for the event shows the
4968 - contents of all five histograms:
4970 - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
4974 - # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
4977 - { len: 176 } hitcount: 1 common_preempt_count: 0
4978 - { len: 223 } hitcount: 1 common_preempt_count: 0
4979 - { len: 4854 } hitcount: 1 common_preempt_count: 0
4980 - { len: 395 } hitcount: 1 common_preempt_count: 0
4981 - { len: 177 } hitcount: 1 common_preempt_count: 0
4982 - { len: 446 } hitcount: 1 common_preempt_count: 0
4983 - { len: 1601 } hitcount: 1 common_preempt_count: 0
4987 - { len: 1280 } hitcount: 66 common_preempt_count: 0
4988 - { len: 116 } hitcount: 81 common_preempt_count: 40
4989 - { len: 708 } hitcount: 112 common_preempt_count: 0
4990 - { len: 46 } hitcount: 221 common_preempt_count: 0
4991 - { len: 1264 } hitcount: 458 common_preempt_count: 0
5001 - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
5004 - { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
5005 - { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
5006 - { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
5007 - { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
5008 - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
5009 - { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
5010 - { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
5011 - { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
5012 - { skbaddr: ffff880100065900 } hitcount: 1 len: 46
5013 - { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
5014 - { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
5015 - { skbaddr: ffff880100064700 } hitcount: 1 len: 365
5016 - { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
5020 - { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
5021 - { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
5022 - { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
5023 - { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
5024 - { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
5025 - { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
5026 - { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
5027 - { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
5028 - { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
5038 - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
5050 - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
5053 - { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
5054 - { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
5055 - { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
5056 - { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
5057 - { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
5058 - { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
5059 - { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
5060 - { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
5061 - { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
5062 - { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
5063 - { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
5064 - { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
5074 - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
5083 - Named triggers can be used to have triggers share a common set of
5084 - histogram data. This capability is mostly useful for combining the
5085 - output of events generated by tracepoints contained inside inline
5086 - functions, but names can be used in a hist trigger on any event.
5087 - For example, these two triggers when hit will update the same 'len'
5088 - field in the shared 'foo' histogram data:
5090 - # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
5091 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
5092 - # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
5093 - /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5095 - You can see that they're updating common histogram data by reading
5096 - each event's hist files at the same time:
5098 - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
5099 - cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
5103 - # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
5106 - { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
5107 - { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
5108 - { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
5109 - { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
5110 - { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
5111 - { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
5112 - { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
5113 - { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
5114 - { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
5115 - { skbaddr: ffff880064505000 } hitcount: 1 len: 46
5116 - { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
5117 - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
5118 - { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
5119 - { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
5120 - { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
5121 - { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
5122 - { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
5123 - { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
5124 - { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
5125 - { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
5126 - { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
5127 - { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
5128 - { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
5129 - { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
5130 - { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
5131 - { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
5132 - { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
5133 - { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
5134 - { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
5135 - { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
5136 - { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
5137 - { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
5138 - { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
5139 - { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
5140 - { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
5141 - { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
5142 - { skbaddr: ffff880064504400 } hitcount: 4 len: 184
5143 - { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
5144 - { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
5145 - { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
5146 - { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
5147 - { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
5155 - # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
5158 - { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
5159 - { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
5160 - { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
5161 - { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
5162 - { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
5163 - { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
5164 - { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
5165 - { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
5166 - { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
5167 - { skbaddr: ffff880064505000 } hitcount: 1 len: 46
5168 - { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
5169 - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
5170 - { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
5171 - { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
5172 - { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
5173 - { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
5174 - { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
5175 - { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
5176 - { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
5177 - { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
5178 - { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
5179 - { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
5180 - { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
5181 - { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
5182 - { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
5183 - { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
5184 - { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
5185 - { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
5186 - { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
5187 - { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
5188 - { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
5189 - { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
5190 - { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
5191 - { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
5192 - { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
5193 - { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
5194 - { skbaddr: ffff880064504400 } hitcount: 4 len: 184
5195 - { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
5196 - { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
5197 - { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
5198 - { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
5199 - { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
5206 - And here's an example that shows how to combine histogram data from
5207 - any two events even if they don't share any 'compatible' fields
5208 - other than 'hitcount' and 'stacktrace'. These commands create a
5209 - couple of triggers named 'bar' using those fields:
5211 - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
5212 - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
5213 - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
5214 - /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5216 - And displaying the output of either shows some interesting if
5217 - somewhat confusing output:
5219 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
5220 - # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
5224 - # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
5228 - _do_fork+0x18e/0x330
5229 - kernel_thread+0x29/0x30
5230 - kthreadd+0x154/0x1b0
5231 - ret_from_fork+0x3f/0x70
5234 - netif_rx_internal+0xb2/0xd0
5235 - netif_rx_ni+0x20/0x70
5236 - dev_loopback_xmit+0xaa/0xd0
5237 - ip_mc_output+0x126/0x240
5238 - ip_local_out_sk+0x31/0x40
5239 - igmp_send_report+0x1e9/0x230
5240 - igmp_timer_expire+0xe9/0x120
5241 - call_timer_fn+0x39/0xf0
5242 - run_timer_softirq+0x1e1/0x290
5243 - __do_softirq+0xfd/0x290
5244 - irq_exit+0x98/0xb0
5245 - smp_apic_timer_interrupt+0x4a/0x60
5246 - apic_timer_interrupt+0x6d/0x80
5247 - cpuidle_enter+0x17/0x20
5248 - call_cpuidle+0x3b/0x60
5249 - cpu_startup_entry+0x22d/0x310
5252 - netif_rx_internal+0xb2/0xd0
5253 - netif_rx_ni+0x20/0x70
5254 - dev_loopback_xmit+0xaa/0xd0
5255 - ip_mc_output+0x17f/0x240
5256 - ip_local_out_sk+0x31/0x40
5257 - ip_send_skb+0x1a/0x50
5258 - udp_send_skb+0x13e/0x270
5259 - udp_sendmsg+0x2bf/0x980
5260 - inet_sendmsg+0x67/0xa0
5261 - sock_sendmsg+0x38/0x50
5262 - SYSC_sendto+0xef/0x170
5263 - SyS_sendto+0xe/0x10
5264 - entry_SYSCALL_64_fastpath+0x12/0x6a
5267 - netif_rx_internal+0xb2/0xd0
5268 - netif_rx+0x1c/0x60
5269 - loopback_xmit+0x6c/0xb0
5270 - dev_hard_start_xmit+0x219/0x3a0
5271 - __dev_queue_xmit+0x415/0x4f0
5272 - dev_queue_xmit_sk+0x13/0x20
5273 - ip_finish_output2+0x237/0x340
5274 - ip_finish_output+0x113/0x1d0
5275 - ip_output+0x66/0xc0
5276 - ip_local_out_sk+0x31/0x40
5277 - ip_send_skb+0x1a/0x50
5278 - udp_send_skb+0x16d/0x270
5279 - udp_sendmsg+0x2bf/0x980
5280 - inet_sendmsg+0x67/0xa0
5281 - sock_sendmsg+0x38/0x50
5282 - ___sys_sendmsg+0x14e/0x270
5285 - netif_rx_internal+0xb2/0xd0
5286 - netif_rx+0x1c/0x60
5287 - loopback_xmit+0x6c/0xb0
5288 - dev_hard_start_xmit+0x219/0x3a0
5289 - __dev_queue_xmit+0x415/0x4f0
5290 - dev_queue_xmit_sk+0x13/0x20
5291 - ip_finish_output2+0x237/0x340
5292 - ip_finish_output+0x113/0x1d0
5293 - ip_output+0x66/0xc0
5294 - ip_local_out_sk+0x31/0x40
5295 - ip_send_skb+0x1a/0x50
5296 - udp_send_skb+0x16d/0x270
5297 - udp_sendmsg+0x2bf/0x980
5298 - inet_sendmsg+0x67/0xa0
5299 - sock_sendmsg+0x38/0x50
5300 - ___sys_sendmsg+0x269/0x270
5303 - netif_rx_internal+0xb2/0xd0
5304 - netif_rx+0x1c/0x60
5305 - loopback_xmit+0x6c/0xb0
5306 - dev_hard_start_xmit+0x219/0x3a0
5307 - __dev_queue_xmit+0x415/0x4f0
5308 - dev_queue_xmit_sk+0x13/0x20
5309 - ip_finish_output2+0x237/0x340
5310 - ip_finish_output+0x113/0x1d0
5311 - ip_output+0x66/0xc0
5312 - ip_local_out_sk+0x31/0x40
5313 - ip_send_skb+0x1a/0x50
5314 - udp_send_skb+0x16d/0x270
5315 - udp_sendmsg+0x2bf/0x980
5316 - inet_sendmsg+0x67/0xa0
5317 - sock_sendmsg+0x38/0x50
5318 - SYSC_sendto+0xef/0x170
5321 - _do_fork+0x18e/0x330
5322 - SyS_clone+0x19/0x20
5323 - entry_SYSCALL_64_fastpath+0x12/0x6a
5330 + See Documentation/trace/histogram.txt for details and examples.
5331 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/Documentation/trace/ftrace.txt linux-4.14/Documentation/trace/ftrace.txt
5332 --- linux-4.14.orig/Documentation/trace/ftrace.txt 2017-11-12 19:46:13.000000000 +0100
5333 +++ linux-4.14/Documentation/trace/ftrace.txt 2018-09-05 11:05:07.000000000 +0200
5334 @@ -539,6 +539,30 @@
5336 See events.txt for more information.
5340 + Certain tracers may change the timestamp mode used when
5341 + logging trace events into the event buffer. Events with
5342 + different modes can coexist within a buffer but the mode in
5343 + effect when an event is logged determines which timestamp mode
5344 + is used for that event. The default timestamp mode is 'delta'.
5347 + Usual timestamp modes for tracing:
5349 + # cat timestamp_mode
5352 + The timestamp mode with the square brackets around it is the one in effect.
5355 + delta: Default timestamp mode - timestamp is a delta against
5356 + a per-buffer timestamp.
5358 + absolute: The timestamp is a full timestamp, not a delta
5359 + against some other value. As such it takes up more
5360 + space and is less efficient.
5364 Directory for the Hardware Latency Detector.
5365 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/Documentation/trace/histogram.txt linux-4.14/Documentation/trace/histogram.txt
5366 --- linux-4.14.orig/Documentation/trace/histogram.txt 1970-01-01 01:00:00.000000000 +0100
5367 +++ linux-4.14/Documentation/trace/histogram.txt 2018-09-05 11:05:07.000000000 +0200
5371 + Documentation written by Tom Zanussi
5376 + Histogram triggers are special event triggers that can be used to
5377 + aggregate trace event data into histograms. For information on
5378 + trace events and event triggers, see Documentation/trace/events.txt.
5381 +2. Histogram Trigger Command
5382 +============================
5384 + A histogram trigger command is an event trigger command that
5385 + aggregates event hits into a hash table keyed on one or more trace
5386 + event format fields (or stacktrace) and a set of running totals
5387 + derived from one or more trace event format fields and/or event
5388 + counts (hitcount).
5390 + The format of a hist trigger is as follows:
5392 + hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
5393 + [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
5394 + [:clear][:name=histname1] [if <filter>]
5396 + When a matching event is hit, an entry is added to a hash table
5397 + using the key(s) and value(s) named. Keys and values correspond to
5398 + fields in the event's format description. Values must correspond to
5399 + numeric fields - on an event hit, the value(s) will be added to a
5400 + sum kept for that field. The special string 'hitcount' can be used
5401 + in place of an explicit value field - this is simply a count of
5402 + event hits. If 'values' isn't specified, an implicit 'hitcount'
5403 + value will be automatically created and used as the only value.
5404 + Keys can be any field, or the special string 'stacktrace', which
5405 + will use the event's kernel stacktrace as the key. The keywords
5406 + 'keys' or 'key' can be used to specify keys, and the keywords
5407 + 'values', 'vals', or 'val' can be used to specify values. Compound
5408 + keys consisting of up to two fields can be specified by the 'keys'
5409 + keyword. Hashing a compound key produces a unique entry in the
5410 + table for each unique combination of component keys, and can be
5411 + useful for providing more fine-grained summaries of event data.
5412 + Additionally, sort keys consisting of up to two fields can be
5413 + specified by the 'sort' keyword. If more than one field is
5414 + specified, the result will be a 'sort within a sort': the first key
5415 + is taken to be the primary sort key and the second the secondary
5416 + key. If a hist trigger is given a name using the 'name' parameter,
5417 + its histogram data will be shared with other triggers of the same
5418 + name, and trigger hits will update this common data. Only triggers
5419 + with 'compatible' fields can be combined in this way; triggers are
5420 + 'compatible' if the fields named in the trigger share the same
5421 + number and type of fields and those fields also have the same names.
5422 + Note that any two events always share the compatible 'hitcount' and
5423 + 'stacktrace' fields and can therefore be combined using those
5424 + fields, however pointless that may be.
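For example, a minimal sketch of a compound-key trigger following the rules above, keying the raw_syscalls/sys_enter event on both the executable name and the syscall id, and sorting by id first and hitcount second, might look like:

    # echo 'hist:keys=common_pid.execname,id:vals=hitcount:sort=id,hitcount' > \
          /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger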
5426 + 'hist' triggers add a 'hist' file to each event's subdirectory.
5427 + Reading the 'hist' file for the event will dump the hash table in
5428 + its entirety to stdout. If there are multiple hist triggers
5429 + attached to an event, there will be a table for each trigger in the
5430 + output. The table displayed for a named trigger will be the same as
5431 + any other instance having the same name. Each printed hash table
5432 + entry is a simple list of the keys and values comprising the entry;
5433 + keys are printed first and are delineated by curly braces, and are
5434 + followed by the set of value fields for the entry. By default,
5435 + numeric fields are displayed as base-10 integers. This can be
5436 + modified by appending any of the following modifiers to the field
5439 + .hex display a number as a hex value
5440 + .sym display an address as a symbol
5441 + .sym-offset display an address as a symbol and offset
5442 + .syscall display a syscall id as a system call name
5443 + .execname display a common_pid as a program name
5444 + .log2 display log2 value rather than raw number
5445 + .usecs display a common_timestamp in microseconds
5447 + Note that in general the semantics of a given field aren't
5448 + interpreted when applying a modifier to it, but there are some
5449 + restrictions to be aware of in this regard:
5451 + - only the 'hex' modifier can be used for values (because values
5452 + are essentially sums, and the other modifiers don't make sense in that context).
5454 + - the 'execname' modifier can only be used on a 'common_pid'. The
5455 + reason for this is that the execname is simply the 'comm' value
5456 + saved for the 'current' process when an event was triggered,
5457 + which is the same as the common_pid value saved by the event
5458 + tracing code. Trying to apply that comm value to other pid
5459 + values wouldn't be correct, and typically events that care save
5460 + pid-specific comm fields in the event itself.
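As an illustration of the modifiers listed above, a sketch that buckets kmalloc request sizes by power of two, using the '.log2' modifier on a key field of the kmem/kmalloc event that the examples below also use, might be:

    # echo 'hist:keys=bytes_req.log2:vals=hitcount' > \
          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger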
5462 + A typical usage scenario would be the following to enable a hist
5463 + trigger, read its current contents, and then turn it off:
5465 + # echo 'hist:keys=skbaddr.hex:vals=len' > \
5466 + /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5468 + # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
5470 + # echo '!hist:keys=skbaddr.hex:vals=len' > \
5471 + /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5473 + The trigger file itself can be read to show the details of the
5474 + currently attached hist trigger. This information is also displayed
5475 + at the top of the 'hist' file when read.
5477 + By default, the size of the hash table is 2048 entries. The 'size'
5478 + parameter can be used to specify more or fewer than that. The units
5479 + are in terms of hashtable entries - if a run uses more entries than
5480 + specified, the results will show the number of 'drops', the number
5481 + of hits that were ignored. The size should be a power of 2 between
5482 + 128 and 131072 (any non-power-of-2 number specified will be rounded up).
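For instance, a sketch requesting a larger table for the netif_receive_skb event used elsewhere in this document (4096 being a power of 2 within the allowed range) might be:

    # echo 'hist:keys=skbaddr.hex:vals=len:size=4096' > \
          /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger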
5485 + The 'sort' parameter can be used to specify a value field to sort
5486 + on. The default if unspecified is 'hitcount' and the default sort
5487 + order is 'ascending'. To sort in the opposite direction, append
5488 + '.descending' to the sort key.
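For example, a hypothetical trigger sorting the same netif_receive_skb histogram by total length, largest first, could be written as:

    # echo 'hist:keys=skbaddr.hex:vals=len:sort=len.descending' > \
          /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger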
5490 + The 'pause' parameter can be used to pause an existing hist trigger
5491 + or to start a hist trigger but not log any events until told to do
5492 + so. 'continue' or 'cont' can be used to start or restart a paused hist trigger.
5495 + The 'clear' parameter will clear the contents of a running hist
5496 + trigger and leave its current paused/active state untouched.
5498 + Note that the 'pause', 'cont', and 'clear' parameters should be
5499 + applied using the 'append' shell operator ('>>') if applied to an
5500 + existing trigger, rather than via the '>' operator, which will cause
5501 + the trigger to be removed through truncation.
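A minimal sketch of that pattern, pausing and then resuming a trigger already attached to the netif_receive_skb event, would be:

    # echo 'hist:keys=skbaddr.hex:vals=len:pause' >> \
          /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger

    # echo 'hist:keys=skbaddr.hex:vals=len:cont' >> \
          /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger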
5503 +- enable_hist/disable_hist
5505 + The enable_hist and disable_hist triggers can be used to have one
5506 + event conditionally start and stop another event's already-attached
5507 + hist trigger. Any number of enable_hist and disable_hist triggers
5508 + can be attached to a given event, allowing that event to kick off
5509 + and stop aggregations on a host of other events.
5511 + The format is very similar to the enable/disable_event triggers:
5513 + enable_hist:<system>:<event>[:count]
5514 + disable_hist:<system>:<event>[:count]
5516 + Instead of enabling or disabling the tracing of the target event
5517 + into the trace buffer as the enable/disable_event triggers do, the
5518 + enable/disable_hist triggers enable or disable the aggregation of
5519 + the target event into a hash table.
5521 + A typical usage scenario for the enable_hist/disable_hist triggers
5522 + would be to first set up a paused hist trigger on some event,
5523 + followed by an enable_hist/disable_hist pair that turns the hist
5524 + aggregation on and off when conditions of interest are hit:
5526 + # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
5527 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
5529 + # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
5530 + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
5532 + # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
5533 + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
5535 + The above sets up an initially paused hist trigger which is unpaused
5536 + and starts aggregating events when a given program is executed, and
5537 + which stops aggregating when the process exits and the hist trigger is paused again.
5540 + The examples below provide a more concrete illustration of the
5541 + concepts and typical usage patterns discussed above.
5543 + 'special' event fields
5544 + ------------------------
5546 + There are a number of 'special event fields' available for use as
5547 + keys or values in a hist trigger. These look like and behave as if
5548 + they were actual event fields, but aren't really part of the event's
5549 + field definition or format file. They are however available for any
5550 + event, and can be used anywhere an actual event field could be.
5553 + common_timestamp u64 - timestamp (from ring buffer) associated
5554 + with the event, in nanoseconds. May be
5555 + modified by .usecs to have timestamps
5556 + interpreted as microseconds.
5557 + cpu int - the cpu on which the event occurred.
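As a sketch of how these special fields can be used, the following hypothetical trigger keys a netif_receive_skb histogram on the cpu field and sums the event's len field per cpu:

    # echo 'hist:keys=cpu:vals=len' > \
          /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger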
5559 + Extended error information
5560 + --------------------------
5562 + For some error conditions encountered when invoking a hist trigger
5563 + command, extended error information is available via the
5564 + corresponding event's 'hist' file. Reading the hist file after an
5565 + error will display more detailed information about what went wrong,
5566 + if information is available. This extended error information will
5567 + be available until the next hist trigger command for that event.
5569 + If available for a given error condition, the extended error
5570 + information and usage takes the following form:
5572 + # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
5573 + echo: write error: Invalid argument
5575 + # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist
5576 + ERROR: Couldn't yyy: zzz
5579 +6.2 'hist' trigger examples
5580 +---------------------------
5582 + The first set of examples creates aggregations using the kmalloc
5583 + event. The fields that can be used for the hist trigger are listed
5584 + in the kmalloc event's format file:
5586 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
5590 + field:unsigned short common_type; offset:0; size:2; signed:0;
5591 + field:unsigned char common_flags; offset:2; size:1; signed:0;
5592 + field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
5593 + field:int common_pid; offset:4; size:4; signed:1;
5595 + field:unsigned long call_site; offset:8; size:8; signed:0;
5596 + field:const void * ptr; offset:16; size:8; signed:0;
5597 + field:size_t bytes_req; offset:24; size:8; signed:0;
5598 + field:size_t bytes_alloc; offset:32; size:8; signed:0;
5599 + field:gfp_t gfp_flags; offset:40; size:4; signed:0;
5601 + We'll start by creating a hist trigger that generates a simple table
5602 + that lists the total number of bytes requested for each function in
5603 + the kernel that made one or more calls to kmalloc:
5605 + # echo 'hist:key=call_site:val=bytes_req' > \
5606 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5608 + This tells the tracing system to create a 'hist' trigger using the
5609 + call_site field of the kmalloc event as the key for the table, which
5610 + just means that each unique call_site address will have an entry
5611 + created for it in the table. The 'val=bytes_req' parameter tells
5612 + the hist trigger that for each unique entry (call_site) in the
5613 + table, it should keep a running total of the number of bytes
5614 + requested by that call_site.
5616 + We'll let it run for a while and then dump the contents of the 'hist'
5617 + file in the kmalloc event's subdirectory (for readability, a number
5618 + of entries have been omitted):
5620 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5621 + # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
5623 + { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
5624 + { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
5625 + { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
5626 + { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
5627 + { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
5628 + { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
5629 + { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
5630 + { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
5631 + { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
5632 + { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
5636 + { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
5637 + { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
5638 + { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
5639 + { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
5640 + { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
5641 + { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
5642 + { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
5643 + { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
5644 + { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
5645 + { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
5646 + { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
5647 + { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
5654 + The output displays a line for each entry, beginning with the key
5655 + specified in the trigger, followed by the value(s) also specified in
5656 + the trigger. At the beginning of the output is a line that displays
5657 + the trigger info, which can also be displayed by reading the 'trigger' file:
5660 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5661 + hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
5663 + At the end of the output are a few lines that display the overall
5664 + totals for the run. The 'Hits' field shows the total number of
5665 + times the event trigger was hit, the 'Entries' field shows the total
5666 + number of used entries in the hash table, and the 'Dropped' field
5667 + shows the number of hits that were dropped because the number of
5668 + used entries for the run exceeded the maximum number of entries
5669 + allowed for the table (normally 0, but if not, a hint that you may
5670 + want to increase the size of the table using the 'size' parameter).
5672 + Notice in the above output that there's an extra field, 'hitcount',
5673 + which wasn't specified in the trigger. Also notice that in the
5674 + trigger info output, there's a parameter, 'sort=hitcount', which
5675 + wasn't specified in the trigger either. The reason for that is that
5676 + every trigger implicitly keeps a count of the total number of hits
5677 + attributed to a given entry, called the 'hitcount'. That hitcount
5678 + information is explicitly displayed in the output, and in the
5679 + absence of a user-specified sort parameter, is used as the default sort key.
5682 + The value 'hitcount' can be used in place of an explicit value in
5683 + the 'values' parameter if you don't really need to have any
5684 + particular field summed and are mainly interested in hit frequencies.
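A sketch of that usage, simply counting kmalloc hits per call site without summing any other field, might look like:

    # echo 'hist:keys=call_site.hex:vals=hitcount' > \
          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger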
5687 + To turn the hist trigger off, simply call up the trigger in the
5688 + command history and re-execute it with a '!' prepended:
5690 + # echo '!hist:key=call_site:val=bytes_req' > \
5691 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5693 + Finally, notice that the call_site as displayed in the output above
5694 + isn't really very useful. It's an address, but normally addresses
5695 + are displayed in hex. To have a numeric field displayed as a hex
5696 + value, simply append '.hex' to the field name in the trigger:
5698 + # echo 'hist:key=call_site.hex:val=bytes_req' > \
5699 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5701 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5702 + # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
5704 + { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
5705 + { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
5706 + { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
5707 + { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
5708 + { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
5709 + { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
5710 + { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
5711 + { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
5712 + { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
5713 + { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
5714 + { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
5715 + { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
5719 + { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
5720 + { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
5721 + { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
5722 + { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
5723 + { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
5724 + { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
5725 + { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
5726 + { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
5727 + { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
5728 + { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
5729 + { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
5736 + Even that's only marginally more useful - while hex values do look
5737 + more like addresses, what users are typically more interested in
5738 + when looking at text addresses are the corresponding symbols
5739 + instead. To have an address displayed as a symbolic value instead,
5740 + simply append '.sym' or '.sym-offset' to the field name in the trigger:
5743 + # echo 'hist:key=call_site.sym:val=bytes_req' > \
5744 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5746 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5747 + # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
5749 + { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
5750 + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
5751 + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
5752 + { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
5753 + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
5754 + { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
5755 + { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
5756 + { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
5757 + { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
5758 + { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
5759 + { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
5760 + { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
5761 + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
5762 + { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
5766 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
5767 + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
5768 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
5769 + { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
5770 + { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
5771 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
5772 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
5773 + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
5774 + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
5775 + { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
5776 + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
5777 + { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
5778 + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
5785 + Because the default sort key above is 'hitcount', the above shows
5786 + the list of call_sites by increasing hitcount, so that at the bottom
5787 + we see the functions that made the most kmalloc calls during the
5788 + run. If instead we wanted to see the top kmalloc callers in
5789 + terms of the number of bytes requested rather than the number of
5790 + calls, and we wanted the top caller to appear at the top, we can use
5791 + the 'sort' parameter, along with the 'descending' modifier:
5793 + # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
5794 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5796 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5797 + # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
5799 + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
5800 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
5801 + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
5802 + { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
5803 + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
5804 + { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
5805 + { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
5806 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
5807 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
5808 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
5809 + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
5810 + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
5811 + { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
5815 + { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
5816 + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
5817 + { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
5818 + { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
5819 + { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
5820 + { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
5821 + { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
5822 + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
5823 + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
5824 + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
5831 + To display the offset and size information in addition to the symbol
5832 + name, just use 'sym-offset' instead:
5834 + # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
5835 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5837 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5838 + # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
5840 + { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
5841 + { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
5842 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
5843 + { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
5844 + { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
5845 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
5846 + { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
5847 + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
5848 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
5849 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
5850 + { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
5851 + { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
5855 + { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
5856 + { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
5857 + { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
5858 + { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
5859 + { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
5860 + { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
5861 + { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
5868 + We can also add multiple fields to the 'values' parameter. For
5869 + example, we might want to see the total number of bytes allocated
5870 + alongside bytes requested, and display the result sorted by bytes
5871 + allocated in a descending order:
5873 + # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
5874 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5876 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5877 + # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
5879 + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
5880 + { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
5881 + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
5882 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
5883 + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
5884 + { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
5885 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
5886 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
5887 + { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
5888 + { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
5889 + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
5890 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
5894 + { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
5895 + { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
5896 + { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
5897 + { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
5898 + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
5899 + { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
5900 + { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
5901 + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
5902 + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
5903 + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
5910 + Finally, to finish off our kmalloc example, instead of simply having
5911 + the hist trigger display symbolic call_sites, we can have the hist
5912 + trigger additionally display the complete set of kernel stack traces
5913 + that led to each call_site. To do that, we simply use the special
5914 + value 'stacktrace' for the key parameter:
5916 + # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
5917 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5919 + The above trigger will use the kernel stack trace in effect when an
5920 + event is triggered as the key for the hash table. This allows the
5921 + enumeration of every kernel callpath that led up to a particular
5922 + event, along with a running total of any of the event fields for
5923 + that event. Here we tally bytes requested and bytes allocated for
5924 + every callpath in the system that led up to a kmalloc (in this case
5925 + every callpath to a kmalloc for a kernel compile):
5927 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5928 + # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
5931 + __kmalloc_track_caller+0x10b/0x1a0
5933 + hidraw_report_event+0x8a/0x120 [hid]
5934 + hid_report_raw_event+0x3ea/0x440 [hid]
5935 + hid_input_report+0x112/0x190 [hid]
5936 + hid_irq_in+0xc2/0x260 [usbhid]
5937 + __usb_hcd_giveback_urb+0x72/0x120
5938 + usb_giveback_urb_bh+0x9e/0xe0
5939 + tasklet_hi_action+0xf8/0x100
5940 + __do_softirq+0x114/0x2c0
5941 + irq_exit+0xa5/0xb0
5943 + ret_from_intr+0x0/0x30
5944 + cpuidle_enter+0x17/0x20
5945 + cpu_startup_entry+0x315/0x3e0
5946 + rest_init+0x7c/0x80
5947 + } hitcount: 3 bytes_req: 21 bytes_alloc: 24
5949 + __kmalloc_track_caller+0x10b/0x1a0
5951 + hidraw_report_event+0x8a/0x120 [hid]
5952 + hid_report_raw_event+0x3ea/0x440 [hid]
5953 + hid_input_report+0x112/0x190 [hid]
5954 + hid_irq_in+0xc2/0x260 [usbhid]
5955 + __usb_hcd_giveback_urb+0x72/0x120
5956 + usb_giveback_urb_bh+0x9e/0xe0
5957 + tasklet_hi_action+0xf8/0x100
5958 + __do_softirq+0x114/0x2c0
5959 + irq_exit+0xa5/0xb0
5961 + ret_from_intr+0x0/0x30
5962 + } hitcount: 3 bytes_req: 21 bytes_alloc: 24
5964 + kmem_cache_alloc_trace+0xeb/0x150
5965 + aa_alloc_task_context+0x27/0x40
5966 + apparmor_cred_prepare+0x1f/0x50
5967 + security_prepare_creds+0x16/0x20
5968 + prepare_creds+0xdf/0x1a0
5969 + SyS_capset+0xb5/0x200
5970 + system_call_fastpath+0x12/0x6a
5971 + } hitcount: 1 bytes_req: 32 bytes_alloc: 32
5976 + __kmalloc+0x11b/0x1b0
5977 + i915_gem_execbuffer2+0x6c/0x2c0 [i915]
5978 + drm_ioctl+0x349/0x670 [drm]
5979 + do_vfs_ioctl+0x2f0/0x4f0
5980 + SyS_ioctl+0x81/0xa0
5981 + system_call_fastpath+0x12/0x6a
5982 + } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
5984 + __kmalloc+0x11b/0x1b0
5985 + load_elf_phdrs+0x76/0xa0
5986 + load_elf_binary+0x102/0x1650
5987 + search_binary_handler+0x97/0x1d0
5988 + do_execveat_common.isra.34+0x551/0x6e0
5989 + SyS_execve+0x3a/0x50
5990 + return_from_execve+0x0/0x23
5991 + } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
5993 + kmem_cache_alloc_trace+0xeb/0x150
5994 + apparmor_file_alloc_security+0x27/0x40
5995 + security_file_alloc+0x16/0x20
5996 + get_empty_filp+0x93/0x1c0
5997 + path_openat+0x31/0x5f0
5998 + do_filp_open+0x3a/0x90
5999 + do_sys_open+0x128/0x220
6000 + SyS_open+0x1e/0x20
6001 + system_call_fastpath+0x12/0x6a
6002 + } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
6004 + __kmalloc+0x11b/0x1b0
6005 + seq_buf_alloc+0x1b/0x50
6006 + seq_read+0x2cc/0x370
6007 + proc_reg_read+0x3d/0x80
6008 + __vfs_read+0x28/0xe0
6009 + vfs_read+0x86/0x140
6010 + SyS_read+0x46/0xb0
6011 + system_call_fastpath+0x12/0x6a
6012 + } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
6019 + If you key a hist trigger on common_pid, for example in order to
6020 + gather and display sorted totals for each process, you can use the
6021 + special .execname modifier to display the executable names for the
6022 + processes in the table rather than raw pids. The example below
6023 + keeps a per-process sum of total bytes read:
6025 + # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
6026 + /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
6028 + # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
6029 + # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
6031 + { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
6032 + { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
6033 + { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
6034 + { common_pid: bash [ 8710] } hitcount: 3 count: 66369
6035 + { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
6036 + { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
6037 + { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
6038 + { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
6039 + { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
6040 + { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
6041 + { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
6045 + { common_pid: postgres [ 1892] } hitcount: 2 count: 32
6046 + { common_pid: postgres [ 1891] } hitcount: 2 count: 32
6047 + { common_pid: gmain [ 8704] } hitcount: 2 count: 32
6048 + { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
6049 + { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
6050 + { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
6051 + { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
6052 + { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
6053 + { common_pid: init [ 1] } hitcount: 2 count: 2
6060 + Similarly, if you key a hist trigger on syscall id, for example to
6061 + gather and display a list of systemwide syscall hits, you can use
6062 + the special .syscall modifier to display the syscall names rather
6063 + than raw ids. The example below keeps a running total of syscall
6064 + counts for the system during the run:
6066 + # echo 'hist:key=id.syscall:val=hitcount' > \
6067 + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
6069 + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
6070 + # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
6072 + { id: sys_fsync [ 74] } hitcount: 1
6073 + { id: sys_newuname [ 63] } hitcount: 1
6074 + { id: sys_prctl [157] } hitcount: 1
6075 + { id: sys_statfs [137] } hitcount: 1
6076 + { id: sys_symlink [ 88] } hitcount: 1
6077 + { id: sys_sendmmsg [307] } hitcount: 1
6078 + { id: sys_semctl [ 66] } hitcount: 1
6079 + { id: sys_readlink [ 89] } hitcount: 3
6080 + { id: sys_bind [ 49] } hitcount: 3
6081 + { id: sys_getsockname [ 51] } hitcount: 3
6082 + { id: sys_unlink [ 87] } hitcount: 3
6083 + { id: sys_rename [ 82] } hitcount: 4
6084 + { id: unknown_syscall [ 58] } hitcount: 4
6085 + { id: sys_connect [ 42] } hitcount: 4
6086 + { id: sys_getpid [ 39] } hitcount: 4
6090 + { id: sys_rt_sigprocmask [ 14] } hitcount: 952
6091 + { id: sys_futex [202] } hitcount: 1534
6092 + { id: sys_write [ 1] } hitcount: 2689
6093 + { id: sys_setitimer [ 38] } hitcount: 2797
6094 + { id: sys_read [ 0] } hitcount: 3202
6095 + { id: sys_select [ 23] } hitcount: 3773
6096 + { id: sys_writev [ 20] } hitcount: 4531
6097 + { id: sys_poll [ 7] } hitcount: 8314
6098 + { id: sys_recvmsg [ 47] } hitcount: 13738
6099 + { id: sys_ioctl [ 16] } hitcount: 21843
6106 + The syscall counts above provide a rough overall picture of system
6107 + call activity on the system; we can see for example that the most
6108 + popular system call on this system was the 'sys_ioctl' system call.
6110 + We can use 'compound' keys to refine that number and provide some
6111 + further insight as to which processes exactly contribute to the
6112 + overall ioctl count.
6114 + The command below keeps a hitcount for every unique combination of
6115 + system call id and pid - the end result is essentially a table
6116 + that keeps a per-pid sum of system call hits. The results are
6117 + sorted using the system call id as the primary key, and the
6118 + hitcount sum as the secondary key:
6120 + # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
6121 + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
6123 + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
6124 + # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
6126 + { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
6127 + { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
6128 + { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
6129 + { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
6130 + { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
6131 + { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
6132 + { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
6133 + { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
6134 + { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
6135 + { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
6139 + { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
6140 + { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
6141 + { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
6142 + { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
6143 + { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
6147 + { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
6148 + { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
6149 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
6150 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
6151 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
6152 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
6153 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
6154 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
6155 + { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
6156 + { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
6157 + { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
6164 + The above list does give us a breakdown of the ioctl syscall by
6165 + pid, but it also gives us quite a bit more than that, which we
6166 + don't really care about at the moment. Since we know the syscall
6167 + id for sys_ioctl (16, displayed next to the sys_ioctl name), we
6168 + can use that to filter out all the other syscalls:
6170 + # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
6171 + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
6173 + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
6174 + # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
6176 + { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
6177 + { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
6178 + { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
6179 + { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
6180 + { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
6181 + { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
6182 + { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
6183 + { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
6184 + { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
6188 + { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
6189 + { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
6190 + { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
6191 + { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
6192 + { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
6193 + { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
6200 + The above output shows that 'compiz' and 'Xorg' are far and away
6201 + the heaviest ioctl callers (which might lead to questions about
6202 + whether they really need to be making all those calls and to
6203 + possible avenues for further investigation.)
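+ As one possible next step, we could narrow the same compound-key
+ histogram down to just those two processes (reusing the pids seen
+ in the run above) in order to compare their full per-syscall
+ breakdowns - for example:
+
+ # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=hitcount if common_pid == 1267 || common_pid == 2994' > \
+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger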
6205 + The compound key examples used a key and a sum value (hitcount) to
6206 + sort the output, but we can just as easily use two keys instead.
6207 + Here's an example where we use a compound key composed of the
6208 + common_pid and size event fields. Sorting with pid as the primary
6209 + key and 'size' as the secondary key allows us to display an
6210 + ordered summary of the recvfrom sizes, with counts, received by each process:
6213 + # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
6214 + /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
6216 + # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
6217 + # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
6219 + { common_pid: smbd [ 784], size: 4 } hitcount: 1
6220 + { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
6221 + { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
6222 + { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
6223 + { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
6224 + { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
6225 + { common_pid: compiz [ 2994], size: 8 } hitcount: 1
6226 + { common_pid: compiz [ 2994], size: 20 } hitcount: 11
6227 + { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
6228 + { common_pid: firefox [ 8817], size: 4 } hitcount: 1
6229 + { common_pid: firefox [ 8817], size: 8 } hitcount: 5
6230 + { common_pid: firefox [ 8817], size: 588 } hitcount: 2
6231 + { common_pid: firefox [ 8817], size: 628 } hitcount: 1
6232 + { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
6233 + { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
6234 + { common_pid: firefox [ 8822], size: 8 } hitcount: 2
6235 + { common_pid: firefox [ 8822], size: 160 } hitcount: 2
6236 + { common_pid: firefox [ 8822], size: 320 } hitcount: 2
6237 + { common_pid: firefox [ 8822], size: 352 } hitcount: 1
6241 + { common_pid: pool [ 8923], size: 1960 } hitcount: 10
6242 + { common_pid: pool [ 8923], size: 2048 } hitcount: 10
6243 + { common_pid: pool [ 8924], size: 1960 } hitcount: 10
6244 + { common_pid: pool [ 8924], size: 2048 } hitcount: 10
6245 + { common_pid: pool [ 8928], size: 1964 } hitcount: 4
6246 + { common_pid: pool [ 8928], size: 1965 } hitcount: 2
6247 + { common_pid: pool [ 8928], size: 2048 } hitcount: 6
6248 + { common_pid: pool [ 8929], size: 1982 } hitcount: 1
6249 + { common_pid: pool [ 8929], size: 2048 } hitcount: 1
6256 + The above example also illustrates the fact that although a compound
6257 + key is treated as a single entity for hashing purposes, the sub-keys
6258 + it's composed of can be accessed independently.
6260 + The next example uses a string field as the hash key and
6261 + demonstrates how you can manually pause and continue a hist trigger.
6262 + In this example, we'll aggregate fork counts and don't expect a
6263 + large number of entries in the hash table, so we'll drop the table
6264 + size to a much smaller number, say 256:
6266 + # echo 'hist:key=child_comm:val=hitcount:size=256' > \
6267 + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6269 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6270 + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
6272 + { child_comm: dconf worker } hitcount: 1
6273 + { child_comm: ibus-daemon } hitcount: 1
6274 + { child_comm: whoopsie } hitcount: 1
6275 + { child_comm: smbd } hitcount: 1
6276 + { child_comm: gdbus } hitcount: 1
6277 + { child_comm: kthreadd } hitcount: 1
6278 + { child_comm: dconf worker } hitcount: 1
6279 + { child_comm: evolution-alarm } hitcount: 2
6280 + { child_comm: Socket Thread } hitcount: 2
6281 + { child_comm: postgres } hitcount: 2
6282 + { child_comm: bash } hitcount: 3
6283 + { child_comm: compiz } hitcount: 3
6284 + { child_comm: evolution-sourc } hitcount: 4
6285 + { child_comm: dhclient } hitcount: 4
6286 + { child_comm: pool } hitcount: 5
6287 + { child_comm: nm-dispatcher.a } hitcount: 8
6288 + { child_comm: firefox } hitcount: 8
6289 + { child_comm: dbus-daemon } hitcount: 8
6290 + { child_comm: glib-pacrunner } hitcount: 10
6291 + { child_comm: evolution } hitcount: 23
6298 + If we want to pause the hist trigger, we can simply append :pause to
6299 + the command that started the trigger. Notice that the trigger info
6300 + displays as [paused]:
6302 + # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
6303 + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6305 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6306 + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
6308 + { child_comm: dconf worker } hitcount: 1
6309 + { child_comm: kthreadd } hitcount: 1
6310 + { child_comm: dconf worker } hitcount: 1
6311 + { child_comm: gdbus } hitcount: 1
6312 + { child_comm: ibus-daemon } hitcount: 1
6313 + { child_comm: Socket Thread } hitcount: 2
6314 + { child_comm: evolution-alarm } hitcount: 2
6315 + { child_comm: smbd } hitcount: 2
6316 + { child_comm: bash } hitcount: 3
6317 + { child_comm: whoopsie } hitcount: 3
6318 + { child_comm: compiz } hitcount: 3
6319 + { child_comm: evolution-sourc } hitcount: 4
6320 + { child_comm: pool } hitcount: 5
6321 + { child_comm: postgres } hitcount: 6
6322 + { child_comm: firefox } hitcount: 8
6323 + { child_comm: dhclient } hitcount: 10
6324 + { child_comm: emacs } hitcount: 12
6325 + { child_comm: dbus-daemon } hitcount: 20
6326 + { child_comm: nm-dispatcher.a } hitcount: 20
6327 + { child_comm: evolution } hitcount: 35
6328 + { child_comm: glib-pacrunner } hitcount: 59
6335 + To manually continue having the trigger aggregate events, append
6336 + :cont instead. Notice that the trigger info displays as [active]
6337 + again, and the data has changed:
6339 + # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
6340 + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6342 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6343 + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
6345 + { child_comm: dconf worker } hitcount: 1
6346 + { child_comm: dconf worker } hitcount: 1
6347 + { child_comm: kthreadd } hitcount: 1
6348 + { child_comm: gdbus } hitcount: 1
6349 + { child_comm: ibus-daemon } hitcount: 1
6350 + { child_comm: Socket Thread } hitcount: 2
6351 + { child_comm: evolution-alarm } hitcount: 2
6352 + { child_comm: smbd } hitcount: 2
6353 + { child_comm: whoopsie } hitcount: 3
6354 + { child_comm: compiz } hitcount: 3
6355 + { child_comm: evolution-sourc } hitcount: 4
6356 + { child_comm: bash } hitcount: 5
6357 + { child_comm: pool } hitcount: 5
6358 + { child_comm: postgres } hitcount: 6
6359 + { child_comm: firefox } hitcount: 8
6360 + { child_comm: dhclient } hitcount: 11
6361 + { child_comm: emacs } hitcount: 12
6362 + { child_comm: dbus-daemon } hitcount: 22
6363 + { child_comm: nm-dispatcher.a } hitcount: 22
6364 + { child_comm: evolution } hitcount: 35
6365 + { child_comm: glib-pacrunner } hitcount: 59
6372 + The previous example showed how to start and stop a hist trigger by
6373 + appending 'pause' and 'continue' to the hist trigger command. A
6374 + hist trigger can also be started in a paused state by initially
6375 + starting the trigger with ':pause' appended. This allows you to
6376 + start the trigger only when you're ready to start collecting data
6377 + and not before. For example, you could start the trigger in a
6378 + paused state, then unpause it and do something you want to measure,
6379 + then pause the trigger again when done.
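+ Concretely, that manual sequence might look like this, reusing the
+ sched_process_fork trigger from the earlier examples (only the
+ ':pause'/':cont' suffix changes between the steps):
+
+ # echo 'hist:key=child_comm:val=hitcount:size=256:pause' > \
+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
+
+ ... get the workload you want to measure ready ...
+
+ # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
+
+ ... run the workload ...
+
+ # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger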
6381 + Of course, doing this manually can be difficult and error-prone, but
6382 + it is possible to automatically start and stop a hist trigger based
6383 + on some condition, via the enable_hist and disable_hist triggers.
6385 + For example, suppose we wanted to take a look at the relative
6386 + weights in terms of skb length for each callpath that leads to a
6387 + netif_receive_skb event when downloading a decent-sized file using wget.
6390 + First we set up an initially paused stacktrace trigger on the
6391 + netif_receive_skb event:
6393 + # echo 'hist:key=stacktrace:vals=len:pause' > \
6394 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6396 + Next, we set up an 'enable_hist' trigger on the sched_process_exec
6397 + event, with an 'if filename==/usr/bin/wget' filter. The effect of
6398 + this new trigger is that it will 'unpause' the hist trigger we just
6399 + set up on netif_receive_skb if and only if it sees a
6400 + sched_process_exec event with a filename of '/usr/bin/wget'. When
6401 + that happens, all netif_receive_skb events are aggregated into a
6402 + hash table keyed on stacktrace:
6404 + # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
6405 + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
6407 + The aggregation continues until the netif_receive_skb hist trigger
6408 + is paused again, which is what the following disable_hist trigger does by
6409 + creating a similar setup on the sched_process_exit event, using the
6410 + filter 'comm==wget':
6412 + # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
6413 + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
6415 + Whenever a process exits and the comm field of the disable_hist
6416 + trigger filter matches 'comm==wget', the netif_receive_skb hist
6417 + trigger is disabled.
6419 + The overall effect is that netif_receive_skb events are aggregated
6420 + into the hash table for only the duration of the wget. Executing a
6421 + wget command and then listing the 'hist' file will display the
6422 + output generated by the wget command:
6424 + $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
6426 + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
6427 + # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
6430 + __netif_receive_skb_core+0x46d/0x990
6431 + __netif_receive_skb+0x18/0x60
6432 + netif_receive_skb_internal+0x23/0x90
6433 + napi_gro_receive+0xc8/0x100
6434 + ieee80211_deliver_skb+0xd6/0x270 [mac80211]
6435 + ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
6436 + ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
6437 + ieee80211_rx+0x31d/0x900 [mac80211]
6438 + iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
6439 + iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
6440 + iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
6441 + irq_thread_fn+0x20/0x50
6442 + irq_thread+0x11f/0x150
6444 + ret_from_fork+0x42/0x70
6445 + } hitcount: 85 len: 28884
6447 + __netif_receive_skb_core+0x46d/0x990
6448 + __netif_receive_skb+0x18/0x60
6449 + netif_receive_skb_internal+0x23/0x90
6450 + napi_gro_complete+0xa4/0xe0
6451 + dev_gro_receive+0x23a/0x360
6452 + napi_gro_receive+0x30/0x100
6453 + ieee80211_deliver_skb+0xd6/0x270 [mac80211]
6454 + ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
6455 + ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
6456 + ieee80211_rx+0x31d/0x900 [mac80211]
6457 + iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
6458 + iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
6459 + iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
6460 + irq_thread_fn+0x20/0x50
6461 + irq_thread+0x11f/0x150
6463 + } hitcount: 98 len: 664329
6465 + __netif_receive_skb_core+0x46d/0x990
6466 + __netif_receive_skb+0x18/0x60
6467 + process_backlog+0xa8/0x150
6468 + net_rx_action+0x15d/0x340
6469 + __do_softirq+0x114/0x2c0
6470 + do_softirq_own_stack+0x1c/0x30
6471 + do_softirq+0x65/0x70
6472 + __local_bh_enable_ip+0xb5/0xc0
6473 + ip_finish_output+0x1f4/0x840
6474 + ip_output+0x6b/0xc0
6475 + ip_local_out_sk+0x31/0x40
6476 + ip_send_skb+0x1a/0x50
6477 + udp_send_skb+0x173/0x2a0
6478 + udp_sendmsg+0x2bf/0x9f0
6479 + inet_sendmsg+0x64/0xa0
6480 + sock_sendmsg+0x3d/0x50
6481 + } hitcount: 115 len: 13030
6483 + __netif_receive_skb_core+0x46d/0x990
6484 + __netif_receive_skb+0x18/0x60
6485 + netif_receive_skb_internal+0x23/0x90
6486 + napi_gro_complete+0xa4/0xe0
6487 + napi_gro_flush+0x6d/0x90
6488 + iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
6489 + irq_thread_fn+0x20/0x50
6490 + irq_thread+0x11f/0x150
6492 + ret_from_fork+0x42/0x70
6493 + } hitcount: 934 len: 5512212
6500 + The above shows all the netif_receive_skb callpaths and their total
6501 + lengths for the duration of the wget command.
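+ The histogram above is sorted on hitcount (the default); it could
+ just as easily have been set up to sort on the total length instead,
+ by specifying 'sort=len' when creating it - for instance:
+
+ # echo 'hist:key=stacktrace:vals=len:sort=len:pause' > \
+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger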
6503 + The 'clear' hist trigger param can be used to clear the hash table.
6504 + Suppose we wanted to try another run of the previous example but
6505 + this time also wanted to see the complete list of events that went
6506 + into the histogram. In order to avoid having to set everything up
6507 + again, we can just clear the histogram first:
6509 + # echo 'hist:key=stacktrace:vals=len:clear' >> \
6510 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6512 + Just to verify that it is in fact cleared, here's what we now see in the 'hist' file:
6515 + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
6516 + # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
6523 + Since we want to see the detailed list of all the netif_receive_skb
6524 + events occurring during the new run - which are in fact the same
6525 + events being aggregated into the hash table - we add some additional
6526 + 'enable_event' triggers to the triggering sched_process_exec and
6527 + sched_process_exit events, as follows:
6529 + # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
6530 + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
6532 + # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
6533 + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
6535 + If you read the trigger files for the sched_process_exec and
6536 + sched_process_exit triggers, you should see two triggers for each:
6537 + one enabling/disabling the hist aggregation and the other
6538 + enabling/disabling the logging of events:
6540 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
6541 + enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
6542 + enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
6544 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
6545 + enable_event:net:netif_receive_skb:unlimited if comm==wget
6546 + disable_hist:net:netif_receive_skb:unlimited if comm==wget
6548 + In other words, whenever either of the sched_process_exec or
6549 + sched_process_exit events is hit and matches 'wget', it enables or
6550 + disables both the histogram and the event log, and what you end up
6551 + with is a hash table and set of events just covering the specified
6552 + duration. Run the wget command again:
6554 + $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
6556 + Displaying the 'hist' file should show something similar to what you
6557 + saw in the last run, but this time you should also see the
6558 + individual events in the trace file:
6560 + # cat /sys/kernel/debug/tracing/trace
6564 + # entries-in-buffer/entries-written: 183/1426 #P:4
6566 + # _-----=> irqs-off
6567 + # / _----=> need-resched
6568 + # | / _---=> hardirq/softirq
6569 + # || / _--=> preempt-depth
6571 + # TASK-PID CPU# |||| TIMESTAMP FUNCTION
6573 + wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
6574 + wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
6575 + dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
6576 + dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
6577 + ##### CPU 2 buffer started ####
6578 + irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
6579 + irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
6580 + irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
6581 + irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
6582 + irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
6587 + The following example demonstrates how multiple hist triggers can be
6588 + attached to a given event. This capability can be useful for
6589 + creating a set of different summaries derived from the same set of
6590 + events, or for comparing the effects of different filters, among other things.
6593 + # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
6594 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6595 + # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
6596 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6597 + # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
6598 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6599 + # echo 'hist:keys=skbaddr.hex:vals=len' >> \
6600 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6601 + # echo 'hist:keys=len:vals=common_preempt_count' >> \
6602 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6604 + The above set of commands creates four triggers differing only in
6605 + their filters, along with a completely different though fairly
6606 + nonsensical trigger. Note that in order to append multiple hist
6607 + triggers to the same file, you should use the '>>' operator to
6608 + append them ('>' will also add the new hist trigger, but will remove
6609 + any existing hist triggers beforehand).
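+ An individual hist trigger can later be removed by echoing the same
+ trigger specification prefixed with '!'. For instance, to drop just
+ the 'nonsensical' trigger from the set above while leaving the other
+ four in place:
+
+ # echo '!hist:keys=len:vals=common_preempt_count' >> \
+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger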
6611 + Displaying the contents of the 'hist' file for the event shows the
6612 + contents of all five histograms:
6614 + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
6618 + # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
6621 + { len: 176 } hitcount: 1 common_preempt_count: 0
6622 + { len: 223 } hitcount: 1 common_preempt_count: 0
6623 + { len: 4854 } hitcount: 1 common_preempt_count: 0
6624 + { len: 395 } hitcount: 1 common_preempt_count: 0
6625 + { len: 177 } hitcount: 1 common_preempt_count: 0
6626 + { len: 446 } hitcount: 1 common_preempt_count: 0
6627 + { len: 1601 } hitcount: 1 common_preempt_count: 0
6631 + { len: 1280 } hitcount: 66 common_preempt_count: 0
6632 + { len: 116 } hitcount: 81 common_preempt_count: 40
6633 + { len: 708 } hitcount: 112 common_preempt_count: 0
6634 + { len: 46 } hitcount: 221 common_preempt_count: 0
6635 + { len: 1264 } hitcount: 458 common_preempt_count: 0
6645 + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
6648 + { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
6649 + { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
6650 + { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
6651 + { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
6652 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
6653 + { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
6654 + { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
6655 + { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
6656 + { skbaddr: ffff880100065900 } hitcount: 1 len: 46
6657 + { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
6658 + { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
6659 + { skbaddr: ffff880100064700 } hitcount: 1 len: 365
6660 + { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
6664 + { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
6665 + { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
6666 + { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
6667 + { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
6668 + { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
6669 + { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
6670 + { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
6671 + { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
6672 + { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
6682 + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
6694 + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
6697 + { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
6698 + { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
6699 + { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
6700 + { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
6701 + { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
6702 + { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
6703 + { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
6704 + { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
6705 + { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
6706 + { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
6707 + { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
6708 + { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
6718 + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
6727 + Named triggers can be used to have triggers share a common set of
6728 + histogram data. This capability is mostly useful for combining the
6729 + output of events generated by tracepoints contained inside inline
6730 + functions, but names can be used in a hist trigger on any event.
6731 + For example, these two triggers when hit will update the same 'len'
6732 + field in the shared 'foo' histogram data:
6734 + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
6735 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6736 + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
6737 + /sys/kernel/debug/tracing/events/net/netif_rx/trigger
6739 + You can see that they're updating common histogram data by reading
6740 + each event's hist files at the same time:
6742 + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
6743 + cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
6747 + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
6750 + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
6751 + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
6752 + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
6753 + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
6754 + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
6755 + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
6756 + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
6757 + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
6758 + { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
6759 + { skbaddr: ffff880064505000 } hitcount: 1 len: 46
6760 + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
6761 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
6762 + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
6763 + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
6764 + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
6765 + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
6766 + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
6767 + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
6768 + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
6769 + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
6770 + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
6771 + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
6772 + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
6773 + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
6774 + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
6775 + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
6776 + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
6777 + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
6778 + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
6779 + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
6780 + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
6781 + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
6782 + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
6783 + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
6784 + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
6785 + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
6786 + { skbaddr: ffff880064504400 } hitcount: 4 len: 184
6787 + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
6788 + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
6789 + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
6790 + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
6791 + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
6799 + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
6802 + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
6803 + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
6804 + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
6805 + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
6806 + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
6807 + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
6808 + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
6809 + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
6810 + { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
6811 + { skbaddr: ffff880064505000 } hitcount: 1 len: 46
6812 + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
6813 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
6814 + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
6815 + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
6816 + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
6817 + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
6818 + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
6819 + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
6820 + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
6821 + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
6822 + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
6823 + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
6824 + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
6825 + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
6826 + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
6827 + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
6828 + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
6829 + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
6830 + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
6831 + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
6832 + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
6833 + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
6834 + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
6835 + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
6836 + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
6837 + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
6838 + { skbaddr: ffff880064504400 } hitcount: 4 len: 184
6839 + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
6840 + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
6841 + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
6842 + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
6843 + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
6850 + And here's an example that shows how to combine histogram data from
6851 + any two events even if they don't share any 'compatible' fields
6852 + other than 'hitcount' and 'stacktrace'. These commands create a
6853 + couple of triggers named 'bar' using those fields:
6855 + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
6856 + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6857 + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
6858 + /sys/kernel/debug/tracing/events/net/netif_rx/trigger
6860 + And displaying the output of either shows some interesting if
6861 + somewhat confusing output:
6863 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6864 + # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
6868 + # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
6872 + _do_fork+0x18e/0x330
6873 + kernel_thread+0x29/0x30
6874 + kthreadd+0x154/0x1b0
6875 + ret_from_fork+0x3f/0x70
6878 + netif_rx_internal+0xb2/0xd0
6879 + netif_rx_ni+0x20/0x70
6880 + dev_loopback_xmit+0xaa/0xd0
6881 + ip_mc_output+0x126/0x240
6882 + ip_local_out_sk+0x31/0x40
6883 + igmp_send_report+0x1e9/0x230
6884 + igmp_timer_expire+0xe9/0x120
6885 + call_timer_fn+0x39/0xf0
6886 + run_timer_softirq+0x1e1/0x290
6887 + __do_softirq+0xfd/0x290
6888 + irq_exit+0x98/0xb0
6889 + smp_apic_timer_interrupt+0x4a/0x60
6890 + apic_timer_interrupt+0x6d/0x80
6891 + cpuidle_enter+0x17/0x20
6892 + call_cpuidle+0x3b/0x60
6893 + cpu_startup_entry+0x22d/0x310
6896 + netif_rx_internal+0xb2/0xd0
6897 + netif_rx_ni+0x20/0x70
6898 + dev_loopback_xmit+0xaa/0xd0
6899 + ip_mc_output+0x17f/0x240
6900 + ip_local_out_sk+0x31/0x40
6901 + ip_send_skb+0x1a/0x50
6902 + udp_send_skb+0x13e/0x270
6903 + udp_sendmsg+0x2bf/0x980
6904 + inet_sendmsg+0x67/0xa0
6905 + sock_sendmsg+0x38/0x50
6906 + SYSC_sendto+0xef/0x170
6907 + SyS_sendto+0xe/0x10
6908 + entry_SYSCALL_64_fastpath+0x12/0x6a
6911 + netif_rx_internal+0xb2/0xd0
6912 + netif_rx+0x1c/0x60
6913 + loopback_xmit+0x6c/0xb0
6914 + dev_hard_start_xmit+0x219/0x3a0
6915 + __dev_queue_xmit+0x415/0x4f0
6916 + dev_queue_xmit_sk+0x13/0x20
6917 + ip_finish_output2+0x237/0x340
6918 + ip_finish_output+0x113/0x1d0
6919 + ip_output+0x66/0xc0
6920 + ip_local_out_sk+0x31/0x40
6921 + ip_send_skb+0x1a/0x50
6922 + udp_send_skb+0x16d/0x270
6923 + udp_sendmsg+0x2bf/0x980
6924 + inet_sendmsg+0x67/0xa0
6925 + sock_sendmsg+0x38/0x50
6926 + ___sys_sendmsg+0x14e/0x270
6929 + netif_rx_internal+0xb2/0xd0
6930 + netif_rx+0x1c/0x60
6931 + loopback_xmit+0x6c/0xb0
6932 + dev_hard_start_xmit+0x219/0x3a0
6933 + __dev_queue_xmit+0x415/0x4f0
6934 + dev_queue_xmit_sk+0x13/0x20
6935 + ip_finish_output2+0x237/0x340
6936 + ip_finish_output+0x113/0x1d0
6937 + ip_output+0x66/0xc0
6938 + ip_local_out_sk+0x31/0x40
6939 + ip_send_skb+0x1a/0x50
6940 + udp_send_skb+0x16d/0x270
6941 + udp_sendmsg+0x2bf/0x980
6942 + inet_sendmsg+0x67/0xa0
6943 + sock_sendmsg+0x38/0x50
6944 + ___sys_sendmsg+0x269/0x270
6947 + netif_rx_internal+0xb2/0xd0
6948 + netif_rx+0x1c/0x60
6949 + loopback_xmit+0x6c/0xb0
6950 + dev_hard_start_xmit+0x219/0x3a0
6951 + __dev_queue_xmit+0x415/0x4f0
6952 + dev_queue_xmit_sk+0x13/0x20
6953 + ip_finish_output2+0x237/0x340
6954 + ip_finish_output+0x113/0x1d0
6955 + ip_output+0x66/0xc0
6956 + ip_local_out_sk+0x31/0x40
6957 + ip_send_skb+0x1a/0x50
6958 + udp_send_skb+0x16d/0x270
6959 + udp_sendmsg+0x2bf/0x980
6960 + inet_sendmsg+0x67/0xa0
6961 + sock_sendmsg+0x38/0x50
6962 + SYSC_sendto+0xef/0x170
6965 + _do_fork+0x18e/0x330
6966 + SyS_clone+0x19/0x20
6967 + entry_SYSCALL_64_fastpath+0x12/0x6a
6976 +2.2 Inter-event hist triggers
6977 +-----------------------------
6979 +Inter-event hist triggers are hist triggers that combine values from
6980 +one or more other events and create a histogram using that data. Data
6981 +from an inter-event histogram can in turn become the source for
6982 +further combined histograms, thus providing a chain of related
6983 +histograms, which is important for some applications.
6985 +The most important example of an inter-event quantity that can be used
6986 +in this manner is latency, which is simply a difference in timestamps
6987 +between two events. Although latency is the most important
6988 +inter-event quantity, note that because the support is completely
6989 +general across the trace event subsystem, any event field can be used
6990 +in an inter-event quantity.
6992 +An example of a histogram that combines data from other histograms
6993 +into a useful chain would be a 'wakeupswitch latency' histogram that
6994 +combines a 'wakeup latency' histogram and a 'switch latency' histogram.
6997 +Normally, a hist trigger specification consists of a (possibly
6998 +compound) key along with one or more numeric values, which are
6999 +continually updated sums associated with that key. A histogram
7000 +specification in this case consists of individual key and value
7001 +specifications that refer to trace event fields associated with a single event.
7004 +The inter-event hist trigger extension allows fields from multiple
7005 +events to be referenced and combined into a multi-event histogram
7006 +specification. In support of this overall goal, a few enabling
7007 +features have been added to the hist trigger support:
7009 + - In order to compute an inter-event quantity, a value from one
7010 + event needs to be saved and then referenced from another event. This
7011 + requires the introduction of support for histogram 'variables'.
7013 + - The computation of inter-event quantities and their combination
7014 + require some minimal amount of support for applying simple
7015 + expressions to variables (+ and -).
7017 + - A histogram consisting of inter-event quantities isn't logically a
7018 + histogram on either event (so having the 'hist' file for either
7019 + event host the histogram output doesn't really make sense). To
7020 + address the idea that the histogram is associated with a
7021 + combination of events, support is added allowing the creation of
7022 + 'synthetic' events that are events derived from other events.
7023 + These synthetic events are full-fledged events just like any other
7024 + and can be used as such, as for instance to create the
7025 + 'combination' histograms mentioned previously.
7027 + - A set of 'actions' can be associated with histogram entries -
7028 + these can be used to generate the previously mentioned synthetic
7029 + events, but can also be used for other purposes, such as for
7030 + example saving context when a 'max' latency has been hit.
7032 + - Trace events don't have a 'timestamp' associated with them, but
7033 + there is an implicit timestamp saved along with an event in the
7034 + underlying ftrace ring buffer. This timestamp is now exposed as
7035 + a synthetic field named 'common_timestamp' which can be used in
7036 + histograms as if it were any other event field; it isn't an actual
7037 + field in the trace format but rather is a synthesized value that
7038 + nonetheless can be used as if it were an actual field. By default
7039 + it is in units of nanoseconds; appending '.usecs' to a
7040 + common_timestamp field changes the units to microseconds.
7042 +A note on inter-event timestamps: If common_timestamp is used in a
7043 +histogram, the trace buffer is automatically switched over to using
7044 +absolute timestamps and the "global" trace clock, in order to avoid
7045 +bogus timestamp differences with other clocks that aren't coherent
7046 +across CPUs. This can be overridden by specifying one of the other
7047 +trace clocks instead, using the "clock=XXX" hist trigger attribute,
7048 +where XXX is any of the clocks listed in the tracing/trace_clock file.
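+For example (a sketch), a hist trigger that uses common_timestamp
+could explicitly request the 'mono' clock instead of the
+automatically selected 'global' clock:
+
+  # echo 'hist:keys=pid:ts0=common_timestamp.usecs:clock=mono ...' >> event/trigger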
7051 +These features are described in more detail in the following sections.
7053 +2.2.1 Histogram Variables
7054 +-------------------------
7056 +Variables are simply named locations used for saving and retrieving
7057 +values between matching events. A 'matching' event is defined as an
7058 +event that has a matching key - if a variable is saved for a histogram
7059 +entry corresponding to that key, any subsequent event with a matching
7060 +key can access that variable.
7062 +A variable's value is normally available to any subsequent event until
7063 +it is set to something else by a subsequent event. The one exception
7064 +to that rule is that any variable used in an expression is essentially
7065 +'read-once' - once it's used by an expression in a subsequent event,
7066 +it's reset to its 'unset' state, which means it can't be used again
7067 +unless it's set again. This ensures not only that an event doesn't
7068 +use an uninitialized variable in a calculation, but that that variable
7069 +is used only once and not for any unrelated subsequent match.
7071 +The basic syntax for saving a variable is to simply prefix any event
7072 +field with a unique variable name (one not corresponding to any
7073 +keyword) and an '=' sign.
7075 +Either keys or values can be saved and retrieved in this way. This
7076 +creates a variable named 'ts0' for a histogram entry with the key 'next_pid':
7079 + # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ... >> \
7082 +The ts0 variable can be accessed by any subsequent event having the
7083 +same pid as 'next_pid'.
7085 +Variable references are formed by prepending the variable name with
7086 +the '$' sign. Thus for example, the ts0 variable above would be
7087 +referenced as '$ts0' in expressions.
7089 +Because 'vals=' is used, the common_timestamp variable value above
7090 +will also be summed as a normal histogram value would (though for a
7091 +timestamp it makes little sense).
7093 +The below shows that a key value can also be saved in the same way:
7095 + # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger
7097 +If a variable isn't a key variable or prefixed with 'vals=', the
7098 +associated event field will be saved in a variable but won't be summed as a value:
7101 + # echo 'hist:keys=next_pid:ts1=common_timestamp ... >> event/trigger
7103 +Multiple variables can be assigned at the same time. The below would
7104 +result in both ts0 and b being created as variables, with both
7105 +common_timestamp and field1 additionally being summed as values:
7107 + # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ... >> \
7110 +Note that variable assignments can appear either preceding or
7111 +following their use. The command below behaves identically to the command above:
7114 + # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ... >> \
7117 +Any number of variables not bound to a 'vals=' prefix can also be
7118 +assigned by simply separating them with colons. Below is the same
7119 +thing but without the values being summed in the histogram:
7121 + # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ... >> event/trigger
7123 +Variables set as above can be referenced and used in expressions on another event.
7126 +For example, here's how a latency can be calculated:
7128 + # echo 'hist:keys=pid,prio:ts0=common_timestamp ... >> event1/trigger
7129 + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ... >> event2/trigger
7131 +In the first line above, the event's timestamp is saved into the
7132 +variable ts0. In the next line, ts0 is subtracted from the second
7133 +event's timestamp to produce the latency, which is then assigned into
7134 +yet another variable, 'wakeup_lat'. The hist trigger below in turn
7135 +makes use of the wakeup_lat variable to compute a combined latency
7136 +using the same key and variable from yet another event:
7138 + # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ... >> event3/trigger
7140 +2.2.2 Synthetic Events
7141 +----------------------
7143 +Synthetic events are user-defined events generated from hist trigger
7144 +variables or fields associated with one or more other events. Their
7145 +purpose is to provide a mechanism for displaying data spanning
7146 +multiple events consistent with the existing and already familiar
7147 +usage for normal events.
7149 +To define a synthetic event, the user writes a simple specification
7150 +consisting of the name of the new event along with one or more
7151 +variables and their types, which can be any valid field type,
7152 +separated by semicolons, to the tracing/synthetic_events file.
7154 +For instance, the following creates a new event named 'wakeup_latency'
7155 +with 3 fields: lat, pid, and prio. Each of those fields is simply a
7156 +variable reference to a variable on another event:
7158 + # echo 'wakeup_latency \
7162 + /sys/kernel/debug/tracing/synthetic_events
7164 +Reading the tracing/synthetic_events file lists all the currently
7165 +defined synthetic events, in this case the event defined above:
7167 + # cat /sys/kernel/debug/tracing/synthetic_events
7168 + wakeup_latency u64 lat; pid_t pid; int prio
7170 +An existing synthetic event definition can be removed by prepending
7171 +the command that defined it with a '!':
7173 + # echo '!wakeup_latency u64 lat pid_t pid int prio' >> \
7174 + /sys/kernel/debug/tracing/synthetic_events
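+Reading the synthetic_events file again should then show that the
+definition is gone:
+
+ # cat /sys/kernel/debug/tracing/synthetic_events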
7176 +At this point, there isn't yet an actual 'wakeup_latency' event
7177 +instantiated in the event subsystem - for this to happen, a 'hist
7178 +trigger action' needs to be instantiated and bound to actual fields
7179 +and variables defined on other events (see Section 2.2.3 below).
7181 +Once that is done, an event instance is created, and a histogram can
7182 +be defined using it:
7184 + # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \
7185 + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
7187 +The new event is created under the tracing/events/synthetic/ directory
7188 +and looks and behaves just like any other event:
7190 + # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency
7191 + enable filter format hist id trigger
7193 +Like any other event, once a histogram is enabled for the event, the
7194 +output can be displayed by reading the event's 'hist' file.
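+For the wakeup_latency event defined above, for example:
+
+ # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist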
7196 +2.2.3 Hist trigger 'actions'
7197 +----------------------------
7199 +A hist trigger 'action' is a function that's executed whenever a
7200 +histogram entry is added or updated.
7202 +The default 'action' if no special function is explicitly specified is
7203 +as it always has been, to simply update the set of values associated
7204 +with an entry. Some applications, however, may want to perform
7205 +additional actions at that point, such as generate another event, or
7206 +compare and save a maximum.
7208 +The following additional actions are available. To specify an action
7209 +for a given event, simply specify the action between colons in the
7210 +hist trigger specification.
7212 + - onmatch(matching.event).<synthetic_event_name>(param list)
7214 + The 'onmatch(matching.event).<synthetic_event_name>(params)' hist
7215 + trigger action is invoked whenever an event matches and the
7216 + histogram entry would be added or updated. It causes the named
7217 + synthetic event to be generated with the values given in the
7218 + 'param list'. The result is the generation of a synthetic event
7219 + that consists of the values contained in those variables at the
7220 + time the invoking event was hit.
7222 + The 'param list' consists of one or more parameters which may be
7223 + either variables or fields defined on either the 'matching.event'
7224 + or the target event. The variables or fields specified in the
7225 + param list may be either fully-qualified or unqualified. If a
7226 + variable is specified as unqualified, it must be unique between
7227 + the two events. A field name used as a param can be unqualified
7228 + if it refers to the target event, but must be fully qualified if
7229 + it refers to the matching event. A fully-qualified name is of the
7230 + form 'system.event_name.$var_name' or 'system.event_name.field'.
7232 + The 'matching.event' specification is simply the fully qualified
7233 + event name of the event that matches the target event for the
7234 + onmatch() functionality, in the form 'system.event_name'.
7236 + Finally, the number and type of variables/fields in the 'param
7237 + list' must match the number and types of the fields in the
7238 + synthetic event being generated.
7240 + As an example the below defines a simple synthetic event and uses
7241 + a variable defined on the sched_wakeup_new event as a parameter
7242 + when invoking the synthetic event. Here we define the synthetic event:
7245 + # echo 'wakeup_new_test pid_t pid' >> \
7246 + /sys/kernel/debug/tracing/synthetic_events
7248 + # cat /sys/kernel/debug/tracing/synthetic_events
7249 + wakeup_new_test pid_t pid
7251 + The following hist trigger both defines the missing testpid
7252 + variable and specifies an onmatch() action that generates a
7253 + wakeup_new_test synthetic event whenever a sched_wakeup_new event
7254 + occurs, which because of the 'if comm == "cyclictest"' filter only
7255 + happens when the executable is cyclictest:
7257 + # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\
7258 + wakeup_new_test($testpid) if comm=="cyclictest"' >> \
7259 + /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger
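+ Since the $testpid variable in this example is defined on the
+ sched_wakeup_new event itself, the parameter could presumably also
+ be written in the fully-qualified 'system.event_name.$var_name'
+ form described above - a sketch:
+
+ # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\
+ wakeup_new_test(sched.sched_wakeup_new.$testpid) if comm=="cyclictest"' >> \
+ /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger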
7261 + Creating and displaying a histogram based on those events is now
7262 + just a matter of using the fields and new synthetic event in the
7263 + tracing/events/synthetic directory, as usual:
7265 + # echo 'hist:keys=pid:sort=pid' >> \
7266 + /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger
7268 + Running 'cyclictest' should cause wakeup_new events to generate
7269 + wakeup_new_test synthetic events which should result in histogram
7270 + output in the wakeup_new_test event's hist file:
7272 + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist
7274 + A more typical usage would be to use two events to calculate a
7275 + latency. The following example uses a set of hist triggers to
7276 + produce a 'wakeup_latency' histogram:
7278 + First, we define a 'wakeup_latency' synthetic event:
7280 + # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \
7281 + /sys/kernel/debug/tracing/synthetic_events
7283 + Next, we specify that whenever we see a sched_waking event for a
7284 + cyclictest thread, save the timestamp in a 'ts0' variable:
7286 + # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \
7287 + if comm=="cyclictest"' >> \
7288 + /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
7290 + Then, when the corresponding thread is actually scheduled onto the
7291 + CPU by a sched_switch event, calculate the latency and use that
7292 + along with another variable and an event field to generate a
7293 + wakeup_latency synthetic event:
7295 + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\
7296 + onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\
7297 + $saved_pid,next_prio) if next_comm=="cyclictest"' >> \
7298 + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
7300 + We also need to create a histogram on the wakeup_latency synthetic
7301 + event in order to aggregate the generated synthetic event data:
7303 + # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \
7304 + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
7306 + Finally, once we've run cyclictest to actually generate some
7307 + events, we can see the output by looking at the wakeup_latency
7308 + synthetic event's hist file:
7310 + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist
7312 + - onmax(var).save(field,...)
7314 + The 'onmax(var).save(field,...)' hist trigger action is invoked
7315 + whenever the value of 'var' associated with a histogram entry
7316 + exceeds the current maximum contained in that variable.
7318 + The end result is that the trace event fields specified as the
7319 + onmax.save() params will be saved if 'var' exceeds the current
7320 + maximum for that hist trigger entry. This allows context from the
7321 + event that exhibited the new maximum to be saved for later
7322 + reference. When the histogram is displayed, additional fields
7323 + displaying the saved values will be printed.
7325 + As an example the below defines a couple of hist triggers, one for
7326 + sched_waking and another for sched_switch, keyed on pid. Whenever
7327 + a sched_waking occurs, the timestamp is saved in the entry
7328 + corresponding to the current pid, and when the scheduler switches
7329 + back to that pid, the timestamp difference is calculated. If the
7330 + resulting latency, stored in wakeup_lat, exceeds the current
7331 + maximum latency, the values specified in the save() fields are recorded:
7334 + # echo 'hist:keys=pid:ts0=common_timestamp.usecs \
7335 + if comm=="cyclictest"' >> \
7336 + /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
7338 + # echo 'hist:keys=next_pid:\
7339 + wakeup_lat=common_timestamp.usecs-$ts0:\
7340 + onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \
7341 + if next_comm=="cyclictest"' >> \
7342 + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
7344 + When the histogram is displayed, the max value and the saved
7345 + values corresponding to the max are displayed following the rest of the fields:
7348 + # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist
7349 + { next_pid: 2255 } hitcount: 239
7350 + common_timestamp-ts0: 0
7352 + next_comm: cyclictest
7353 + prev_pid: 0 prev_prio: 120 prev_comm: swapper/1
7355 + { next_pid: 2256 } hitcount: 2355
7356 + common_timestamp-ts0: 0
7357 + max: 49 next_comm: cyclictest
7358 + prev_pid: 0 prev_prio: 120 prev_comm: swapper/0
7364 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/acglobal.h linux-4.14/drivers/acpi/acpica/acglobal.h
7365 --- linux-4.14.orig/drivers/acpi/acpica/acglobal.h 2017-11-12 19:46:13.000000000 +0100
7366 +++ linux-4.14/drivers/acpi/acpica/acglobal.h 2018-09-05 11:05:07.000000000 +0200
7370 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
7371 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7372 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7373 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
7375 /* Mutex for _OSI support */
7376 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/hwregs.c linux-4.14/drivers/acpi/acpica/hwregs.c
7377 --- linux-4.14.orig/drivers/acpi/acpica/hwregs.c 2017-11-12 19:46:13.000000000 +0100
7378 +++ linux-4.14/drivers/acpi/acpica/hwregs.c 2018-09-05 11:05:07.000000000 +0200
7379 @@ -428,14 +428,14 @@
7380 ACPI_BITMASK_ALL_FIXED_STATUS,
7381 ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
7383 - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7384 + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7386 /* Clear the fixed events in PM1 A/B */
7388 status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
7389 ACPI_BITMASK_ALL_FIXED_STATUS);
7391 - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7392 + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7394 if (ACPI_FAILURE(status)) {
7396 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/hwxface.c linux-4.14/drivers/acpi/acpica/hwxface.c
7397 --- linux-4.14.orig/drivers/acpi/acpica/hwxface.c 2017-11-12 19:46:13.000000000 +0100
7398 +++ linux-4.14/drivers/acpi/acpica/hwxface.c 2018-09-05 11:05:07.000000000 +0200
7400 return_ACPI_STATUS(AE_BAD_PARAMETER);
7403 - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7404 + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7407 * At this point, we know that the parent register is one of the
7412 - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7413 + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7414 return_ACPI_STATUS(status);
7417 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/utmutex.c linux-4.14/drivers/acpi/acpica/utmutex.c
7418 --- linux-4.14.orig/drivers/acpi/acpica/utmutex.c 2017-11-12 19:46:13.000000000 +0100
7419 +++ linux-4.14/drivers/acpi/acpica/utmutex.c 2018-09-05 11:05:07.000000000 +0200
7421 return_ACPI_STATUS (status);
7424 - status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
7425 + status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
7426 if (ACPI_FAILURE (status)) {
7427 return_ACPI_STATUS (status);
7430 /* Delete the spinlocks */
7432 acpi_os_delete_lock(acpi_gbl_gpe_lock);
7433 - acpi_os_delete_lock(acpi_gbl_hardware_lock);
7434 + acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
7435 acpi_os_delete_lock(acpi_gbl_reference_count_lock);
7437 /* Delete the reader/writer lock */
7438 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ata/libata-sff.c linux-4.14/drivers/ata/libata-sff.c
7439 --- linux-4.14.orig/drivers/ata/libata-sff.c 2017-11-12 19:46:13.000000000 +0100
7440 +++ linux-4.14/drivers/ata/libata-sff.c 2018-09-05 11:05:07.000000000 +0200
7442 unsigned long flags;
7443 unsigned int consumed;
7445 - local_irq_save(flags);
7446 + local_irq_save_nort(flags);
7447 consumed = ata_sff_data_xfer32(qc, buf, buflen, rw);
7448 - local_irq_restore(flags);
7449 + local_irq_restore_nort(flags);
7453 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/brd.c linux-4.14/drivers/block/brd.c
7454 --- linux-4.14.orig/drivers/block/brd.c 2017-11-12 19:46:13.000000000 +0100
7455 +++ linux-4.14/drivers/block/brd.c 2018-09-05 11:05:07.000000000 +0200
7458 * Look up and return a brd's page for a given sector.
7460 -static DEFINE_MUTEX(brd_mutex);
7461 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
7464 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zcomp.c linux-4.14/drivers/block/zram/zcomp.c
7465 --- linux-4.14.orig/drivers/block/zram/zcomp.c 2017-11-12 19:46:13.000000000 +0100
7466 +++ linux-4.14/drivers/block/zram/zcomp.c 2018-09-05 11:05:07.000000000 +0200
7467 @@ -116,12 +116,20 @@
7469 struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
7471 - return *get_cpu_ptr(comp->stream);
7472 + struct zcomp_strm *zstrm;
7474 + zstrm = *get_local_ptr(comp->stream);
7475 + spin_lock(&zstrm->zcomp_lock);
7479 void zcomp_stream_put(struct zcomp *comp)
7481 - put_cpu_ptr(comp->stream);
7482 + struct zcomp_strm *zstrm;
7484 + zstrm = *this_cpu_ptr(comp->stream);
7485 + spin_unlock(&zstrm->zcomp_lock);
7486 + put_local_ptr(zstrm);
7489 int zcomp_compress(struct zcomp_strm *zstrm,
7491 pr_err("Can't allocate a compression stream\n");
7494 + spin_lock_init(&zstrm->zcomp_lock);
7495 *per_cpu_ptr(comp->stream, cpu) = zstrm;
7498 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zcomp.h linux-4.14/drivers/block/zram/zcomp.h
7499 --- linux-4.14.orig/drivers/block/zram/zcomp.h 2017-11-12 19:46:13.000000000 +0100
7500 +++ linux-4.14/drivers/block/zram/zcomp.h 2018-09-05 11:05:07.000000000 +0200
7502 /* compression/decompression buffer */
7504 struct crypto_comp *tfm;
7505 + spinlock_t zcomp_lock;
7508 /* dynamic per-device compression frontend */
7509 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zram_drv.c linux-4.14/drivers/block/zram/zram_drv.c
7510 --- linux-4.14.orig/drivers/block/zram/zram_drv.c 2017-11-12 19:46:13.000000000 +0100
7511 +++ linux-4.14/drivers/block/zram/zram_drv.c 2018-09-05 11:05:07.000000000 +0200
7512 @@ -756,6 +756,30 @@
7513 static DEVICE_ATTR_RO(mm_stat);
7514 static DEVICE_ATTR_RO(debug_stat);
7516 +#ifdef CONFIG_PREEMPT_RT_BASE
7517 +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
7521 + for (index = 0; index < num_pages; index++)
7522 + spin_lock_init(&zram->table[index].lock);
7525 +static void zram_slot_lock(struct zram *zram, u32 index)
7527 + spin_lock(&zram->table[index].lock);
7528 + __set_bit(ZRAM_ACCESS, &zram->table[index].value);
7531 +static void zram_slot_unlock(struct zram *zram, u32 index)
7533 + __clear_bit(ZRAM_ACCESS, &zram->table[index].value);
7534 + spin_unlock(&zram->table[index].lock);
7538 +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
7540 static void zram_slot_lock(struct zram *zram, u32 index)
7542 bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
7545 bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
7549 static void zram_meta_free(struct zram *zram, u64 disksize)
7555 + zram_meta_init_table_locks(zram, num_pages);
7560 unsigned long handle;
7563 + struct zcomp_strm *zstrm;
7565 if (zram_wb_enabled(zram)) {
7566 zram_slot_lock(zram, index);
7569 size = zram_get_obj_size(zram, index);
7571 + zstrm = zcomp_stream_get(zram->comp);
7572 src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
7573 if (size == PAGE_SIZE) {
7574 dst = kmap_atomic(page);
7575 @@ -886,14 +914,13 @@
7579 - struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
7581 dst = kmap_atomic(page);
7582 ret = zcomp_decompress(zstrm, src, size, dst);
7584 - zcomp_stream_put(zram->comp);
7586 zs_unmap_object(zram->mem_pool, handle);
7587 + zcomp_stream_put(zram->comp);
7588 zram_slot_unlock(zram, index);
7590 /* Should NEVER happen. Return bio error if it does. */
7591 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zram_drv.h linux-4.14/drivers/block/zram/zram_drv.h
7592 --- linux-4.14.orig/drivers/block/zram/zram_drv.h 2017-11-12 19:46:13.000000000 +0100
7593 +++ linux-4.14/drivers/block/zram/zram_drv.h 2018-09-05 11:05:07.000000000 +0200
7595 unsigned long element;
7597 unsigned long value;
7598 +#ifdef CONFIG_PREEMPT_RT_BASE
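The zram_drv.h hunk is truncated in this excerpt; based on the zram_slot_lock()/zram_slot_unlock() helpers added above, struct zram_table_entry presumably ends up with a per-entry lock under PREEMPT_RT_BASE, roughly:

    struct zram_table_entry {
            union {
                    unsigned long handle;
                    unsigned long element;
            };
            unsigned long value;
    #ifdef CONFIG_PREEMPT_RT_BASE
            spinlock_t lock;        /* replaces the ZRAM_ACCESS bit spinlock on RT */
    #endif
    };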
7604 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/char/random.c linux-4.14/drivers/char/random.c
7605 --- linux-4.14.orig/drivers/char/random.c 2018-09-05 11:03:20.000000000 +0200
7606 +++ linux-4.14/drivers/char/random.c 2018-09-05 11:05:07.000000000 +0200
7608 #include <linux/syscalls.h>
7609 #include <linux/completion.h>
7610 #include <linux/uuid.h>
7611 +#include <linux/locallock.h>
7612 #include <crypto/chacha20.h>
7614 #include <asm/processor.h>
7616 invalidate_batched_entropy();
7618 wake_up_interruptible(&crng_init_wait);
7619 - pr_notice("random: fast init done\n");
7620 + /* pr_notice("random: fast init done\n"); */
7624 @@ -941,17 +942,21 @@
7626 process_random_ready_list();
7627 wake_up_interruptible(&crng_init_wait);
7628 - pr_notice("random: crng init done\n");
7629 + /* pr_notice("random: crng init done\n"); */
7630 if (unseeded_warning.missed) {
7632 pr_notice("random: %d get_random_xx warning(s) missed "
7633 "due to ratelimiting\n",
7634 unseeded_warning.missed);
7636 unseeded_warning.missed = 0;
7638 if (urandom_warning.missed) {
7640 pr_notice("random: %d urandom warning(s) missed "
7641 "due to ratelimiting\n",
7642 urandom_warning.missed);
7644 urandom_warning.missed = 0;
7647 @@ -1122,8 +1127,6 @@
7649 long delta, delta2, delta3;
7651 - preempt_disable();
7653 sample.jiffies = jiffies;
7654 sample.cycles = random_get_entropy();
7656 @@ -1164,7 +1167,6 @@
7658 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
7663 void add_input_randomness(unsigned int type, unsigned int code,
7664 @@ -1221,28 +1223,27 @@
7668 -void add_interrupt_randomness(int irq, int irq_flags)
7669 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
7671 struct entropy_store *r;
7672 struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
7673 - struct pt_regs *regs = get_irq_regs();
7674 unsigned long now = jiffies;
7675 cycles_t cycles = random_get_entropy();
7676 __u32 c_high, j_high;
7682 - cycles = get_reg(fast_pool, regs);
7683 + cycles = get_reg(fast_pool, NULL);
7684 c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
7685 j_high = (sizeof(now) > 4) ? now >> 32 : 0;
7686 fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
7687 fast_pool->pool[1] ^= now ^ c_high;
7688 - ip = regs ? instruction_pointer(regs) : _RET_IP_;
7691 fast_pool->pool[2] ^= ip;
7692 fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
7693 - get_reg(fast_pool, regs);
7694 + get_reg(fast_pool, NULL);
7696 fast_mix(fast_pool);
7697 add_interrupt_bench(cycles);
7698 @@ -2200,6 +2201,7 @@
7699 * at any point prior.
7701 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
7702 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u64_lock);
7703 u64 get_random_u64(void)
7706 @@ -2220,7 +2222,7 @@
7707 warn_unseeded_randomness(&previous);
7709 use_lock = READ_ONCE(crng_init) < 2;
7710 - batch = &get_cpu_var(batched_entropy_u64);
7711 + batch = &get_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7713 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7714 if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
7715 @@ -2230,12 +2232,13 @@
7716 ret = batch->entropy_u64[batch->position++];
7718 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7719 - put_cpu_var(batched_entropy_u64);
7720 + put_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7723 EXPORT_SYMBOL(get_random_u64);
7725 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32);
7726 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u32_lock);
7727 u32 get_random_u32(void)
7730 @@ -2250,7 +2253,7 @@
7731 warn_unseeded_randomness(&previous);
7733 use_lock = READ_ONCE(crng_init) < 2;
7734 - batch = &get_cpu_var(batched_entropy_u32);
7735 + batch = &get_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7737 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7738 if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) {
7739 @@ -2260,7 +2263,7 @@
7740 ret = batch->entropy_u32[batch->position++];
7742 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7743 - put_cpu_var(batched_entropy_u32);
7744 + put_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7747 EXPORT_SYMBOL(get_random_u32);
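get_locked_var()/put_locked_var() and DEFINE_LOCAL_IRQ_LOCK() also come from include/linux/locallock.h. On a non-RT build they collapse to the old get_cpu_var()/put_cpu_var() behaviour; on PREEMPT_RT_BASE the named lock is a per-CPU spinlock_t (a sleeping lock on RT), so fetching a batched-entropy buffer no longer disables preemption. A rough sketch of both configurations, again an approximation rather than a verbatim excerpt:

    #ifdef CONFIG_PREEMPT_RT_BASE
    /* lvar names a per-CPU spinlock_t; the section stays CPU-local but preemptible */
    # define get_locked_var(lvar, var)  (*({ local_lock(lvar); this_cpu_ptr(&var); }))
    # define put_locked_var(lvar, var)  local_unlock(lvar)
    #else
    # define DEFINE_LOCAL_IRQ_LOCK(lvar)  __typeof__(const int) lvar
    # define local_lock(lvar)             preempt_disable()
    # define local_unlock(lvar)           preempt_enable()
    # define get_locked_var(lvar, var)    get_cpu_var(var)
    # define put_locked_var(lvar, var)    put_cpu_var(var)
    #endif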
7748 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/char/tpm/tpm_tis.c linux-4.14/drivers/char/tpm/tpm_tis.c
7749 --- linux-4.14.orig/drivers/char/tpm/tpm_tis.c 2018-09-05 11:03:20.000000000 +0200
7750 +++ linux-4.14/drivers/char/tpm/tpm_tis.c 2018-09-05 11:05:07.000000000 +0200
7752 return container_of(data, struct tpm_tis_tcg_phy, priv);
7755 +#ifdef CONFIG_PREEMPT_RT_FULL
7757 + * Flushes previous write operations to the chip so that subsequent
7758 + * ioread*()s won't stall a CPU.
7760 +static inline void tpm_tis_flush(void __iomem *iobase)
7762 + ioread8(iobase + TPM_ACCESS(0));
7765 +#define tpm_tis_flush(iobase) do { } while (0)
7768 +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
7770 + iowrite8(b, iobase + addr);
7771 + tpm_tis_flush(iobase);
7774 +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
7776 + iowrite32(b, iobase + addr);
7777 + tpm_tis_flush(iobase);
7780 static bool interrupts = true;
7781 module_param(interrupts, bool, 0444);
7782 MODULE_PARM_DESC(interrupts, "Enable interrupts");
7784 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7787 - iowrite8(*value++, phy->iobase + addr);
7788 + tpm_tis_iowrite8(*value++, phy->iobase, addr);
7794 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7796 - iowrite32(value, phy->iobase + addr);
7797 + tpm_tis_iowrite32(value, phy->iobase, addr);
7801 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/clocksource/tcb_clksrc.c linux-4.14/drivers/clocksource/tcb_clksrc.c
7802 --- linux-4.14.orig/drivers/clocksource/tcb_clksrc.c 2017-11-12 19:46:13.000000000 +0100
7803 +++ linux-4.14/drivers/clocksource/tcb_clksrc.c 2018-09-05 11:05:07.000000000 +0200
7805 * this 32 bit free-running counter. the second channel is not used.
7807 * - The third channel may be used to provide a 16-bit clockevent
7808 - * source, used in either periodic or oneshot mode. This runs
7809 - * at 32 KiHZ, and can handle delays of up to two seconds.
7810 + * source, used in either periodic or oneshot mode.
7812 * A boot clocksource and clockevent source are also currently needed,
7813 * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
7815 struct tc_clkevt_device {
7816 struct clock_event_device clkevt;
7823 @@ -134,15 +135,26 @@
7824 return container_of(clkevt, struct tc_clkevt_device, clkevt);
7827 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
7828 - * because using one of the divided clocks would usually mean the
7829 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
7831 - * A divided clock could be good for high resolution timers, since
7832 - * 30.5 usec resolution can seem "low".
7834 static u32 timer_clock;
7836 +static void tc_clk_disable(struct clock_event_device *d)
7838 + struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7840 + clk_disable(tcd->clk);
7841 + tcd->clk_enabled = false;
7844 +static void tc_clk_enable(struct clock_event_device *d)
7846 + struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7848 + if (tcd->clk_enabled)
7850 + clk_enable(tcd->clk);
7851 + tcd->clk_enabled = true;
7854 static int tc_shutdown(struct clock_event_device *d)
7856 struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7857 @@ -150,8 +162,14 @@
7859 writel(0xff, regs + ATMEL_TC_REG(2, IDR));
7860 writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
7864 +static int tc_shutdown_clk_off(struct clock_event_device *d)
7867 if (!clockevent_state_detached(d))
7868 - clk_disable(tcd->clk);
7869 + tc_clk_disable(d);
7874 if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
7877 - clk_enable(tcd->clk);
7880 - /* slow clock, count up to RC, then irq and stop */
7881 + /* count up to RC, then irq and stop */
7882 writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
7883 ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
7884 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
7885 @@ -186,12 +204,12 @@
7886 /* By not making the gentime core emulate periodic mode on top
7887 * of oneshot, we get lower overhead and improved accuracy.
7889 - clk_enable(tcd->clk);
7892 - /* slow clock, count up to RC, then irq and restart */
7893 + /* count up to RC, then irq and restart */
7894 writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
7895 regs + ATMEL_TC_REG(2, CMR));
7896 - writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
7897 + writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
7899 /* Enable clock and interrupts on RC compare */
7900 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
7901 @@ -218,9 +236,13 @@
7902 .features = CLOCK_EVT_FEAT_PERIODIC |
7903 CLOCK_EVT_FEAT_ONESHOT,
7904 /* Should be lower than at91rm9200's system timer */
7905 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
7910 .set_next_event = tc_next_event,
7911 - .set_state_shutdown = tc_shutdown,
7912 + .set_state_shutdown = tc_shutdown_clk_off,
7913 .set_state_periodic = tc_set_periodic,
7914 .set_state_oneshot = tc_set_oneshot,
7920 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
7921 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
7923 + unsigned divisor = atmel_tc_divisors[divisor_idx];
7925 struct clk *t2_clk = tc->clk[2];
7926 int irq = tc->irq[2];
7927 @@ -262,7 +285,11 @@
7928 clkevt.regs = tc->regs;
7929 clkevt.clk = t2_clk;
7931 - timer_clock = clk32k_divisor_idx;
7932 + timer_clock = divisor_idx;
7934 + clkevt.freq = 32768;
7936 + clkevt.freq = clk_get_rate(t2_clk) / divisor;
7938 clkevt.clkevt.cpumask = cpumask_of(0);
7944 - clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
7945 + clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
7949 @@ -410,7 +437,11 @@
7950 goto err_disable_t1;
7952 /* channel 2: periodic and oneshot timer support */
7953 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
7954 ret = setup_clkevents(tc, clk32k_divisor_idx);
7956 + ret = setup_clkevents(tc, best_divisor_idx);
7959 goto err_unregister_clksrc;
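With ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK disabled (the PREEMPT_RT_FULL default, see the Kconfig hunk further down), the clockevent runs from the divided master clock instead of the fixed 32 KiHz input, so the periodic reload value is derived from clkevt.freq. Illustrative numbers only, not taken from this patch:

    /* e.g. a divided timer clock of ~1 MHz and HZ == 100 */
    unsigned int freq = 1000000;                /* clk_get_rate(t2_clk) / divisor */
    unsigned int rc   = (freq + HZ / 2) / HZ;   /* 10000 counts per tick, well inside the 16-bit counter */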
7961 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/clocksource/timer-atmel-pit.c linux-4.14/drivers/clocksource/timer-atmel-pit.c
7962 --- linux-4.14.orig/drivers/clocksource/timer-atmel-pit.c 2017-11-12 19:46:13.000000000 +0100
7963 +++ linux-4.14/drivers/clocksource/timer-atmel-pit.c 2018-09-05 11:05:07.000000000 +0200
7968 + bool irq_requested;
7974 /* disable irq, leaving the clocksource active */
7975 pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
7976 + if (data->irq_requested) {
7977 + free_irq(data->irq, data);
7978 + data->irq_requested = false;
7983 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
7985 * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16)
7987 static int pit_clkevt_set_periodic(struct clock_event_device *dev)
7989 struct pit_data *data = clkevt_to_pit_data(dev);
7992 + ret = request_irq(data->irq, at91sam926x_pit_interrupt,
7993 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
7994 + "at91_tick", data);
7996 + panic(pr_fmt("Unable to setup IRQ\n"));
7998 + data->irq_requested = true;
8000 /* update clocksource counter */
8001 data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8002 @@ -230,15 +245,6 @@
8006 - /* Set up irq handler */
8007 - ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8008 - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8009 - "at91_tick", data);
8011 - pr_err("Unable to setup IRQ\n");
8015 /* Set up and register clockevents */
8016 data->clkevt.name = "pit";
8017 data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
8018 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/clocksource/timer-atmel-st.c linux-4.14/drivers/clocksource/timer-atmel-st.c
8019 --- linux-4.14.orig/drivers/clocksource/timer-atmel-st.c 2017-11-12 19:46:13.000000000 +0100
8020 +++ linux-4.14/drivers/clocksource/timer-atmel-st.c 2018-09-05 11:05:07.000000000 +0200
8021 @@ -115,18 +115,29 @@
8022 last_crtr = read_CRTR();
8025 +static int atmel_st_irq;
8027 static int clkevt32k_shutdown(struct clock_event_device *evt)
8029 clkdev32k_disable_and_flush_irq();
8031 regmap_write(regmap_st, AT91_ST_IER, irqmask);
8032 + free_irq(atmel_st_irq, regmap_st);
8036 static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8040 clkdev32k_disable_and_flush_irq();
8042 + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8043 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8044 + "at91_tick", regmap_st);
8046 + panic(pr_fmt("Unable to setup IRQ\n"));
8049 * ALM for oneshot irqs, set by next_event()
8050 * before 32 seconds have passed.
8051 @@ -139,8 +150,16 @@
8053 static int clkevt32k_set_periodic(struct clock_event_device *dev)
8057 clkdev32k_disable_and_flush_irq();
8059 + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8060 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8061 + "at91_tick", regmap_st);
8063 + panic(pr_fmt("Unable to setup IRQ\n"));
8065 /* PIT for periodic irqs; fixed rate of 1/HZ */
8066 irqmask = AT91_ST_PITS;
8067 regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8071 unsigned int sclk_rate, val;
8075 regmap_st = syscon_node_to_regmap(node);
8076 if (IS_ERR(regmap_st)) {
8077 @@ -212,21 +231,12 @@
8078 regmap_read(regmap_st, AT91_ST_SR, &val);
8080 /* Get the interrupts property */
8081 - irq = irq_of_parse_and_map(node, 0);
8083 + atmel_st_irq = irq_of_parse_and_map(node, 0);
8084 + if (!atmel_st_irq) {
8085 pr_err("Unable to get IRQ from DT\n");
8089 - /* Make IRQs happen for the system timer */
8090 - ret = request_irq(irq, at91rm9200_timer_interrupt,
8091 - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8092 - "at91_tick", regmap_st);
8094 - pr_err("Unable to setup IRQ\n");
8098 sclk = of_clk_get(node, 0);
8100 pr_err("Unable to get slow clock\n");
8101 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/connector/cn_proc.c linux-4.14/drivers/connector/cn_proc.c
8102 --- linux-4.14.orig/drivers/connector/cn_proc.c 2017-11-12 19:46:13.000000000 +0100
8103 +++ linux-4.14/drivers/connector/cn_proc.c 2018-09-05 11:05:07.000000000 +0200
8105 #include <linux/pid_namespace.h>
8107 #include <linux/cn_proc.h>
8108 +#include <linux/locallock.h>
8111 * Size of a cn_msg followed by a proc_event structure. Since the
8114 /* proc_event_counts is used as the sequence number of the netlink message */
8115 static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
8116 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
8118 static inline void send_msg(struct cn_msg *msg)
8120 - preempt_disable();
8121 + local_lock(send_msg_lock);
8123 msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
8124 ((struct proc_event *)msg->data)->cpu = smp_processor_id();
8127 cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
8130 + local_unlock(send_msg_lock);
8133 void proc_fork_connector(struct task_struct *task)
8134 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/cpufreq/Kconfig.x86 linux-4.14/drivers/cpufreq/Kconfig.x86
8135 --- linux-4.14.orig/drivers/cpufreq/Kconfig.x86 2017-11-12 19:46:13.000000000 +0100
8136 +++ linux-4.14/drivers/cpufreq/Kconfig.x86 2018-09-05 11:05:07.000000000 +0200
8139 config X86_POWERNOW_K8
8140 tristate "AMD Opteron/Athlon64 PowerNow!"
8141 - depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8142 + depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8144 This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8145 Support for K10 and newer processors is now in acpi-cpufreq.
8146 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/i915/i915_gem_timeline.c linux-4.14/drivers/gpu/drm/i915/i915_gem_timeline.c
8147 --- linux-4.14.orig/drivers/gpu/drm/i915/i915_gem_timeline.c 2017-11-12 19:46:13.000000000 +0100
8148 +++ linux-4.14/drivers/gpu/drm/i915/i915_gem_timeline.c 2018-09-05 11:05:07.000000000 +0200
8151 tl->fence_context = context;
8152 tl->common = parent;
8153 -#ifdef CONFIG_DEBUG_SPINLOCK
8154 - __raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
8156 spin_lock_init(&tl->lock);
8158 + lockdep_set_class_and_name(&tl->lock, lockclass, lockname);
8159 init_request_active(&tl->last_request, NULL);
8160 INIT_LIST_HEAD(&tl->requests);
8161 i915_syncmap_init(&tl->sync);
8162 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/i915/i915_irq.c linux-4.14/drivers/gpu/drm/i915/i915_irq.c
8163 --- linux-4.14.orig/drivers/gpu/drm/i915/i915_irq.c 2018-09-05 11:03:21.000000000 +0200
8164 +++ linux-4.14/drivers/gpu/drm/i915/i915_irq.c 2018-09-05 11:05:07.000000000 +0200
8166 spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8168 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8169 + preempt_disable_rt();
8171 /* Get optional system timestamp before query. */
8174 *etime = ktime_get();
8176 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8177 + preempt_enable_rt();
8179 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8181 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/i915/intel_sprite.c linux-4.14/drivers/gpu/drm/i915/intel_sprite.c
8182 --- linux-4.14.orig/drivers/gpu/drm/i915/intel_sprite.c 2018-09-05 11:03:21.000000000 +0200
8183 +++ linux-4.14/drivers/gpu/drm/i915/intel_sprite.c 2018-09-05 11:05:07.000000000 +0200
8185 #include <drm/drm_rect.h>
8186 #include <drm/drm_atomic.h>
8187 #include <drm/drm_plane_helper.h>
8188 +#include <linux/locallock.h>
8189 #include "intel_drv.h"
8190 #include "intel_frontbuffer.h"
8191 #include <drm/i915_drm.h>
8195 #define VBLANK_EVASION_TIME_US 100
8197 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8199 * intel_pipe_update_start() - start update of a set of display registers
8200 * @crtc: the crtc of which the registers are going to be updated
8202 VBLANK_EVASION_TIME_US);
8203 max = vblank_start - 1;
8205 - local_irq_disable();
8206 + local_lock_irq(pipe_update_lock);
8208 if (min <= 0 || max <= 0)
8210 @@ -132,11 +133,11 @@
8214 - local_irq_enable();
8215 + local_unlock_irq(pipe_update_lock);
8217 timeout = schedule_timeout(timeout);
8219 - local_irq_disable();
8220 + local_lock_irq(pipe_update_lock);
8223 finish_wait(wq, &wait);
8225 crtc->base.state->event = NULL;
8228 - local_irq_enable();
8229 + local_unlock_irq(pipe_update_lock);
8231 if (intel_vgpu_active(dev_priv))
8233 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/radeon/radeon_display.c linux-4.14/drivers/gpu/drm/radeon/radeon_display.c
8234 --- linux-4.14.orig/drivers/gpu/drm/radeon/radeon_display.c 2017-11-12 19:46:13.000000000 +0100
8235 +++ linux-4.14/drivers/gpu/drm/radeon/radeon_display.c 2018-09-05 11:05:07.000000000 +0200
8236 @@ -1839,6 +1839,7 @@
8237 struct radeon_device *rdev = dev->dev_private;
8239 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8240 + preempt_disable_rt();
8242 /* Get optional system timestamp before query. */
8244 @@ -1931,6 +1932,7 @@
8245 *etime = ktime_get();
8247 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8248 + preempt_enable_rt();
8250 /* Decode into vertical and horizontal scanout position. */
8251 *vpos = position & 0x1fff;
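preempt_disable_rt()/preempt_enable_rt() are defined by the RT series in include/linux/preempt.h. The scanout-position queries in i915 and radeon sample several timestamps back to back and want that window uninterrupted; on an RT kernel the surrounding spinlock/irqsave protection no longer disables preemption, so it is disabled explicitly there, and only there. Roughly:

    #ifdef CONFIG_PREEMPT_RT_FULL
    # define preempt_disable_rt()   preempt_disable()
    # define preempt_enable_rt()    preempt_enable()
    #else
    # define preempt_disable_rt()   barrier()
    # define preempt_enable_rt()    barrier()
    #endif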
8252 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/hv/vmbus_drv.c linux-4.14/drivers/hv/vmbus_drv.c
8253 --- linux-4.14.orig/drivers/hv/vmbus_drv.c 2018-09-05 11:03:21.000000000 +0200
8254 +++ linux-4.14/drivers/hv/vmbus_drv.c 2018-09-05 11:05:37.000000000 +0200
8256 #include <asm/hyperv.h>
8257 #include <asm/hypervisor.h>
8258 #include <asm/mshyperv.h>
8259 +#include <asm/irq_regs.h>
8260 #include <linux/notifier.h>
8261 #include <linux/ptrace.h>
8262 #include <linux/screen_info.h>
8264 void *page_addr = hv_cpu->synic_event_page;
8265 struct hv_message *msg;
8266 union hv_synic_event_flags *event;
8267 + struct pt_regs *regs = get_irq_regs();
8268 + u64 ip = regs ? instruction_pointer(regs) : 0;
8269 bool handled = false;
8271 if (unlikely(page_addr == NULL))
8272 @@ -1009,7 +1012,7 @@
8273 tasklet_schedule(&hv_cpu->msg_dpc);
8276 - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8277 + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
8281 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/alim15x3.c linux-4.14/drivers/ide/alim15x3.c
8282 --- linux-4.14.orig/drivers/ide/alim15x3.c 2017-11-12 19:46:13.000000000 +0100
8283 +++ linux-4.14/drivers/ide/alim15x3.c 2018-09-05 11:05:07.000000000 +0200
8286 isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8288 - local_irq_save(flags);
8289 + local_irq_save_nort(flags);
8291 if (m5229_revision < 0xC2) {
8296 pci_dev_put(isa_dev);
8297 - local_irq_restore(flags);
8298 + local_irq_restore_nort(flags);
8302 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/hpt366.c linux-4.14/drivers/ide/hpt366.c
8303 --- linux-4.14.orig/drivers/ide/hpt366.c 2017-11-12 19:46:13.000000000 +0100
8304 +++ linux-4.14/drivers/ide/hpt366.c 2018-09-05 11:05:07.000000000 +0200
8305 @@ -1236,7 +1236,7 @@
8307 dma_old = inb(base + 2);
8309 - local_irq_save(flags);
8310 + local_irq_save_nort(flags);
8313 pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8314 @@ -1247,7 +1247,7 @@
8315 if (dma_new != dma_old)
8316 outb(dma_new, base + 2);
8318 - local_irq_restore(flags);
8319 + local_irq_restore_nort(flags);
8321 printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n",
8322 hwif->name, base, base + 7);
8323 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-io.c linux-4.14/drivers/ide/ide-io.c
8324 --- linux-4.14.orig/drivers/ide/ide-io.c 2017-11-12 19:46:13.000000000 +0100
8325 +++ linux-4.14/drivers/ide/ide-io.c 2018-09-05 11:05:07.000000000 +0200
8327 /* disable_irq_nosync ?? */
8328 disable_irq(hwif->irq);
8329 /* local CPU only, as if we were handling an interrupt */
8330 - local_irq_disable();
8331 + local_irq_disable_nort();
8332 if (hwif->polling) {
8333 startstop = handler(drive);
8334 } else if (drive_is_ready(drive)) {
8335 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-iops.c linux-4.14/drivers/ide/ide-iops.c
8336 --- linux-4.14.orig/drivers/ide/ide-iops.c 2017-11-12 19:46:13.000000000 +0100
8337 +++ linux-4.14/drivers/ide/ide-iops.c 2018-09-05 11:05:07.000000000 +0200
8338 @@ -129,12 +129,12 @@
8339 if ((stat & ATA_BUSY) == 0)
8342 - local_irq_restore(flags);
8343 + local_irq_restore_nort(flags);
8348 - local_irq_restore(flags);
8349 + local_irq_restore_nort(flags);
8352 * Allow status to settle, then read it again.
8353 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-io-std.c linux-4.14/drivers/ide/ide-io-std.c
8354 --- linux-4.14.orig/drivers/ide/ide-io-std.c 2017-11-12 19:46:13.000000000 +0100
8355 +++ linux-4.14/drivers/ide/ide-io-std.c 2018-09-05 11:05:07.000000000 +0200
8357 unsigned long uninitialized_var(flags);
8359 if ((io_32bit & 2) && !mmio) {
8360 - local_irq_save(flags);
8361 + local_irq_save_nort(flags);
8362 ata_vlb_sync(io_ports->nsect_addr);
8366 insl(data_addr, buf, words);
8368 if ((io_32bit & 2) && !mmio)
8369 - local_irq_restore(flags);
8370 + local_irq_restore_nort(flags);
8372 if (((len + 1) & 3) < 2)
8375 unsigned long uninitialized_var(flags);
8377 if ((io_32bit & 2) && !mmio) {
8378 - local_irq_save(flags);
8379 + local_irq_save_nort(flags);
8380 ata_vlb_sync(io_ports->nsect_addr);
8384 outsl(data_addr, buf, words);
8386 if ((io_32bit & 2) && !mmio)
8387 - local_irq_restore(flags);
8388 + local_irq_restore_nort(flags);
8390 if (((len + 1) & 3) < 2)
8392 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-probe.c linux-4.14/drivers/ide/ide-probe.c
8393 --- linux-4.14.orig/drivers/ide/ide-probe.c 2017-11-12 19:46:13.000000000 +0100
8394 +++ linux-4.14/drivers/ide/ide-probe.c 2018-09-05 11:05:07.000000000 +0200
8395 @@ -196,10 +196,10 @@
8398 /* local CPU only; some systems need this */
8399 - local_irq_save(flags);
8400 + local_irq_save_nort(flags);
8401 /* read 512 bytes of id info */
8402 hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
8403 - local_irq_restore(flags);
8404 + local_irq_restore_nort(flags);
8406 drive->dev_flags |= IDE_DFLAG_ID_READ;
8408 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-taskfile.c linux-4.14/drivers/ide/ide-taskfile.c
8409 --- linux-4.14.orig/drivers/ide/ide-taskfile.c 2017-11-12 19:46:13.000000000 +0100
8410 +++ linux-4.14/drivers/ide/ide-taskfile.c 2018-09-05 11:05:07.000000000 +0200
8413 page_is_high = PageHighMem(page);
8415 - local_irq_save(flags);
8416 + local_irq_save_nort(flags);
8418 buf = kmap_atomic(page) + offset;
8424 - local_irq_restore(flags);
8425 + local_irq_restore_nort(flags);
8432 if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
8433 - local_irq_disable();
8434 + local_irq_disable_nort();
8436 ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
8438 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/hw/hfi1/affinity.c linux-4.14/drivers/infiniband/hw/hfi1/affinity.c
8439 --- linux-4.14.orig/drivers/infiniband/hw/hfi1/affinity.c 2018-09-05 11:03:22.000000000 +0200
8440 +++ linux-4.14/drivers/infiniband/hw/hfi1/affinity.c 2018-09-05 11:05:07.000000000 +0200
8442 struct hfi1_affinity_node *entry;
8443 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
8444 const struct cpumask *node_mask,
8445 - *proc_mask = &current->cpus_allowed;
8446 + *proc_mask = current->cpus_ptr;
8447 struct hfi1_affinity_node_list *affinity = &node_affinity;
8448 struct cpu_mask_set *set = &affinity->proc;
8451 * check whether process/context affinity has already
8454 - if (cpumask_weight(proc_mask) == 1) {
8455 + if (current->nr_cpus_allowed == 1) {
8456 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
8457 current->pid, current->comm,
8458 cpumask_pr_args(proc_mask));
8460 cpu = cpumask_first(proc_mask);
8461 cpumask_set_cpu(cpu, &set->used);
8463 - } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
8464 + } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
8465 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
8466 current->pid, current->comm,
8467 cpumask_pr_args(proc_mask));
8468 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/hw/hfi1/sdma.c linux-4.14/drivers/infiniband/hw/hfi1/sdma.c
8469 --- linux-4.14.orig/drivers/infiniband/hw/hfi1/sdma.c 2017-11-12 19:46:13.000000000 +0100
8470 +++ linux-4.14/drivers/infiniband/hw/hfi1/sdma.c 2018-09-05 11:05:07.000000000 +0200
8471 @@ -856,14 +856,13 @@
8473 struct sdma_rht_node *rht_node;
8474 struct sdma_engine *sde = NULL;
8475 - const struct cpumask *current_mask = &current->cpus_allowed;
8476 unsigned long cpu_id;
8479 * To ensure that always the same sdma engine(s) will be
8480 * selected make sure the process is pinned to this CPU only.
8482 - if (cpumask_weight(current_mask) != 1)
8483 + if (current->nr_cpus_allowed != 1)
8486 cpu_id = smp_processor_id();
8487 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/hw/qib/qib_file_ops.c linux-4.14/drivers/infiniband/hw/qib/qib_file_ops.c
8488 --- linux-4.14.orig/drivers/infiniband/hw/qib/qib_file_ops.c 2018-09-05 11:03:22.000000000 +0200
8489 +++ linux-4.14/drivers/infiniband/hw/qib/qib_file_ops.c 2018-09-05 11:05:07.000000000 +0200
8490 @@ -1167,7 +1167,7 @@
8491 static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
8493 struct qib_filedata *fd = fp->private_data;
8494 - const unsigned int weight = cpumask_weight(&current->cpus_allowed);
8495 + const unsigned int weight = current->nr_cpus_allowed;
8496 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
8499 @@ -1648,9 +1648,8 @@
8500 ret = find_free_ctxt(i_minor - 1, fp, uinfo);
8503 - const unsigned int cpu = cpumask_first(&current->cpus_allowed);
8504 - const unsigned int weight =
8505 - cpumask_weight(&current->cpus_allowed);
8506 + const unsigned int cpu = cpumask_first(current->cpus_ptr);
8507 + const unsigned int weight = current->nr_cpus_allowed;
8509 if (weight == 1 && !test_bit(cpu, qib_cpulist))
8510 if (!find_hca(cpu, &unit) && unit >= 0)
8511 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-4.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8512 --- linux-4.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2018-09-05 11:03:22.000000000 +0200
8513 +++ linux-4.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2018-09-05 11:05:07.000000000 +0200
8516 ipoib_dbg_mcast(priv, "restarting multicast task\n");
8518 - local_irq_save(flags);
8519 + local_irq_save_nort(flags);
8520 netif_addr_lock(dev);
8521 spin_lock(&priv->lock);
8525 spin_unlock(&priv->lock);
8526 netif_addr_unlock(dev);
8527 - local_irq_restore(flags);
8528 + local_irq_restore_nort(flags);
8530 ipoib_mcast_remove_list(&remove_list);
8532 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/input/gameport/gameport.c linux-4.14/drivers/input/gameport/gameport.c
8533 --- linux-4.14.orig/drivers/input/gameport/gameport.c 2017-11-12 19:46:13.000000000 +0100
8534 +++ linux-4.14/drivers/input/gameport/gameport.c 2018-09-05 11:05:07.000000000 +0200
8538 for (i = 0; i < 50; i++) {
8539 - local_irq_save(flags);
8540 + local_irq_save_nort(flags);
8541 t1 = ktime_get_ns();
8542 for (t = 0; t < 50; t++)
8543 gameport_read(gameport);
8544 t2 = ktime_get_ns();
8545 t3 = ktime_get_ns();
8546 - local_irq_restore(flags);
8547 + local_irq_restore_nort(flags);
8549 t = (t2 - t1) - (t3 - t2);
8551 @@ -124,12 +124,12 @@
8554 for(i = 0; i < 50; i++) {
8555 - local_irq_save(flags);
8556 + local_irq_save_nort(flags);
8558 for (t = 0; t < 50; t++) gameport_read(gameport);
8561 - local_irq_restore(flags);
8562 + local_irq_restore_nort(flags);
8564 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
8566 @@ -148,11 +148,11 @@
8569 for(i = 0; i < 50; i++) {
8570 - local_irq_save(flags);
8571 + local_irq_save_nort(flags);
8573 for (t = 0; t < 50; t++) gameport_read(gameport);
8575 - local_irq_restore(flags);
8576 + local_irq_restore_nort(flags);
8578 if (t2 - t1 < tx) tx = t2 - t1;
8580 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/amd_iommu.c linux-4.14/drivers/iommu/amd_iommu.c
8581 --- linux-4.14.orig/drivers/iommu/amd_iommu.c 2018-09-05 11:03:22.000000000 +0200
8582 +++ linux-4.14/drivers/iommu/amd_iommu.c 2018-09-05 11:05:07.000000000 +0200
8585 #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
8587 -static DEFINE_RWLOCK(amd_iommu_devtable_lock);
8588 +static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
8589 +static DEFINE_SPINLOCK(pd_bitmap_lock);
8590 +static DEFINE_SPINLOCK(iommu_table_lock);
8592 /* List of all available dev_data structures */
8593 -static LIST_HEAD(dev_data_list);
8594 -static DEFINE_SPINLOCK(dev_data_list_lock);
8595 +static LLIST_HEAD(dev_data_list);
8597 LIST_HEAD(ioapic_map);
8598 LIST_HEAD(hpet_map);
8599 @@ -204,40 +205,33 @@
8600 static struct iommu_dev_data *alloc_dev_data(u16 devid)
8602 struct iommu_dev_data *dev_data;
8603 - unsigned long flags;
8605 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
8609 dev_data->devid = devid;
8611 - spin_lock_irqsave(&dev_data_list_lock, flags);
8612 - list_add_tail(&dev_data->dev_data_list, &dev_data_list);
8613 - spin_unlock_irqrestore(&dev_data_list_lock, flags);
8615 ratelimit_default_init(&dev_data->rs);
8617 + llist_add(&dev_data->dev_data_list, &dev_data_list);
8621 static struct iommu_dev_data *search_dev_data(u16 devid)
8623 struct iommu_dev_data *dev_data;
8624 - unsigned long flags;
8625 + struct llist_node *node;
8627 - spin_lock_irqsave(&dev_data_list_lock, flags);
8628 - list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
8629 + if (llist_empty(&dev_data_list))
8632 + node = dev_data_list.first;
8633 + llist_for_each_entry(dev_data, node, dev_data_list) {
8634 if (dev_data->devid == devid)
8642 - spin_unlock_irqrestore(&dev_data_list_lock, flags);
8648 static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
8649 @@ -1056,9 +1050,9 @@
8650 unsigned long flags;
8653 - spin_lock_irqsave(&iommu->lock, flags);
8654 + raw_spin_lock_irqsave(&iommu->lock, flags);
8655 ret = __iommu_queue_command_sync(iommu, cmd, sync);
8656 - spin_unlock_irqrestore(&iommu->lock, flags);
8657 + raw_spin_unlock_irqrestore(&iommu->lock, flags);
8661 @@ -1084,7 +1078,7 @@
8663 build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
8665 - spin_lock_irqsave(&iommu->lock, flags);
8666 + raw_spin_lock_irqsave(&iommu->lock, flags);
8670 @@ -1095,7 +1089,7 @@
8671 ret = wait_on_sem(&iommu->cmd_sem);
8674 - spin_unlock_irqrestore(&iommu->lock, flags);
8675 + raw_spin_unlock_irqrestore(&iommu->lock, flags);
8679 @@ -1604,29 +1598,26 @@
8681 static u16 domain_id_alloc(void)
8683 - unsigned long flags;
8686 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8687 + spin_lock(&pd_bitmap_lock);
8688 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
8690 if (id > 0 && id < MAX_DOMAIN_ID)
8691 __set_bit(id, amd_iommu_pd_alloc_bitmap);
8694 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8695 + spin_unlock(&pd_bitmap_lock);
8700 static void domain_id_free(int id)
8702 - unsigned long flags;
8704 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8705 + spin_lock(&pd_bitmap_lock);
8706 if (id > 0 && id < MAX_DOMAIN_ID)
8707 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
8708 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8709 + spin_unlock(&pd_bitmap_lock);
8712 #define DEFINE_FREE_PT_FN(LVL, FN) \
8713 @@ -1946,10 +1937,10 @@
8717 - * Must be called with IRQs disabled. Warn here to detect early
8719 + * Must be called with IRQs disabled on a non-RT kernel. Warn here to
8720 + * detect early when it's not.
8722 - WARN_ON(!irqs_disabled());
8723 + WARN_ON_NONRT(!irqs_disabled());
8726 spin_lock(&domain->lock);
8727 @@ -2095,9 +2086,9 @@
8731 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8732 + spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8733 ret = __attach_device(dev_data, domain);
8734 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8735 + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8738 * We might boot into a crash-kernel here. The crashed kernel
8739 @@ -2117,10 +2108,10 @@
8740 struct protection_domain *domain;
8743 - * Must be called with IRQs disabled. Warn here to detect early
8745 + * Must be called with IRQs disabled on a non-RT kernel. Warn here to
8746 + * detect early when it's not.
8748 - WARN_ON(!irqs_disabled());
8749 + WARN_ON_NONRT(!irqs_disabled());
8751 if (WARN_ON(!dev_data->domain))
8753 @@ -2147,9 +2138,9 @@
8754 domain = dev_data->domain;
8756 /* lock device table */
8757 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8758 + spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8759 __detach_device(dev_data);
8760 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8761 + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8763 if (!dev_is_pci(dev))
8765 @@ -2813,7 +2804,7 @@
8766 struct iommu_dev_data *entry;
8767 unsigned long flags;
8769 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8770 + spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8772 while (!list_empty(&domain->dev_list)) {
8773 entry = list_first_entry(&domain->dev_list,
8774 @@ -2821,7 +2812,7 @@
8775 __detach_device(entry);
8778 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8779 + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8782 static void protection_domain_free(struct protection_domain *domain)
8783 @@ -3588,14 +3579,62 @@
8784 amd_iommu_dev_table[devid].data[2] = dte;
8787 -static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
8788 +static struct irq_remap_table *get_irq_table(u16 devid)
8790 + struct irq_remap_table *table;
8792 + if (WARN_ONCE(!amd_iommu_rlookup_table[devid],
8793 + "%s: no iommu for devid %x\n", __func__, devid))
8796 + table = irq_lookup_table[devid];
8797 + if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid))
8803 +static struct irq_remap_table *__alloc_irq_table(void)
8805 + struct irq_remap_table *table;
8807 + table = kzalloc(sizeof(*table), GFP_KERNEL);
8811 + table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
8812 + if (!table->table) {
8816 + raw_spin_lock_init(&table->lock);
8818 + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
8819 + memset(table->table, 0,
8820 + MAX_IRQS_PER_TABLE * sizeof(u32));
8822 + memset(table->table, 0,
8823 + (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
8827 +static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
8828 + struct irq_remap_table *table)
8830 + irq_lookup_table[devid] = table;
8831 + set_dte_irq_entry(devid, table);
8832 + iommu_flush_dte(iommu, devid);
8835 +static struct irq_remap_table *alloc_irq_table(u16 devid)
8837 struct irq_remap_table *table = NULL;
8838 + struct irq_remap_table *new_table = NULL;
8839 struct amd_iommu *iommu;
8840 unsigned long flags;
8843 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8844 + spin_lock_irqsave(&iommu_table_lock, flags);
8846 iommu = amd_iommu_rlookup_table[devid];
8848 @@ -3608,60 +3647,45 @@
8849 alias = amd_iommu_alias_table[devid];
8850 table = irq_lookup_table[alias];
8852 - irq_lookup_table[devid] = table;
8853 - set_dte_irq_entry(devid, table);
8854 - iommu_flush_dte(iommu, devid);
8856 + set_remap_table_entry(iommu, devid, table);
8859 + spin_unlock_irqrestore(&iommu_table_lock, flags);
8861 /* Nothing there yet, allocate new irq remapping table */
8862 - table = kzalloc(sizeof(*table), GFP_ATOMIC);
8866 - /* Initialize table spin-lock */
8867 - spin_lock_init(&table->lock);
8868 + new_table = __alloc_irq_table();
8873 - /* Keep the first 32 indexes free for IOAPIC interrupts */
8874 - table->min_index = 32;
8875 + spin_lock_irqsave(&iommu_table_lock, flags);
8877 - table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
8878 - if (!table->table) {
8881 + table = irq_lookup_table[devid];
8886 - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
8887 - memset(table->table, 0,
8888 - MAX_IRQS_PER_TABLE * sizeof(u32));
8890 - memset(table->table, 0,
8891 - (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
8896 - for (i = 0; i < 32; ++i)
8897 - iommu->irte_ops->set_allocated(table, i);
8898 + table = irq_lookup_table[alias];
8900 + set_remap_table_entry(iommu, devid, table);
8904 - irq_lookup_table[devid] = table;
8905 - set_dte_irq_entry(devid, table);
8906 - iommu_flush_dte(iommu, devid);
8907 - if (devid != alias) {
8908 - irq_lookup_table[alias] = table;
8909 - set_dte_irq_entry(alias, table);
8910 - iommu_flush_dte(iommu, alias);
8912 + table = new_table;
8916 + set_remap_table_entry(iommu, devid, table);
8917 + if (devid != alias)
8918 + set_remap_table_entry(iommu, alias, table);
8921 iommu_completion_wait(iommu);
8924 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8925 + spin_unlock_irqrestore(&iommu_table_lock, flags);
8928 + kmem_cache_free(amd_iommu_irq_cache, new_table->table);
8934 @@ -3675,11 +3699,11 @@
8938 - table = get_irq_table(devid, false);
8939 + table = alloc_irq_table(devid);
8943 - spin_lock_irqsave(&table->lock, flags);
8944 + raw_spin_lock_irqsave(&table->lock, flags);
8946 /* Scan table for free entries */
8947 for (c = 0, index = table->min_index;
8948 @@ -3702,7 +3726,7 @@
8952 - spin_unlock_irqrestore(&table->lock, flags);
8953 + raw_spin_unlock_irqrestore(&table->lock, flags);
8957 @@ -3719,11 +3743,11 @@
8961 - table = get_irq_table(devid, false);
8962 + table = get_irq_table(devid);
8966 - spin_lock_irqsave(&table->lock, flags);
8967 + raw_spin_lock_irqsave(&table->lock, flags);
8969 entry = (struct irte_ga *)table->table;
8970 entry = &entry[index];
8971 @@ -3734,7 +3758,7 @@
8975 - spin_unlock_irqrestore(&table->lock, flags);
8976 + raw_spin_unlock_irqrestore(&table->lock, flags);
8978 iommu_flush_irt(iommu, devid);
8979 iommu_completion_wait(iommu);
8980 @@ -3752,13 +3776,13 @@
8984 - table = get_irq_table(devid, false);
8985 + table = get_irq_table(devid);
8989 - spin_lock_irqsave(&table->lock, flags);
8990 + raw_spin_lock_irqsave(&table->lock, flags);
8991 table->table[index] = irte->val;
8992 - spin_unlock_irqrestore(&table->lock, flags);
8993 + raw_spin_unlock_irqrestore(&table->lock, flags);
8995 iommu_flush_irt(iommu, devid);
8996 iommu_completion_wait(iommu);
8997 @@ -3776,13 +3800,13 @@
9001 - table = get_irq_table(devid, false);
9002 + table = get_irq_table(devid);
9006 - spin_lock_irqsave(&table->lock, flags);
9007 + raw_spin_lock_irqsave(&table->lock, flags);
9008 iommu->irte_ops->clear_allocated(table, index);
9009 - spin_unlock_irqrestore(&table->lock, flags);
9010 + raw_spin_unlock_irqrestore(&table->lock, flags);
9012 iommu_flush_irt(iommu, devid);
9013 iommu_completion_wait(iommu);
9014 @@ -3863,10 +3887,8 @@
9015 u8 vector, u32 dest_apicid)
9017 struct irte_ga *irte = (struct irte_ga *) entry;
9018 - struct iommu_dev_data *dev_data = search_dev_data(devid);
9020 - if (!dev_data || !dev_data->use_vapic ||
9021 - !irte->lo.fields_remap.guest_mode) {
9022 + if (!irte->lo.fields_remap.guest_mode) {
9023 irte->hi.fields.vector = vector;
9024 irte->lo.fields_remap.destination = dest_apicid;
9025 modify_irte_ga(devid, index, irte, NULL);
9026 @@ -4072,7 +4094,7 @@
9027 struct amd_ir_data *data = NULL;
9028 struct irq_cfg *cfg;
9035 @@ -4096,10 +4118,26 @@
9038 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
9039 - if (get_irq_table(devid, true))
9040 + struct irq_remap_table *table;
9041 + struct amd_iommu *iommu;
9043 + table = alloc_irq_table(devid);
9045 + if (!table->min_index) {
9047 + * Keep the first 32 indexes free for IOAPIC
9050 + table->min_index = 32;
9051 + iommu = amd_iommu_rlookup_table[devid];
9052 + for (i = 0; i < 32; ++i)
9053 + iommu->irte_ops->set_allocated(table, i);
9055 + WARN_ON(table->min_index != 32);
9056 index = info->ioapic_pin;
9063 index = alloc_irq_index(devid, nr_irqs);
9065 @@ -4343,7 +4381,7 @@
9067 unsigned long flags;
9068 struct amd_iommu *iommu;
9069 - struct irq_remap_table *irt;
9070 + struct irq_remap_table *table;
9071 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
9072 int devid = ir_data->irq_2_irte.devid;
9073 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
9074 @@ -4357,11 +4395,11 @@
9078 - irt = get_irq_table(devid, false);
9080 + table = get_irq_table(devid);
9084 - spin_lock_irqsave(&irt->lock, flags);
9085 + raw_spin_lock_irqsave(&table->lock, flags);
9087 if (ref->lo.fields_vapic.guest_mode) {
9089 @@ -4370,7 +4408,7 @@
9093 - spin_unlock_irqrestore(&irt->lock, flags);
9094 + raw_spin_unlock_irqrestore(&table->lock, flags);
9096 iommu_flush_irt(iommu, devid);
9097 iommu_completion_wait(iommu);
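Besides the raw-spinlock conversions, the global dev_data_list above moves from a spinlock-protected list_head to a lock-free llist: entries are only ever added, so search_dev_data() can walk the list without taking a lock with interrupts disabled. A generic sketch of that pattern; the names are illustrative, not taken from the driver:

    #include <linux/llist.h>
    #include <linux/slab.h>

    struct item {
            struct llist_node node;
            u16 id;
    };

    static LLIST_HEAD(item_list);

    static struct item *item_add(u16 id)
    {
            struct item *it = kzalloc(sizeof(*it), GFP_KERNEL);

            if (it) {
                    it->id = id;
                    llist_add(&it->node, &item_list);   /* lock-free push onto the list head */
            }
            return it;
    }

    static struct item *item_find(u16 id)
    {
            struct item *it;

            llist_for_each_entry(it, item_list.first, node)
                    if (it->id == id)
                            return it;
            return NULL;
    }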
9098 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/amd_iommu_init.c linux-4.14/drivers/iommu/amd_iommu_init.c
9099 --- linux-4.14.orig/drivers/iommu/amd_iommu_init.c 2017-11-12 19:46:13.000000000 +0100
9100 +++ linux-4.14/drivers/iommu/amd_iommu_init.c 2018-09-05 11:05:07.000000000 +0200
9101 @@ -1474,7 +1474,7 @@
9105 - spin_lock_init(&iommu->lock);
9106 + raw_spin_lock_init(&iommu->lock);
9108 /* Add IOMMU to internal data structures */
9109 list_add_tail(&iommu->list, &amd_iommu_list);
9110 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/amd_iommu_types.h linux-4.14/drivers/iommu/amd_iommu_types.h
9111 --- linux-4.14.orig/drivers/iommu/amd_iommu_types.h 2017-11-12 19:46:13.000000000 +0100
9112 +++ linux-4.14/drivers/iommu/amd_iommu_types.h 2018-09-05 11:05:07.000000000 +0200
9114 #define IRQ_TABLE_ALIGNMENT 128
9116 struct irq_remap_table {
9118 + raw_spinlock_t lock;
9125 /* locks the accesses to the hardware */
9127 + raw_spinlock_t lock;
9129 /* Pointer to PCI device of this IOMMU */
9130 struct pci_dev *dev;
9133 struct iommu_dev_data {
9134 struct list_head list; /* For domain->dev_list */
9135 - struct list_head dev_data_list; /* For global dev_data_list */
9136 + struct llist_node dev_data_list; /* For global dev_data_list */
9137 struct protection_domain *domain; /* Domain the device is bound to */
9138 u16 devid; /* PCI Device ID */
9139 u16 alias; /* Alias Device ID */
9140 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/iova.c linux-4.14/drivers/iommu/iova.c
9141 --- linux-4.14.orig/drivers/iommu/iova.c 2017-11-12 19:46:13.000000000 +0100
9142 +++ linux-4.14/drivers/iommu/iova.c 2018-09-05 11:05:07.000000000 +0200
9144 unsigned long pfn, unsigned long pages,
9147 - struct iova_fq *fq = get_cpu_ptr(iovad->fq);
9148 + struct iova_fq *fq = raw_cpu_ptr(iovad->fq);
9149 unsigned long flags;
9153 if (atomic_cmpxchg(&iovad->fq_timer_on, 0, 1) == 0)
9154 mod_timer(&iovad->fq_timer,
9155 jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
9157 - put_cpu_ptr(iovad->fq);
9159 EXPORT_SYMBOL_GPL(queue_iova);
9161 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/leds/trigger/Kconfig linux-4.14/drivers/leds/trigger/Kconfig
9162 --- linux-4.14.orig/drivers/leds/trigger/Kconfig 2017-11-12 19:46:13.000000000 +0100
9163 +++ linux-4.14/drivers/leds/trigger/Kconfig 2018-09-05 11:05:07.000000000 +0200
9166 config LEDS_TRIGGER_CPU
9167 bool "LED CPU Trigger"
9168 - depends on LEDS_TRIGGERS
9169 + depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9171 This allows LEDs to be controlled by active CPUs. This shows
9172 the active CPUs across an array of LEDs so you can see which
9173 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/bcache/Kconfig linux-4.14/drivers/md/bcache/Kconfig
9174 --- linux-4.14.orig/drivers/md/bcache/Kconfig 2017-11-12 19:46:13.000000000 +0100
9175 +++ linux-4.14/drivers/md/bcache/Kconfig 2018-09-05 11:05:07.000000000 +0200
9179 tristate "Block device as cache"
9180 + depends on !PREEMPT_RT_FULL
9182 Allows a block device to be used as cache for other devices; uses
9183 a btree for indexing and the layout is optimized for SSDs.
9184 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/dm-rq.c linux-4.14/drivers/md/dm-rq.c
9185 --- linux-4.14.orig/drivers/md/dm-rq.c 2017-11-12 19:46:13.000000000 +0100
9186 +++ linux-4.14/drivers/md/dm-rq.c 2018-09-05 11:05:07.000000000 +0200
9188 /* Establish tio->ti before queuing work (map_tio_request) */
9190 kthread_queue_work(&md->kworker, &tio->work);
9191 - BUG_ON(!irqs_disabled());
9192 + BUG_ON_NONRT(!irqs_disabled());
9196 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/raid5.c linux-4.14/drivers/md/raid5.c
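BUG_ON_NONRT() here, like the WARN_ON_NONRT() calls in the amd_iommu hunks above, is an RT-series addition to include/asm-generic/bug.h: the assertion is only meaningful when the code path really runs with interrupts disabled, which is no longer the case on an RT kernel. Approximately:

    #ifdef CONFIG_PREEMPT_RT_BASE
    # define BUG_ON_NONRT(c)            do { } while (0)
    # define WARN_ON_NONRT(condition)   do { } while (0)
    #else
    # define BUG_ON_NONRT(c)            BUG_ON(c)
    # define WARN_ON_NONRT(condition)   WARN_ON(condition)
    #endif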
9197 --- linux-4.14.orig/drivers/md/raid5.c 2018-09-05 11:03:22.000000000 +0200
9198 +++ linux-4.14/drivers/md/raid5.c 2018-09-05 11:05:07.000000000 +0200
9200 md_wakeup_thread(conf->mddev->thread);
9203 - local_irq_save(flags);
9204 + local_irq_save_nort(flags);
9205 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
9206 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
9207 INIT_LIST_HEAD(&list);
9209 spin_unlock(&conf->device_lock);
9210 release_inactive_stripe_list(conf, &list, hash);
9212 - local_irq_restore(flags);
9213 + local_irq_restore_nort(flags);
9216 static inline void remove_hash(struct stripe_head *sh)
9217 @@ -2067,8 +2067,9 @@
9218 struct raid5_percpu *percpu;
9222 + cpu = get_cpu_light();
9223 percpu = per_cpu_ptr(conf->percpu, cpu);
9224 + spin_lock(&percpu->lock);
9225 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9226 ops_run_biofill(sh);
9228 @@ -2127,7 +2128,8 @@
9229 if (test_and_clear_bit(R5_Overlap, &dev->flags))
9230 wake_up(&sh->raid_conf->wait_for_overlap);
9233 + spin_unlock(&percpu->lock);
9237 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
9238 @@ -6775,6 +6777,7 @@
9242 + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9246 @@ -6785,7 +6788,6 @@
9247 conf->percpu = alloc_percpu(struct raid5_percpu);
9251 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
9253 conf->scribble_disks = max(conf->raid_disks,
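get_cpu_light()/put_cpu_light() come from the RT series' include/linux/smp.h changes: they pin the task to the current CPU via migrate_disable() without disabling preemption, and the new percpu->lock supplies the mutual exclusion that get_cpu() used to provide implicitly. Roughly:

    #define get_cpu_light()  ({ migrate_disable(); smp_processor_id(); })
    #define put_cpu_light()  migrate_enable()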
9254 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/raid5.h linux-4.14/drivers/md/raid5.h
9255 --- linux-4.14.orig/drivers/md/raid5.h 2017-11-12 19:46:13.000000000 +0100
9256 +++ linux-4.14/drivers/md/raid5.h 2018-09-05 11:05:07.000000000 +0200
9258 int recovery_disabled;
9259 /* per cpu variables */
9260 struct raid5_percpu {
9261 + spinlock_t lock; /* Protection for -RT */
9262 struct page *spare_page; /* Used when checking P/Q in raid6 */
9263 struct flex_array *scribble; /* space for constructing buffer
9264 * lists and performing address
9265 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/mfd/atmel-smc.c linux-4.14/drivers/mfd/atmel-smc.c
9266 --- linux-4.14.orig/drivers/mfd/atmel-smc.c 2017-11-12 19:46:13.000000000 +0100
9267 +++ linux-4.14/drivers/mfd/atmel-smc.c 2018-09-05 11:05:07.000000000 +0200
9271 #include <linux/mfd/syscon/atmel-smc.h>
9272 +#include <linux/string.h>
9275 * atmel_smc_cs_conf_init - initialize a SMC CS conf
9276 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/misc/Kconfig linux-4.14/drivers/misc/Kconfig
9277 --- linux-4.14.orig/drivers/misc/Kconfig 2017-11-12 19:46:13.000000000 +0100
9278 +++ linux-4.14/drivers/misc/Kconfig 2018-09-05 11:05:07.000000000 +0200
9281 bool "Atmel AT32/AT91 Timer/Counter Library"
9282 depends on (AVR32 || ARCH_AT91)
9283 + default y if PREEMPT_RT_FULL
9285 Select this if you want a library to allocate the Timer/Counter
9286 blocks found on many Atmel processors. This facilitates using
9288 are combined to make a single 32-bit timer.
9290 When GENERIC_CLOCKEVENTS is defined, the third timer channel
9291 - may be used as a clock event device supporting oneshot mode
9292 - (delays of up to two seconds) based on the 32 KiHz clock.
9293 + may be used as a clock event device supporting oneshot mode.
9295 config ATMEL_TCB_CLKSRC_BLOCK
9298 TC can be used for other purposes, such as PWM generation and
9301 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9302 + bool "TC Block use 32 KiHz clock"
9303 + depends on ATMEL_TCB_CLKSRC
9304 + default y if !PREEMPT_RT_FULL
9306 + Select this to use 32 KiHz base clock rate as TC block clock
9307 + source for clock events.
9311 tristate "Dummy IRQ handler"
9313 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/mmc/host/mmci.c linux-4.14/drivers/mmc/host/mmci.c
9314 --- linux-4.14.orig/drivers/mmc/host/mmci.c 2017-11-12 19:46:13.000000000 +0100
9315 +++ linux-4.14/drivers/mmc/host/mmci.c 2018-09-05 11:05:07.000000000 +0200
9316 @@ -1200,15 +1200,12 @@
9317 struct sg_mapping_iter *sg_miter = &host->sg_miter;
9318 struct variant_data *variant = host->variant;
9319 void __iomem *base = host->base;
9320 - unsigned long flags;
9323 status = readl(base + MMCISTATUS);
9325 dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
9327 - local_irq_save(flags);
9330 unsigned int remain, len;
9332 @@ -1248,8 +1245,6 @@
9334 sg_miter_stop(sg_miter);
9336 - local_irq_restore(flags);
9339 * If we have less than the fifo 'half-full' threshold to transfer,
9340 * trigger a PIO interrupt as soon as any data is available.
9341 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/ethernet/3com/3c59x.c linux-4.14/drivers/net/ethernet/3com/3c59x.c
9342 --- linux-4.14.orig/drivers/net/ethernet/3com/3c59x.c 2017-11-12 19:46:13.000000000 +0100
9343 +++ linux-4.14/drivers/net/ethernet/3com/3c59x.c 2018-09-05 11:05:07.000000000 +0200
9346 struct vortex_private *vp = netdev_priv(dev);
9347 unsigned long flags;
9348 - local_irq_save(flags);
9349 + local_irq_save_nort(flags);
9350 (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
9351 - local_irq_restore(flags);
9352 + local_irq_restore_nort(flags);
9356 @@ -1908,12 +1908,12 @@
9357 * Block interrupts because vortex_interrupt does a bare spin_lock()
9359 unsigned long flags;
9360 - local_irq_save(flags);
9361 + local_irq_save_nort(flags);
9362 if (vp->full_bus_master_tx)
9363 boomerang_interrupt(dev->irq, dev);
9365 vortex_interrupt(dev->irq, dev);
9366 - local_irq_restore(flags);
9367 + local_irq_restore_nort(flags);
9371 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/ethernet/marvell/mvpp2.c linux-4.14/drivers/net/ethernet/marvell/mvpp2.c
9372 --- linux-4.14.orig/drivers/net/ethernet/marvell/mvpp2.c 2018-09-05 11:03:22.000000000 +0200
9373 +++ linux-4.14/drivers/net/ethernet/marvell/mvpp2.c 2018-09-05 11:05:07.000000000 +0200
9375 /* Per-CPU port control */
9376 struct mvpp2_port_pcpu {
9377 struct hrtimer tx_done_timer;
9378 + struct net_device *dev;
9379 bool timer_scheduled;
9380 - /* Tasklet for egress finalization */
9381 - struct tasklet_struct tx_done_tasklet;
9384 struct mvpp2_queue_vector {
9385 @@ -5954,46 +5953,34 @@
9389 -static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu)
9393 - if (!port_pcpu->timer_scheduled) {
9394 - port_pcpu->timer_scheduled = true;
9395 - interval = MVPP2_TXDONE_HRTIMER_PERIOD_NS;
9396 - hrtimer_start(&port_pcpu->tx_done_timer, interval,
9397 - HRTIMER_MODE_REL_PINNED);
9401 -static void mvpp2_tx_proc_cb(unsigned long data)
9402 +static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9404 - struct net_device *dev = (struct net_device *)data;
9405 - struct mvpp2_port *port = netdev_priv(dev);
9406 - struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9407 + struct net_device *dev;
9408 + struct mvpp2_port *port;
9409 + struct mvpp2_port_pcpu *port_pcpu;
9410 unsigned int tx_todo, cause;
9412 + port_pcpu = container_of(timer, struct mvpp2_port_pcpu, tx_done_timer);
9413 + dev = port_pcpu->dev;
9415 if (!netif_running(dev))
9417 + return HRTIMER_NORESTART;
9419 port_pcpu->timer_scheduled = false;
9420 + port = netdev_priv(dev);
9422 /* Process all the Tx queues */
9423 cause = (1 << port->ntxqs) - 1;
9424 tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
9426 /* Set the timer in case not all the packets were processed */
9428 - mvpp2_timer_set(port_pcpu);
9431 -static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9433 - struct mvpp2_port_pcpu *port_pcpu = container_of(timer,
9434 - struct mvpp2_port_pcpu,
9437 - tasklet_schedule(&port_pcpu->tx_done_tasklet);
9438 + if (tx_todo && !port_pcpu->timer_scheduled) {
9439 + port_pcpu->timer_scheduled = true;
9440 + hrtimer_forward_now(&port_pcpu->tx_done_timer,
9441 + MVPP2_TXDONE_HRTIMER_PERIOD_NS);
9443 + return HRTIMER_RESTART;
9445 return HRTIMER_NORESTART;
9448 @@ -6482,7 +6469,12 @@
9449 txq_pcpu->count > 0) {
9450 struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9452 - mvpp2_timer_set(port_pcpu);
9453 + if (!port_pcpu->timer_scheduled) {
9454 + port_pcpu->timer_scheduled = true;
9455 + hrtimer_start(&port_pcpu->tx_done_timer,
9456 + MVPP2_TXDONE_HRTIMER_PERIOD_NS,
9457 + HRTIMER_MODE_REL_PINNED_SOFT);
9461 return NETDEV_TX_OK;
9462 @@ -6871,7 +6863,6 @@
9464 hrtimer_cancel(&port_pcpu->tx_done_timer);
9465 port_pcpu->timer_scheduled = false;
9466 - tasklet_kill(&port_pcpu->tx_done_tasklet);
9469 mvpp2_cleanup_rxqs(port);
9470 @@ -7644,13 +7635,10 @@
9471 port_pcpu = per_cpu_ptr(port->pcpu, cpu);
9473 hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
9474 - HRTIMER_MODE_REL_PINNED);
9475 + HRTIMER_MODE_REL_PINNED_SOFT);
9476 port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
9477 port_pcpu->timer_scheduled = false;
9479 - tasklet_init(&port_pcpu->tx_done_tasklet,
9481 - (unsigned long)dev);
9482 + port_pcpu->dev = dev;
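[Editor's note: the mvpp2 hunks above drop the tx-done tasklet and let the hrtimer callback do the work itself, rearming with hrtimer_forward_now() while the timer is initialised in one of the _SOFT modes this series introduces, so the callback runs in schedulable softirq context. A minimal sketch of that shape, assuming this patch is applied; names such as my_pcpu, my_process_tx_done() and MY_PERIOD_NS are illustrative, not from the driver:]

#include <linux/kernel.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

#define MY_PERIOD_NS	(1 * NSEC_PER_MSEC)	/* illustrative rearm period */

struct my_pcpu {
	struct hrtimer tx_done_timer;
	bool timer_scheduled;
};

/* stand-in for the real tx-done processing; returns true if work remains */
static bool my_process_tx_done(void)
{
	return false;
}

static enum hrtimer_restart my_tx_done_cb(struct hrtimer *timer)
{
	struct my_pcpu *p = container_of(timer, struct my_pcpu, tx_done_timer);

	p->timer_scheduled = false;

	if (my_process_tx_done()) {
		/* not everything was drained: rearm instead of rescheduling a tasklet */
		p->timer_scheduled = true;
		hrtimer_forward_now(timer, ns_to_ktime(MY_PERIOD_NS));
		return HRTIMER_RESTART;
	}
	return HRTIMER_NORESTART;
}

static void my_pcpu_init(struct my_pcpu *p)
{
	hrtimer_init(&p->tx_done_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_PINNED_SOFT);
	p->tx_done_timer.function = my_tx_done_cb;
	p->timer_scheduled = false;
}

/* xmit path: arm the timer once if it is not already pending */
static void my_pcpu_kick(struct my_pcpu *p)
{
	if (!p->timer_scheduled) {
		p->timer_scheduled = true;
		hrtimer_start(&p->tx_done_timer, ns_to_ktime(MY_PERIOD_NS),
			      HRTIMER_MODE_REL_PINNED_SOFT);
	}
}

[End of editor's note.]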
9486 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c linux-4.14/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9487 --- linux-4.14.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c 2017-11-12 19:46:13.000000000 +0100
9488 +++ linux-4.14/drivers/net/wireless/intersil/orinoco/orinoco_usb.c 2018-09-05 11:05:07.000000000 +0200
9490 while (!ctx->done.done && msecs--)
9493 - wait_event_interruptible(ctx->done.wait,
9494 + swait_event_interruptible(ctx->done.wait,
9498 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/wireless/mac80211_hwsim.c linux-4.14/drivers/net/wireless/mac80211_hwsim.c
9499 --- linux-4.14.orig/drivers/net/wireless/mac80211_hwsim.c 2018-09-05 11:03:22.000000000 +0200
9500 +++ linux-4.14/drivers/net/wireless/mac80211_hwsim.c 2018-09-05 11:05:07.000000000 +0200
9502 unsigned int rx_filter;
9503 bool started, idle, scanning;
9505 - struct tasklet_hrtimer beacon_timer;
9506 + struct hrtimer beacon_timer;
9508 PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL
9510 @@ -1423,7 +1423,7 @@
9512 struct mac80211_hwsim_data *data = hw->priv;
9513 data->started = false;
9514 - tasklet_hrtimer_cancel(&data->beacon_timer);
9515 + hrtimer_cancel(&data->beacon_timer);
9516 wiphy_debug(hw->wiphy, "%s\n", __func__);
9519 @@ -1546,14 +1546,12 @@
9520 mac80211_hwsim_beacon(struct hrtimer *timer)
9522 struct mac80211_hwsim_data *data =
9523 - container_of(timer, struct mac80211_hwsim_data,
9524 - beacon_timer.timer);
9525 + container_of(timer, struct mac80211_hwsim_data, beacon_timer);
9526 struct ieee80211_hw *hw = data->hw;
9527 u64 bcn_int = data->beacon_int;
9532 + return HRTIMER_NORESTART;
9534 ieee80211_iterate_active_interfaces_atomic(
9535 hw, IEEE80211_IFACE_ITER_NORMAL,
9536 @@ -1565,11 +1563,9 @@
9537 data->bcn_delta = 0;
9540 - next_bcn = ktime_add(hrtimer_get_expires(timer),
9541 - ns_to_ktime(bcn_int * 1000));
9542 - tasklet_hrtimer_start(&data->beacon_timer, next_bcn, HRTIMER_MODE_ABS);
9544 - return HRTIMER_NORESTART;
9545 + hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer),
9546 + ns_to_ktime(bcn_int * NSEC_PER_USEC));
9547 + return HRTIMER_RESTART;
9550 static const char * const hwsim_chanwidths[] = {
9551 @@ -1643,15 +1639,15 @@
9552 mutex_unlock(&data->mutex);
9554 if (!data->started || !data->beacon_int)
9555 - tasklet_hrtimer_cancel(&data->beacon_timer);
9556 - else if (!hrtimer_is_queued(&data->beacon_timer.timer)) {
9557 + hrtimer_cancel(&data->beacon_timer);
9558 + else if (!hrtimer_is_queued(&data->beacon_timer)) {
9559 u64 tsf = mac80211_hwsim_get_tsf(hw, NULL);
9560 u32 bcn_int = data->beacon_int;
9561 u64 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9563 - tasklet_hrtimer_start(&data->beacon_timer,
9564 - ns_to_ktime(until_tbtt * 1000),
9565 - HRTIMER_MODE_REL);
9566 + hrtimer_start(&data->beacon_timer,
9567 + ns_to_ktime(until_tbtt * 1000),
9568 + HRTIMER_MODE_REL_SOFT);
9572 @@ -1714,7 +1710,7 @@
9573 info->enable_beacon, info->beacon_int);
9574 vp->bcn_en = info->enable_beacon;
9575 if (data->started &&
9576 - !hrtimer_is_queued(&data->beacon_timer.timer) &&
9577 + !hrtimer_is_queued(&data->beacon_timer) &&
9578 info->enable_beacon) {
9579 u64 tsf, until_tbtt;
9581 @@ -1722,9 +1718,9 @@
9582 tsf = mac80211_hwsim_get_tsf(hw, vif);
9583 bcn_int = data->beacon_int;
9584 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9585 - tasklet_hrtimer_start(&data->beacon_timer,
9586 - ns_to_ktime(until_tbtt * 1000),
9587 - HRTIMER_MODE_REL);
9588 + hrtimer_start(&data->beacon_timer,
9589 + ns_to_ktime(until_tbtt * 1000),
9590 + HRTIMER_MODE_REL_SOFT);
9591 } else if (!info->enable_beacon) {
9592 unsigned int count = 0;
9593 ieee80211_iterate_active_interfaces_atomic(
9594 @@ -1733,7 +1729,7 @@
9595 wiphy_debug(hw->wiphy, " beaconing vifs remaining: %u",
9598 - tasklet_hrtimer_cancel(&data->beacon_timer);
9599 + hrtimer_cancel(&data->beacon_timer);
9600 data->beacon_int = 0;
9603 @@ -2725,9 +2721,9 @@
9605 data, &hwsim_simulate_radar);
9607 - tasklet_hrtimer_init(&data->beacon_timer,
9608 - mac80211_hwsim_beacon,
9609 - CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
9610 + hrtimer_init(&data->beacon_timer, CLOCK_MONOTONIC,
9611 + HRTIMER_MODE_ABS_SOFT);
9612 + data->beacon_timer.function = mac80211_hwsim_beacon;
9614 spin_lock_bh(&hwsim_radio_lock);
9615 list_add_tail(&data->list, &hwsim_radios);
9616 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/pci/switch/switchtec.c linux-4.14/drivers/pci/switch/switchtec.c
9617 --- linux-4.14.orig/drivers/pci/switch/switchtec.c 2017-11-12 19:46:13.000000000 +0100
9618 +++ linux-4.14/drivers/pci/switch/switchtec.c 2018-09-05 11:05:07.000000000 +0200
9619 @@ -306,10 +306,11 @@
9621 enum mrpc_state state;
9623 - struct completion comp;
9624 + wait_queue_head_t cmd_comp;
9626 struct list_head list;
9633 stuser->stdev = stdev;
9634 kref_init(&stuser->kref);
9635 INIT_LIST_HEAD(&stuser->list);
9636 - init_completion(&stuser->comp);
9637 + init_waitqueue_head(&stuser->cmd_comp);
9638 stuser->event_cnt = atomic_read(&stdev->event_cnt);
9640 dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
9642 kref_get(&stuser->kref);
9643 stuser->read_len = sizeof(stuser->data);
9644 stuser_set_state(stuser, MRPC_QUEUED);
9645 - init_completion(&stuser->comp);
9646 + stuser->cmd_done = false;
9647 list_add_tail(&stuser->list, &stdev->mrpc_queue);
9649 mrpc_cmd_submit(stdev);
9654 - complete_all(&stuser->comp);
9655 + stuser->cmd_done = true;
9656 + wake_up_interruptible(&stuser->cmd_comp);
9657 list_del_init(&stuser->list);
9659 stdev->mrpc_busy = 0;
9660 @@ -721,10 +723,11 @@
9661 mutex_unlock(&stdev->mrpc_mutex);
9663 if (filp->f_flags & O_NONBLOCK) {
9664 - if (!try_wait_for_completion(&stuser->comp))
9665 + if (!READ_ONCE(stuser->cmd_done))
9668 - rc = wait_for_completion_interruptible(&stuser->comp);
9669 + rc = wait_event_interruptible(stuser->cmd_comp,
9670 + stuser->cmd_done);
9675 struct switchtec_dev *stdev = stuser->stdev;
9678 - poll_wait(filp, &stuser->comp.wait, wait);
9679 + poll_wait(filp, &stuser->cmd_comp, wait);
9680 poll_wait(filp, &stdev->event_wq, wait);
9682 if (lock_mutex_and_test_alive(stdev))
9685 mutex_unlock(&stdev->mrpc_mutex);
9687 - if (try_wait_for_completion(&stuser->comp))
9688 + if (READ_ONCE(stuser->cmd_done))
9689 ret |= POLLIN | POLLRDNORM;
9691 if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
9692 @@ -1255,7 +1258,8 @@
9694 /* Wake up and kill any users waiting on an MRPC request */
9695 list_for_each_entry_safe(stuser, tmpuser, &stdev->mrpc_queue, list) {
9696 - complete_all(&stuser->comp);
9697 + stuser->cmd_done = true;
9698 + wake_up_interruptible(&stuser->cmd_comp);
9699 list_del_init(&stuser->list);
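[Editor's note: the switchtec hunks above replace a struct completion with an open-coded wait_queue_head_t plus a bool flag, so the file descriptor can still be polled via poll_wait() and the non-blocking path becomes a plain READ_ONCE() check. A small sketch of that idiom; struct my_cmd and its helpers are hypothetical:]

#include <linux/wait.h>
#include <linux/compiler.h>
#include <linux/errno.h>

struct my_cmd {
	wait_queue_head_t cmd_comp;
	bool cmd_done;
};

static void my_cmd_init(struct my_cmd *c)
{
	init_waitqueue_head(&c->cmd_comp);
	c->cmd_done = false;
}

/* completion side: set the flag, then wake any sleepers */
static void my_cmd_complete(struct my_cmd *c)
{
	c->cmd_done = true;
	wake_up_interruptible(&c->cmd_comp);
}

/* consumer side: non-blocking check or interruptible sleep */
static int my_cmd_wait(struct my_cmd *c, bool nonblock)
{
	if (nonblock)
		return READ_ONCE(c->cmd_done) ? 0 : -EAGAIN;

	return wait_event_interruptible(c->cmd_comp, c->cmd_done);
}

[A poll handler would then call poll_wait(filp, &c->cmd_comp, wait) and report POLLIN when READ_ONCE(c->cmd_done) is set, mirroring the hunk above. End of editor's note.]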
9702 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/fcoe/fcoe.c linux-4.14/drivers/scsi/fcoe/fcoe.c
9703 --- linux-4.14.orig/drivers/scsi/fcoe/fcoe.c 2017-11-12 19:46:13.000000000 +0100
9704 +++ linux-4.14/drivers/scsi/fcoe/fcoe.c 2018-09-05 11:05:07.000000000 +0200
9705 @@ -1464,11 +1464,11 @@
9706 static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
9708 struct fcoe_percpu_s *fps;
9710 + int rc, cpu = get_cpu_light();
9712 - fps = &get_cpu_var(fcoe_percpu);
9713 + fps = &per_cpu(fcoe_percpu, cpu);
9714 rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
9715 - put_cpu_var(fcoe_percpu);
9720 @@ -1655,11 +1655,11 @@
9724 - stats = per_cpu_ptr(lport->stats, get_cpu());
9725 + stats = per_cpu_ptr(lport->stats, get_cpu_light());
9726 stats->InvalidCRCCount++;
9727 if (stats->InvalidCRCCount < 5)
9728 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
9734 @@ -1702,7 +1702,7 @@
9736 hp = (struct fcoe_hdr *) skb_network_header(skb);
9738 - stats = per_cpu_ptr(lport->stats, get_cpu());
9739 + stats = per_cpu_ptr(lport->stats, get_cpu_light());
9740 if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
9741 if (stats->ErrorFrames < 5)
9742 printk(KERN_WARNING "fcoe: FCoE version "
9743 @@ -1734,13 +1734,13 @@
9746 if (!fcoe_filter_frames(lport, fp)) {
9749 fc_exch_recv(lport, fp);
9753 stats->ErrorFrames++;
9759 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/fcoe/fcoe_ctlr.c linux-4.14/drivers/scsi/fcoe/fcoe_ctlr.c
9760 --- linux-4.14.orig/drivers/scsi/fcoe/fcoe_ctlr.c 2017-11-12 19:46:13.000000000 +0100
9761 +++ linux-4.14/drivers/scsi/fcoe/fcoe_ctlr.c 2018-09-05 11:05:07.000000000 +0200
9764 INIT_LIST_HEAD(&del_list);
9766 - stats = per_cpu_ptr(fip->lp->stats, get_cpu());
9767 + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
9769 list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
9770 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
9772 sel_time = fcf->time;
9778 list_for_each_entry_safe(fcf, next, &del_list, list) {
9779 /* Removes fcf from current list */
9780 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/libfc/fc_exch.c linux-4.14/drivers/scsi/libfc/fc_exch.c
9781 --- linux-4.14.orig/drivers/scsi/libfc/fc_exch.c 2017-11-12 19:46:13.000000000 +0100
9782 +++ linux-4.14/drivers/scsi/libfc/fc_exch.c 2018-09-05 11:05:07.000000000 +0200
9783 @@ -833,10 +833,10 @@
9785 memset(ep, 0, sizeof(*ep));
9788 + cpu = get_cpu_light();
9789 pool = per_cpu_ptr(mp->pool, cpu);
9790 spin_lock_bh(&pool->lock);
9794 /* peek cache of free slot */
9795 if (pool->left != FC_XID_UNKNOWN) {
9796 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/libsas/sas_ata.c linux-4.14/drivers/scsi/libsas/sas_ata.c
9797 --- linux-4.14.orig/drivers/scsi/libsas/sas_ata.c 2017-11-12 19:46:13.000000000 +0100
9798 +++ linux-4.14/drivers/scsi/libsas/sas_ata.c 2018-09-05 11:05:07.000000000 +0200
9800 /* TODO: audit callers to ensure they are ready for qc_issue to
9801 * unconditionally re-enable interrupts
9803 - local_irq_save(flags);
9804 + local_irq_save_nort(flags);
9805 spin_unlock(ap->lock);
9807 /* If the device fell off, no sense in issuing commands */
9811 spin_lock(ap->lock);
9812 - local_irq_restore(flags);
9813 + local_irq_restore_nort(flags);
9817 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/qla2xxx/qla_inline.h linux-4.14/drivers/scsi/qla2xxx/qla_inline.h
9818 --- linux-4.14.orig/drivers/scsi/qla2xxx/qla_inline.h 2018-09-05 11:03:22.000000000 +0200
9819 +++ linux-4.14/drivers/scsi/qla2xxx/qla_inline.h 2018-09-05 11:05:07.000000000 +0200
9822 unsigned long flags;
9823 struct qla_hw_data *ha = rsp->hw;
9824 - local_irq_save(flags);
9825 + local_irq_save_nort(flags);
9826 if (IS_P3P_TYPE(ha))
9827 qla82xx_poll(0, rsp);
9829 ha->isp_ops->intr_handler(0, rsp);
9830 - local_irq_restore(flags);
9831 + local_irq_restore_nort(flags);
9834 static inline uint8_t *
9835 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/staging/greybus/audio_manager.c linux-4.14/drivers/staging/greybus/audio_manager.c
9836 --- linux-4.14.orig/drivers/staging/greybus/audio_manager.c 2017-11-12 19:46:13.000000000 +0100
9837 +++ linux-4.14/drivers/staging/greybus/audio_manager.c 2018-09-05 11:05:07.000000000 +0200
9839 #include <linux/sysfs.h>
9840 #include <linux/module.h>
9841 #include <linux/init.h>
9842 -#include <linux/rwlock.h>
9843 +#include <linux/spinlock.h>
9844 #include <linux/idr.h>
9846 #include "audio_manager.h"
9847 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/target/target_core_tmr.c linux-4.14/drivers/target/target_core_tmr.c
9848 --- linux-4.14.orig/drivers/target/target_core_tmr.c 2018-09-05 11:03:22.000000000 +0200
9849 +++ linux-4.14/drivers/target/target_core_tmr.c 2018-09-05 11:05:07.000000000 +0200
9852 struct se_session *sess = se_cmd->se_sess;
9854 - assert_spin_locked(&sess->sess_cmd_lock);
9855 - WARN_ON_ONCE(!irqs_disabled());
9857 * If command already reached CMD_T_COMPLETE state within
9858 * target_complete_cmd() or CMD_T_FABRIC_STOP due to shutdown,
9859 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/target/target_core_transport.c linux-4.14/drivers/target/target_core_transport.c
9860 --- linux-4.14.orig/drivers/target/target_core_transport.c 2018-09-05 11:03:22.000000000 +0200
9861 +++ linux-4.14/drivers/target/target_core_transport.c 2018-09-05 11:05:07.000000000 +0200
9862 @@ -2966,9 +2966,6 @@
9863 __acquires(&cmd->t_state_lock)
9866 - assert_spin_locked(&cmd->t_state_lock);
9867 - WARN_ON_ONCE(!irqs_disabled());
9870 cmd->transport_state |= CMD_T_FABRIC_STOP;
9872 @@ -3238,9 +3235,6 @@
9876 - assert_spin_locked(&cmd->t_state_lock);
9877 - WARN_ON_ONCE(!irqs_disabled());
9879 if (!(cmd->transport_state & CMD_T_ABORTED))
9882 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/thermal/x86_pkg_temp_thermal.c linux-4.14/drivers/thermal/x86_pkg_temp_thermal.c
9883 --- linux-4.14.orig/drivers/thermal/x86_pkg_temp_thermal.c 2017-11-12 19:46:13.000000000 +0100
9884 +++ linux-4.14/drivers/thermal/x86_pkg_temp_thermal.c 2018-09-05 11:05:07.000000000 +0200
9886 #include <linux/pm.h>
9887 #include <linux/thermal.h>
9888 #include <linux/debugfs.h>
9889 +#include <linux/swork.h>
9890 #include <asm/cpu_device_id.h>
9891 #include <asm/mce.h>
9894 schedule_delayed_work_on(cpu, work, ms);
9897 -static int pkg_thermal_notify(u64 msr_val)
9898 +static void pkg_thermal_notify_work(struct swork_event *event)
9900 int cpu = smp_processor_id();
9901 struct pkg_device *pkgdev;
9902 @@ -348,8 +349,46 @@
9905 spin_unlock_irqrestore(&pkg_temp_lock, flags);
9908 +#ifdef CONFIG_PREEMPT_RT_FULL
9909 +static struct swork_event notify_work;
9911 +static int pkg_thermal_notify_work_init(void)
9915 + err = swork_get();
 9919 +	INIT_SWORK(&notify_work, pkg_thermal_notify_work);

9923 +static void pkg_thermal_notify_work_cleanup(void)
9928 +static int pkg_thermal_notify(u64 msr_val)
 9930 +	swork_queue(&notify_work);
9934 +#else /* !CONFIG_PREEMPT_RT_FULL */
9936 +static int pkg_thermal_notify_work_init(void) { return 0; }
9938 +static void pkg_thermal_notify_work_cleanup(void) { }
9940 +static int pkg_thermal_notify(u64 msr_val)
9942 + pkg_thermal_notify_work(NULL);
9945 +#endif /* CONFIG_PREEMPT_RT_FULL */
9947 static int pkg_temp_thermal_device_add(unsigned int cpu)
9949 @@ -515,10 +554,15 @@
9950 if (!x86_match_cpu(pkg_temp_thermal_ids))
9953 + if (!pkg_thermal_notify_work_init())
9956 max_packages = topology_max_packages();
9957 packages = kzalloc(max_packages * sizeof(struct pkg_device *), GFP_KERNEL);
9965 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
9966 pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
9971 + pkg_thermal_notify_work_cleanup();
9976 cpuhp_remove_state(pkg_thermal_hp_state);
9977 debugfs_remove_recursive(debugfs);
9979 + pkg_thermal_notify_work_cleanup();
9981 module_exit(pkg_temp_thermal_exit)
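[Editor's note: the x86_pkg_temp_thermal hunks above defer the MSR threshold notification to the "swork" (simple work) infrastructure added elsewhere in this patch series (<linux/swork.h>, swork_get()/INIT_SWORK()/swork_queue()), so the sleeping parts run in a kernel thread rather than in hard interrupt context; without PREEMPT_RT_FULL the handler is called directly. A hypothetical sketch of that split; names prefixed my_ are illustrative, and swork_put() is assumed to be the swork_get() counterpart provided by the same series:]

#include <linux/swork.h>
#include <linux/types.h>

/* the real handler: may sleep, so on RT it must not run in hard IRQ context */
static void my_notify_work_fn(struct swork_event *event)
{
	/* read the sensor, update the thermal zone, ... (stub) */
}

#ifdef CONFIG_PREEMPT_RT_FULL
static struct swork_event my_notify_work;

static int my_notify_init(void)
{
	int err = swork_get();		/* bring up the swork worker thread */

	if (err)
		return err;
	INIT_SWORK(&my_notify_work, my_notify_work_fn);
	return 0;
}

static void my_notify_cleanup(void)
{
	swork_put();			/* assumed counterpart of swork_get() */
}

/* hard interrupt context: only queue the event */
static int my_notify(u64 msr_val)
{
	swork_queue(&my_notify_work);
	return 0;
}

#else /* !CONFIG_PREEMPT_RT_FULL */

static int my_notify_init(void) { return 0; }
static void my_notify_cleanup(void) { }

static int my_notify(u64 msr_val)
{
	my_notify_work_fn(NULL);
	return 0;
}

#endif /* CONFIG_PREEMPT_RT_FULL */

[End of editor's note.]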
9983 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/8250/8250_core.c linux-4.14/drivers/tty/serial/8250/8250_core.c
9984 --- linux-4.14.orig/drivers/tty/serial/8250/8250_core.c 2017-11-12 19:46:13.000000000 +0100
9985 +++ linux-4.14/drivers/tty/serial/8250/8250_core.c 2018-09-05 11:05:07.000000000 +0200
9988 static unsigned int skip_txen_test; /* force skip of txen test at init time */
9990 -#define PASS_LIMIT 512
 9992 + * On -rt we can have more delays, and legitimately
9993 + * so - so don't drop work spuriously and spam the
9996 +#ifdef CONFIG_PREEMPT_RT_FULL
9997 +# define PASS_LIMIT 1000000
9999 +# define PASS_LIMIT 512
10002 #include <asm/serial.h>
10004 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/8250/8250_port.c linux-4.14/drivers/tty/serial/8250/8250_port.c
10005 --- linux-4.14.orig/drivers/tty/serial/8250/8250_port.c 2018-09-05 11:03:22.000000000 +0200
10006 +++ linux-4.14/drivers/tty/serial/8250/8250_port.c 2018-09-05 11:05:07.000000000 +0200
10008 #include <linux/nmi.h>
10009 #include <linux/mutex.h>
10010 #include <linux/slab.h>
10011 +#include <linux/kdb.h>
10012 #include <linux/uaccess.h>
10013 #include <linux/pm_runtime.h>
10014 #include <linux/ktime.h>
10015 @@ -3224,9 +3225,9 @@
10017 serial8250_rpm_get(up);
10020 + if (port->sysrq || oops_in_progress)
10022 - else if (oops_in_progress)
10023 + else if (in_kdb_printk())
10024 locked = spin_trylock_irqsave(&port->lock, flags);
10026 spin_lock_irqsave(&port->lock, flags);
10027 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/amba-pl011.c linux-4.14/drivers/tty/serial/amba-pl011.c
10028 --- linux-4.14.orig/drivers/tty/serial/amba-pl011.c 2018-09-05 11:03:22.000000000 +0200
10029 +++ linux-4.14/drivers/tty/serial/amba-pl011.c 2018-09-05 11:05:07.000000000 +0200
10030 @@ -2236,13 +2236,19 @@
10032 clk_enable(uap->clk);
10034 - local_irq_save(flags);
10036 + * local_irq_save(flags);
10038 + * This local_irq_save() is nonsense. If we come in via sysrq
 10039 + * handling then interrupts are already disabled. Aside from
10040 + * that the port.sysrq check is racy on SMP regardless.
10042 if (uap->port.sysrq)
10044 else if (oops_in_progress)
10045 - locked = spin_trylock(&uap->port.lock);
10046 + locked = spin_trylock_irqsave(&uap->port.lock, flags);
10048 - spin_lock(&uap->port.lock);
10049 + spin_lock_irqsave(&uap->port.lock, flags);
10052 * First save the CR then disable the interrupts
10053 @@ -2268,8 +2274,7 @@
10054 pl011_write(old_cr, uap, REG_CR);
10057 - spin_unlock(&uap->port.lock);
10058 - local_irq_restore(flags);
10059 + spin_unlock_irqrestore(&uap->port.lock, flags);
10061 clk_disable(uap->clk);
10063 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/omap-serial.c linux-4.14/drivers/tty/serial/omap-serial.c
10064 --- linux-4.14.orig/drivers/tty/serial/omap-serial.c 2018-09-05 11:03:22.000000000 +0200
10065 +++ linux-4.14/drivers/tty/serial/omap-serial.c 2018-09-05 11:05:07.000000000 +0200
10066 @@ -1311,13 +1311,10 @@
10068 pm_runtime_get_sync(up->dev);
10070 - local_irq_save(flags);
10071 - if (up->port.sysrq)
10073 - else if (oops_in_progress)
10074 - locked = spin_trylock(&up->port.lock);
10075 + if (up->port.sysrq || oops_in_progress)
10076 + locked = spin_trylock_irqsave(&up->port.lock, flags);
10078 - spin_lock(&up->port.lock);
10079 + spin_lock_irqsave(&up->port.lock, flags);
10082 * First save the IER then disable the interrupts
10083 @@ -1346,8 +1343,7 @@
10084 pm_runtime_mark_last_busy(up->dev);
10085 pm_runtime_put_autosuspend(up->dev);
10087 - spin_unlock(&up->port.lock);
10088 - local_irq_restore(flags);
10089 + spin_unlock_irqrestore(&up->port.lock, flags);
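[Editor's note: the amba-pl011 and omap-serial hunks above drop the separate local_irq_save()/spin_lock() pair in the console write path in favour of spin_lock_irqsave(), falling back to a trylock when invoked from sysrq handling or while an oops is in progress. A skeleton of the resulting locking shape, closest to the omap-serial variant; my_console_putchars() is a hypothetical output helper:]

#include <linux/serial_core.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>

static void my_console_putchars(struct uart_port *port, const char *s,
				unsigned int count)
{
	/* push the characters into the UART FIFO (stub) */
}

static void my_console_write(struct uart_port *port, const char *s,
			     unsigned int count)
{
	unsigned long flags;
	int locked = 1;

	if (port->sysrq || oops_in_progress)
		/* may already hold the lock or race with a dump: never block */
		locked = spin_trylock_irqsave(&port->lock, flags);
	else
		spin_lock_irqsave(&port->lock, flags);

	my_console_putchars(port, s, count);

	if (locked)
		spin_unlock_irqrestore(&port->lock, flags);
}

[End of editor's note.]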
10093 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/core/hcd.c linux-4.14/drivers/usb/core/hcd.c
10094 --- linux-4.14.orig/drivers/usb/core/hcd.c 2018-09-05 11:03:22.000000000 +0200
10095 +++ linux-4.14/drivers/usb/core/hcd.c 2018-09-05 11:05:07.000000000 +0200
10096 @@ -1775,9 +1775,9 @@
10097 * and no one may trigger the above deadlock situation when
10098 * running complete() in tasklet.
10100 - local_irq_save(flags);
10101 + local_irq_save_nort(flags);
10102 urb->complete(urb);
10103 - local_irq_restore(flags);
10104 + local_irq_restore_nort(flags);
10106 usb_anchor_resume_wakeups(anchor);
10107 atomic_dec(&urb->use_count);
10108 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/gadget/function/f_fs.c linux-4.14/drivers/usb/gadget/function/f_fs.c
10109 --- linux-4.14.orig/drivers/usb/gadget/function/f_fs.c 2018-09-05 11:03:22.000000000 +0200
10110 +++ linux-4.14/drivers/usb/gadget/function/f_fs.c 2018-09-05 11:05:07.000000000 +0200
10111 @@ -1623,7 +1623,7 @@
10112 pr_info("%s(): freeing\n", __func__);
10113 ffs_data_clear(ffs);
10114 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
10115 - waitqueue_active(&ffs->ep0req_completion.wait) ||
10116 + swait_active(&ffs->ep0req_completion.wait) ||
10117 waitqueue_active(&ffs->wait));
10118 destroy_workqueue(ffs->io_completion_wq);
10119 kfree(ffs->dev_name);
10120 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/gadget/function/f_ncm.c linux-4.14/drivers/usb/gadget/function/f_ncm.c
10121 --- linux-4.14.orig/drivers/usb/gadget/function/f_ncm.c 2017-11-12 19:46:13.000000000 +0100
10122 +++ linux-4.14/drivers/usb/gadget/function/f_ncm.c 2018-09-05 11:05:07.000000000 +0200
10124 struct sk_buff *skb_tx_ndp;
10125 u16 ndp_dgram_count;
10126 bool timer_force_tx;
10127 - struct tasklet_struct tx_tasklet;
10128 struct hrtimer task_timer;
10130 bool timer_stopping;
10133 @@ -1108,7 +1106,7 @@
10135 /* Delay the timer. */
10136 hrtimer_start(&ncm->task_timer, TX_TIMEOUT_NSECS,
10137 - HRTIMER_MODE_REL);
10138 + HRTIMER_MODE_REL_SOFT);
10140 /* Add the datagram position entries */
10141 ntb_ndp = skb_put_zero(ncm->skb_tx_ndp, dgram_idx_len);
10142 @@ -1152,17 +1150,15 @@
10146 - * This transmits the NTB if there are frames waiting.
10147 + * The transmit should only be run if no skb data has been sent
10148 + * for a certain duration.
10150 -static void ncm_tx_tasklet(unsigned long data)
10151 +static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10153 - struct f_ncm *ncm = (void *)data;
10155 - if (ncm->timer_stopping)
10157 + struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10159 /* Only send if data is available. */
10160 - if (ncm->skb_tx_data) {
10161 + if (!ncm->timer_stopping && ncm->skb_tx_data) {
10162 ncm->timer_force_tx = true;
10164 /* XXX This allowance of a NULL skb argument to ndo_start_xmit
10165 @@ -1175,16 +1171,6 @@
10167 ncm->timer_force_tx = false;
10172 - * The transmit should only be run if no skb data has been sent
10173 - * for a certain duration.
10175 -static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10177 - struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10178 - tasklet_schedule(&ncm->tx_tasklet);
10179 return HRTIMER_NORESTART;
10182 @@ -1517,8 +1503,7 @@
10183 ncm->port.open = ncm_open;
10184 ncm->port.close = ncm_close;
10186 - tasklet_init(&ncm->tx_tasklet, ncm_tx_tasklet, (unsigned long) ncm);
10187 - hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10188 + hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
10189 ncm->task_timer.function = ncm_tx_timeout;
10191 DBG(cdev, "CDC Network: %s speed IN/%s OUT/%s NOTIFY/%s\n",
10192 @@ -1627,7 +1612,6 @@
10193 DBG(c->cdev, "ncm unbind\n");
10195 hrtimer_cancel(&ncm->task_timer);
10196 - tasklet_kill(&ncm->tx_tasklet);
10198 ncm_string_defs[0].id = 0;
10199 usb_free_all_descriptors(f);
10200 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/gadget/legacy/inode.c linux-4.14/drivers/usb/gadget/legacy/inode.c
10201 --- linux-4.14.orig/drivers/usb/gadget/legacy/inode.c 2017-11-12 19:46:13.000000000 +0100
10202 +++ linux-4.14/drivers/usb/gadget/legacy/inode.c 2018-09-05 11:05:07.000000000 +0200
10203 @@ -347,7 +347,7 @@
10204 spin_unlock_irq (&epdata->dev->lock);
10206 if (likely (value == 0)) {
10207 - value = wait_event_interruptible (done.wait, done.done);
10208 + value = swait_event_interruptible (done.wait, done.done);
10210 spin_lock_irq (&epdata->dev->lock);
10211 if (likely (epdata->ep != NULL)) {
10212 @@ -356,7 +356,7 @@
10213 usb_ep_dequeue (epdata->ep, epdata->req);
10214 spin_unlock_irq (&epdata->dev->lock);
10216 - wait_event (done.wait, done.done);
10217 + swait_event (done.wait, done.done);
10218 if (epdata->status == -ECONNRESET)
10219 epdata->status = -EINTR;
10221 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/aio.c linux-4.14/fs/aio.c
10222 --- linux-4.14.orig/fs/aio.c 2018-09-05 11:03:22.000000000 +0200
10223 +++ linux-4.14/fs/aio.c 2018-09-05 11:05:07.000000000 +0200
10225 #include <linux/ramfs.h>
10226 #include <linux/percpu-refcount.h>
10227 #include <linux/mount.h>
10228 +#include <linux/swork.h>
10230 #include <asm/kmap_types.h>
10231 #include <linux/uaccess.h>
10232 @@ -117,6 +118,7 @@
10234 struct rcu_head free_rcu;
10235 struct work_struct free_work; /* see free_ioctx() */
10236 + struct swork_event free_swork; /* see free_ioctx() */
10239 * signals when all in-flight requests are done
10240 @@ -259,6 +261,7 @@
10241 .mount = aio_mount,
10242 .kill_sb = kill_anon_super,
10244 + BUG_ON(swork_get());
10245 aio_mnt = kern_mount(&aio_fs);
10246 if (IS_ERR(aio_mnt))
10247 panic("Failed to create aio fs mount.");
10248 @@ -633,9 +636,9 @@
10249 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
10250 * now it's safe to cancel any that need to be.
10252 -static void free_ioctx_users(struct percpu_ref *ref)
10253 +static void free_ioctx_users_work(struct swork_event *sev)
10255 - struct kioctx *ctx = container_of(ref, struct kioctx, users);
10256 + struct kioctx *ctx = container_of(sev, struct kioctx, free_swork);
10257 struct aio_kiocb *req;
10259 spin_lock_irq(&ctx->ctx_lock);
10260 @@ -653,6 +656,14 @@
10261 percpu_ref_put(&ctx->reqs);
10264 +static void free_ioctx_users(struct percpu_ref *ref)
10266 + struct kioctx *ctx = container_of(ref, struct kioctx, users);
10268 + INIT_SWORK(&ctx->free_swork, free_ioctx_users_work);
10269 + swork_queue(&ctx->free_swork);
10272 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
10274 unsigned i, new_nr;
10275 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/autofs4/autofs_i.h linux-4.14/fs/autofs4/autofs_i.h
10276 --- linux-4.14.orig/fs/autofs4/autofs_i.h 2017-11-12 19:46:13.000000000 +0100
10277 +++ linux-4.14/fs/autofs4/autofs_i.h 2018-09-05 11:05:07.000000000 +0200
10279 #include <linux/sched.h>
10280 #include <linux/mount.h>
10281 #include <linux/namei.h>
10282 +#include <linux/delay.h>
10283 #include <linux/uaccess.h>
10284 #include <linux/mutex.h>
10285 #include <linux/spinlock.h>
10286 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/autofs4/expire.c linux-4.14/fs/autofs4/expire.c
10287 --- linux-4.14.orig/fs/autofs4/expire.c 2017-11-12 19:46:13.000000000 +0100
10288 +++ linux-4.14/fs/autofs4/expire.c 2018-09-05 11:05:07.000000000 +0200
10289 @@ -148,7 +148,7 @@
10290 parent = p->d_parent;
10291 if (!spin_trylock(&parent->d_lock)) {
10292 spin_unlock(&p->d_lock);
10297 spin_unlock(&p->d_lock);
10298 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/buffer.c linux-4.14/fs/buffer.c
10299 --- linux-4.14.orig/fs/buffer.c 2018-09-05 11:03:22.000000000 +0200
10300 +++ linux-4.14/fs/buffer.c 2018-09-05 11:05:07.000000000 +0200
10301 @@ -302,8 +302,7 @@
10302 * decide that the page is now completely done.
10304 first = page_buffers(page);
10305 - local_irq_save(flags);
10306 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10307 + flags = bh_uptodate_lock_irqsave(first);
10308 clear_buffer_async_read(bh);
10311 @@ -316,8 +315,7 @@
10313 tmp = tmp->b_this_page;
10314 } while (tmp != bh);
10315 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10316 - local_irq_restore(flags);
10317 + bh_uptodate_unlock_irqrestore(first, flags);
10320 * If none of the buffers had errors and they are all
10321 @@ -329,9 +327,7 @@
10325 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10326 - local_irq_restore(flags);
10328 + bh_uptodate_unlock_irqrestore(first, flags);
10332 @@ -358,8 +354,7 @@
10335 first = page_buffers(page);
10336 - local_irq_save(flags);
10337 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10338 + flags = bh_uptodate_lock_irqsave(first);
10340 clear_buffer_async_write(bh);
10342 @@ -371,15 +366,12 @@
10344 tmp = tmp->b_this_page;
10346 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10347 - local_irq_restore(flags);
10348 + bh_uptodate_unlock_irqrestore(first, flags);
10349 end_page_writeback(page);
10353 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10354 - local_irq_restore(flags);
10356 + bh_uptodate_unlock_irqrestore(first, flags);
10358 EXPORT_SYMBOL(end_buffer_async_write);
10360 @@ -3417,6 +3409,7 @@
10361 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
10363 INIT_LIST_HEAD(&ret->b_assoc_buffers);
10364 + buffer_head_init_locks(ret);
10366 __this_cpu_inc(bh_accounting.nr);
10368 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/cifs/readdir.c linux-4.14/fs/cifs/readdir.c
10369 --- linux-4.14.orig/fs/cifs/readdir.c 2017-11-12 19:46:13.000000000 +0100
10370 +++ linux-4.14/fs/cifs/readdir.c 2018-09-05 11:05:07.000000000 +0200
10372 struct inode *inode;
10373 struct super_block *sb = parent->d_sb;
10374 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
10375 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10376 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10378 cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
10380 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/dcache.c linux-4.14/fs/dcache.c
10381 --- linux-4.14.orig/fs/dcache.c 2018-09-05 11:03:29.000000000 +0200
10382 +++ linux-4.14/fs/dcache.c 2018-09-05 11:05:07.000000000 +0200
10384 #include <linux/mm.h>
10385 #include <linux/fs.h>
10386 #include <linux/fsnotify.h>
10387 +#include <linux/delay.h>
10388 #include <linux/slab.h>
10389 #include <linux/init.h>
10390 #include <linux/hash.h>
10391 @@ -793,6 +794,8 @@
10393 void dput(struct dentry *dentry)
10395 + struct dentry *parent;
10397 if (unlikely(!dentry))
10400 @@ -829,9 +832,18 @@
10404 - dentry = dentry_kill(dentry);
10407 + parent = dentry_kill(dentry);
10411 + if (parent == dentry) {
10412 + /* the task with the highest priority won't schedule */
10413 + r = cond_resched();
10422 @@ -2394,7 +2406,7 @@
10423 if (dentry->d_lockref.count == 1) {
10424 if (!spin_trylock(&inode->i_lock)) {
10425 spin_unlock(&dentry->d_lock);
10430 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
10431 @@ -2439,9 +2451,10 @@
10432 static inline unsigned start_dir_add(struct inode *dir)
10435 + preempt_disable_rt();
10437 - unsigned n = dir->i_dir_seq;
10438 - if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
10439 + unsigned n = dir->__i_dir_seq;
10440 + if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
10444 @@ -2449,26 +2462,30 @@
10446 static inline void end_dir_add(struct inode *dir, unsigned n)
10448 - smp_store_release(&dir->i_dir_seq, n + 2);
10449 + smp_store_release(&dir->__i_dir_seq, n + 2);
10450 + preempt_enable_rt();
10453 static void d_wait_lookup(struct dentry *dentry)
10455 - if (d_in_lookup(dentry)) {
10456 - DECLARE_WAITQUEUE(wait, current);
10457 - add_wait_queue(dentry->d_wait, &wait);
10459 - set_current_state(TASK_UNINTERRUPTIBLE);
10460 - spin_unlock(&dentry->d_lock);
10462 - spin_lock(&dentry->d_lock);
10463 - } while (d_in_lookup(dentry));
10465 + struct swait_queue __wait;
10467 + if (!d_in_lookup(dentry))
10470 + INIT_LIST_HEAD(&__wait.task_list);
10472 + prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
10473 + spin_unlock(&dentry->d_lock);
10475 + spin_lock(&dentry->d_lock);
10476 + } while (d_in_lookup(dentry));
10477 + finish_swait(dentry->d_wait, &__wait);
10480 struct dentry *d_alloc_parallel(struct dentry *parent,
10481 const struct qstr *name,
10482 - wait_queue_head_t *wq)
10483 + struct swait_queue_head *wq)
10485 unsigned int hash = name->hash;
10486 struct hlist_bl_head *b = in_lookup_hash(parent, hash);
10487 @@ -2482,7 +2499,7 @@
10491 - seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
10492 + seq = smp_load_acquire(&parent->d_inode->__i_dir_seq);
10493 r_seq = read_seqbegin(&rename_lock);
10494 dentry = __d_lookup_rcu(parent, name, &d_seq);
10495 if (unlikely(dentry)) {
10496 @@ -2510,7 +2527,7 @@
10500 - if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
10501 + if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) {
10502 hlist_bl_unlock(b);
10505 @@ -2583,7 +2600,7 @@
10507 dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
10508 __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
10509 - wake_up_all(dentry->d_wait);
10510 + swake_up_all(dentry->d_wait);
10511 dentry->d_wait = NULL;
10512 hlist_bl_unlock(b);
10513 INIT_HLIST_NODE(&dentry->d_u.d_alias);
10514 @@ -3619,6 +3636,8 @@
10516 static void __init dcache_init_early(void)
10518 + unsigned int loop;
10520 /* If hashes are distributed across NUMA nodes, defer
10521 * hash allocation until vmalloc space is available.
10523 @@ -3635,10 +3654,14 @@
10528 + for (loop = 0; loop < (1U << d_hash_shift); loop++)
10529 + INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10532 static void __init dcache_init(void)
10534 + unsigned int loop;
10536 * A constructor could be added for stable state like the lists,
10537 * but it is probably not worth it because of the cache nature
10538 @@ -3661,6 +3684,10 @@
10543 + for (loop = 0; loop < (1U << d_hash_shift); loop++)
10544 + INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10548 /* SLAB cache for __getname() consumers */
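[Editor's note: the dcache.c hunks above open-code the in-lookup wait with a simple waitqueue (struct swait_queue plus prepare_to_swait()/finish_swait()) instead of a regular waitqueue, with the waker switching to swake_up_all(). A generic sketch of that sleep/wake pair, assuming this series' swait conversions; struct my_obj and its helpers are illustrative:]

#include <linux/swait.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct my_obj {
	spinlock_t lock;
	bool busy;			/* cleared by the completing side */
	struct swait_queue_head wq;
};

static void my_obj_init(struct my_obj *obj)
{
	spin_lock_init(&obj->lock);
	init_swait_queue_head(&obj->wq);
	obj->busy = true;
}

/* caller holds obj->lock; sleeps until obj->busy is cleared */
static void my_wait_until_idle(struct my_obj *obj)
{
	struct swait_queue wait;

	if (!obj->busy)
		return;

	INIT_LIST_HEAD(&wait.task_list);
	do {
		prepare_to_swait(&obj->wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&obj->lock);
		schedule();
		spin_lock(&obj->lock);
	} while (obj->busy);
	finish_swait(&obj->wq, &wait);
}

/* completing side: clear the flag, then wake every sleeper */
static void my_mark_idle(struct my_obj *obj)
{
	spin_lock(&obj->lock);
	obj->busy = false;
	spin_unlock(&obj->lock);
	swake_up_all(&obj->wq);
}

[End of editor's note.]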
10549 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/eventpoll.c linux-4.14/fs/eventpoll.c
10550 --- linux-4.14.orig/fs/eventpoll.c 2017-11-12 19:46:13.000000000 +0100
10551 +++ linux-4.14/fs/eventpoll.c 2018-09-05 11:05:07.000000000 +0200
10552 @@ -587,12 +587,12 @@
10554 static void ep_poll_safewake(wait_queue_head_t *wq)
10556 - int this_cpu = get_cpu();
10557 + int this_cpu = get_cpu_light();
10559 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
10560 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
10566 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
10567 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/exec.c linux-4.14/fs/exec.c
10568 --- linux-4.14.orig/fs/exec.c 2018-09-05 11:03:29.000000000 +0200
10569 +++ linux-4.14/fs/exec.c 2018-09-05 11:05:07.000000000 +0200
10570 @@ -1025,12 +1025,14 @@
10574 + preempt_disable_rt();
10575 active_mm = tsk->active_mm;
10577 tsk->active_mm = mm;
10578 activate_mm(active_mm, mm);
10579 tsk->mm->vmacache_seqnum = 0;
10580 vmacache_flush(tsk);
10581 + preempt_enable_rt();
10584 up_read(&old_mm->mmap_sem);
10585 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/ext4/page-io.c linux-4.14/fs/ext4/page-io.c
10586 --- linux-4.14.orig/fs/ext4/page-io.c 2017-11-12 19:46:13.000000000 +0100
10587 +++ linux-4.14/fs/ext4/page-io.c 2018-09-05 11:05:07.000000000 +0200
10589 * We check all buffers in the page under BH_Uptodate_Lock
10590 * to avoid races with other end io clearing async_write flags
10592 - local_irq_save(flags);
10593 - bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
10594 + flags = bh_uptodate_lock_irqsave(head);
10596 if (bh_offset(bh) < bio_start ||
10597 bh_offset(bh) + bh->b_size > bio_end) {
10598 @@ -108,8 +107,7 @@
10599 if (bio->bi_status)
10600 buffer_io_error(bh);
10601 } while ((bh = bh->b_this_page) != head);
10602 - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
10603 - local_irq_restore(flags);
10604 + bh_uptodate_unlock_irqrestore(head, flags);
10606 #ifdef CONFIG_EXT4_FS_ENCRYPTION
10608 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/fuse/dir.c linux-4.14/fs/fuse/dir.c
10609 --- linux-4.14.orig/fs/fuse/dir.c 2018-09-05 11:03:22.000000000 +0200
10610 +++ linux-4.14/fs/fuse/dir.c 2018-09-05 11:05:07.000000000 +0200
10611 @@ -1187,7 +1187,7 @@
10612 struct inode *dir = d_inode(parent);
10613 struct fuse_conn *fc;
10614 struct inode *inode;
10615 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10616 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10620 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/inode.c linux-4.14/fs/inode.c
10621 --- linux-4.14.orig/fs/inode.c 2018-09-05 11:03:29.000000000 +0200
10622 +++ linux-4.14/fs/inode.c 2018-09-05 11:05:07.000000000 +0200
10623 @@ -154,7 +154,7 @@
10624 inode->i_bdev = NULL;
10625 inode->i_cdev = NULL;
10626 inode->i_link = NULL;
10627 - inode->i_dir_seq = 0;
10628 + inode->__i_dir_seq = 0;
10630 inode->dirtied_when = 0;
10632 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/libfs.c linux-4.14/fs/libfs.c
10633 --- linux-4.14.orig/fs/libfs.c 2017-11-12 19:46:13.000000000 +0100
10634 +++ linux-4.14/fs/libfs.c 2018-09-05 11:05:07.000000000 +0200
10636 struct list_head *from,
10639 - unsigned *seq = &parent->d_inode->i_dir_seq, n;
10640 + unsigned *seq = &parent->d_inode->__i_dir_seq, n;
10641 struct dentry *res;
10642 struct list_head *p;
10644 @@ -123,8 +123,9 @@
10645 static void move_cursor(struct dentry *cursor, struct list_head *after)
10647 struct dentry *parent = cursor->d_parent;
10648 - unsigned n, *seq = &parent->d_inode->i_dir_seq;
10649 + unsigned n, *seq = &parent->d_inode->__i_dir_seq;
10650 spin_lock(&parent->d_lock);
10651 + preempt_disable_rt();
10654 if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
10655 @@ -137,6 +138,7 @@
10657 list_add_tail(&cursor->d_child, &parent->d_subdirs);
10658 smp_store_release(seq, n + 2);
10659 + preempt_enable_rt();
10660 spin_unlock(&parent->d_lock);
10663 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/locks.c linux-4.14/fs/locks.c
10664 --- linux-4.14.orig/fs/locks.c 2017-11-12 19:46:13.000000000 +0100
10665 +++ linux-4.14/fs/locks.c 2018-09-05 11:05:07.000000000 +0200
10666 @@ -945,7 +945,7 @@
10670 - percpu_down_read_preempt_disable(&file_rwsem);
10671 + percpu_down_read(&file_rwsem);
10672 spin_lock(&ctx->flc_lock);
10673 if (request->fl_flags & FL_ACCESS)
10674 goto find_conflict;
10675 @@ -986,7 +986,7 @@
10678 spin_unlock(&ctx->flc_lock);
10679 - percpu_up_read_preempt_enable(&file_rwsem);
10680 + percpu_up_read(&file_rwsem);
10682 locks_free_lock(new_fl);
10683 locks_dispose_list(&dispose);
10684 @@ -1023,7 +1023,7 @@
10685 new_fl2 = locks_alloc_lock();
10688 - percpu_down_read_preempt_disable(&file_rwsem);
10689 + percpu_down_read(&file_rwsem);
10690 spin_lock(&ctx->flc_lock);
10692 * New lock request. Walk all POSIX locks and look for conflicts. If
10693 @@ -1195,7 +1195,7 @@
10696 spin_unlock(&ctx->flc_lock);
10697 - percpu_up_read_preempt_enable(&file_rwsem);
10698 + percpu_up_read(&file_rwsem);
10700 * Free any unused locks.
10702 @@ -1470,7 +1470,7 @@
10706 - percpu_down_read_preempt_disable(&file_rwsem);
10707 + percpu_down_read(&file_rwsem);
10708 spin_lock(&ctx->flc_lock);
10710 time_out_leases(inode, &dispose);
10711 @@ -1522,13 +1522,13 @@
10712 locks_insert_block(fl, new_fl);
10713 trace_break_lease_block(inode, new_fl);
10714 spin_unlock(&ctx->flc_lock);
10715 - percpu_up_read_preempt_enable(&file_rwsem);
10716 + percpu_up_read(&file_rwsem);
10718 locks_dispose_list(&dispose);
10719 error = wait_event_interruptible_timeout(new_fl->fl_wait,
10720 !new_fl->fl_next, break_time);
10722 - percpu_down_read_preempt_disable(&file_rwsem);
10723 + percpu_down_read(&file_rwsem);
10724 spin_lock(&ctx->flc_lock);
10725 trace_break_lease_unblock(inode, new_fl);
10726 locks_delete_block(new_fl);
10727 @@ -1545,7 +1545,7 @@
10730 spin_unlock(&ctx->flc_lock);
10731 - percpu_up_read_preempt_enable(&file_rwsem);
10732 + percpu_up_read(&file_rwsem);
10733 locks_dispose_list(&dispose);
10734 locks_free_lock(new_fl);
10736 @@ -1619,7 +1619,7 @@
10738 ctx = smp_load_acquire(&inode->i_flctx);
10739 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
10740 - percpu_down_read_preempt_disable(&file_rwsem);
10741 + percpu_down_read(&file_rwsem);
10742 spin_lock(&ctx->flc_lock);
10743 time_out_leases(inode, &dispose);
10744 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
10745 @@ -1629,7 +1629,7 @@
10748 spin_unlock(&ctx->flc_lock);
10749 - percpu_up_read_preempt_enable(&file_rwsem);
10750 + percpu_up_read(&file_rwsem);
10752 locks_dispose_list(&dispose);
10754 @@ -1704,7 +1704,7 @@
10758 - percpu_down_read_preempt_disable(&file_rwsem);
10759 + percpu_down_read(&file_rwsem);
10760 spin_lock(&ctx->flc_lock);
10761 time_out_leases(inode, &dispose);
10762 error = check_conflicting_open(dentry, arg, lease->fl_flags);
10763 @@ -1775,7 +1775,7 @@
10764 lease->fl_lmops->lm_setup(lease, priv);
10766 spin_unlock(&ctx->flc_lock);
10767 - percpu_up_read_preempt_enable(&file_rwsem);
10768 + percpu_up_read(&file_rwsem);
10769 locks_dispose_list(&dispose);
10771 inode_unlock(inode);
10772 @@ -1798,7 +1798,7 @@
10776 - percpu_down_read_preempt_disable(&file_rwsem);
10777 + percpu_down_read(&file_rwsem);
10778 spin_lock(&ctx->flc_lock);
10779 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
10780 if (fl->fl_file == filp &&
10781 @@ -1811,7 +1811,7 @@
10783 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
10784 spin_unlock(&ctx->flc_lock);
10785 - percpu_up_read_preempt_enable(&file_rwsem);
10786 + percpu_up_read(&file_rwsem);
10787 locks_dispose_list(&dispose);
10790 @@ -2535,13 +2535,13 @@
10791 if (list_empty(&ctx->flc_lease))
10794 - percpu_down_read_preempt_disable(&file_rwsem);
10795 + percpu_down_read(&file_rwsem);
10796 spin_lock(&ctx->flc_lock);
10797 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
10798 if (filp == fl->fl_file)
10799 lease_modify(fl, F_UNLCK, &dispose);
10800 spin_unlock(&ctx->flc_lock);
10801 - percpu_up_read_preempt_enable(&file_rwsem);
10802 + percpu_up_read(&file_rwsem);
10804 locks_dispose_list(&dispose);
10806 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/namei.c linux-4.14/fs/namei.c
10807 --- linux-4.14.orig/fs/namei.c 2018-09-05 11:03:22.000000000 +0200
10808 +++ linux-4.14/fs/namei.c 2018-09-05 11:05:07.000000000 +0200
10809 @@ -1627,7 +1627,7 @@
10811 struct dentry *dentry = ERR_PTR(-ENOENT), *old;
10812 struct inode *inode = dir->d_inode;
10813 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10814 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10816 inode_lock_shared(inode);
10817 /* Don't go there if it's already dead */
10818 @@ -3100,7 +3100,7 @@
10819 struct dentry *dentry;
10820 int error, create_error = 0;
10821 umode_t mode = op->mode;
10822 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10823 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10825 if (unlikely(IS_DEADDIR(dir_inode)))
10827 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/namespace.c linux-4.14/fs/namespace.c
10828 --- linux-4.14.orig/fs/namespace.c 2018-09-05 11:03:29.000000000 +0200
10829 +++ linux-4.14/fs/namespace.c 2018-09-05 11:05:07.000000000 +0200
10831 #include <linux/mnt_namespace.h>
10832 #include <linux/user_namespace.h>
10833 #include <linux/namei.h>
10834 +#include <linux/delay.h>
10835 #include <linux/security.h>
10836 #include <linux/cred.h>
10837 #include <linux/idr.h>
10838 @@ -353,8 +354,11 @@
10839 * incremented count after it has set MNT_WRITE_HOLD.
10842 - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
10844 + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
10845 + preempt_enable();
10847 + preempt_disable();
10850 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
10851 * be set to match its requirements. So we must not load that until
10852 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/delegation.c linux-4.14/fs/nfs/delegation.c
10853 --- linux-4.14.orig/fs/nfs/delegation.c 2017-11-12 19:46:13.000000000 +0100
10854 +++ linux-4.14/fs/nfs/delegation.c 2018-09-05 11:05:07.000000000 +0200
10855 @@ -150,11 +150,11 @@
10857 /* Block nfs4_proc_unlck */
10858 mutex_lock(&sp->so_delegreturn_mutex);
10859 - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
10860 + seq = read_seqbegin(&sp->so_reclaim_seqlock);
10861 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
10863 err = nfs_delegation_claim_locks(ctx, state, stateid);
10864 - if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
10865 + if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
10867 mutex_unlock(&sp->so_delegreturn_mutex);
10868 put_nfs_open_context(ctx);
10869 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/dir.c linux-4.14/fs/nfs/dir.c
10870 --- linux-4.14.orig/fs/nfs/dir.c 2018-09-05 11:03:22.000000000 +0200
10871 +++ linux-4.14/fs/nfs/dir.c 2018-09-05 11:05:07.000000000 +0200
10872 @@ -452,7 +452,7 @@
10873 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
10875 struct qstr filename = QSTR_INIT(entry->name, entry->len);
10876 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10877 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10878 struct dentry *dentry;
10879 struct dentry *alias;
10880 struct inode *dir = d_inode(parent);
10881 @@ -1443,7 +1443,7 @@
10882 struct file *file, unsigned open_flags,
10883 umode_t mode, int *opened)
10885 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10886 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10887 struct nfs_open_context *ctx;
10888 struct dentry *res;
10889 struct iattr attr = { .ia_valid = ATTR_OPEN };
10890 @@ -1763,7 +1763,11 @@
10892 trace_nfs_rmdir_enter(dir, dentry);
10893 if (d_really_is_positive(dentry)) {
10894 +#ifdef CONFIG_PREEMPT_RT_BASE
10895 + down(&NFS_I(d_inode(dentry))->rmdir_sem);
10897 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
10899 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
10900 /* Ensure the VFS deletes this inode */
10902 @@ -1773,7 +1777,11 @@
10904 nfs_dentry_handle_enoent(dentry);
10906 +#ifdef CONFIG_PREEMPT_RT_BASE
10907 + up(&NFS_I(d_inode(dentry))->rmdir_sem);
10909 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
10912 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
10913 trace_nfs_rmdir_exit(dir, dentry, error);
10914 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/inode.c linux-4.14/fs/nfs/inode.c
10915 --- linux-4.14.orig/fs/nfs/inode.c 2017-11-12 19:46:13.000000000 +0100
10916 +++ linux-4.14/fs/nfs/inode.c 2018-09-05 11:05:07.000000000 +0200
10917 @@ -2014,7 +2014,11 @@
10918 atomic_long_set(&nfsi->nrequests, 0);
10919 atomic_long_set(&nfsi->commit_info.ncommit, 0);
10920 atomic_set(&nfsi->commit_info.rpcs_out, 0);
10921 +#ifdef CONFIG_PREEMPT_RT_BASE
10922 + sema_init(&nfsi->rmdir_sem, 1);
10924 init_rwsem(&nfsi->rmdir_sem);
10926 mutex_init(&nfsi->commit_mutex);
10927 nfs4_init_once(nfsi);
10929 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/nfs4_fs.h linux-4.14/fs/nfs/nfs4_fs.h
10930 --- linux-4.14.orig/fs/nfs/nfs4_fs.h 2018-09-05 11:03:22.000000000 +0200
10931 +++ linux-4.14/fs/nfs/nfs4_fs.h 2018-09-05 11:05:07.000000000 +0200
10932 @@ -112,7 +112,7 @@
10933 unsigned long so_flags;
10934 struct list_head so_states;
10935 struct nfs_seqid_counter so_seqid;
10936 - seqcount_t so_reclaim_seqcount;
10937 + seqlock_t so_reclaim_seqlock;
10938 struct mutex so_delegreturn_mutex;
10941 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/nfs4proc.c linux-4.14/fs/nfs/nfs4proc.c
10942 --- linux-4.14.orig/fs/nfs/nfs4proc.c 2018-09-05 11:03:22.000000000 +0200
10943 +++ linux-4.14/fs/nfs/nfs4proc.c 2018-09-05 11:05:07.000000000 +0200
10944 @@ -2689,7 +2689,7 @@
10948 - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
10949 + seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
10951 ret = _nfs4_proc_open(opendata);
10953 @@ -2727,7 +2727,7 @@
10955 if (d_inode(dentry) == state->inode) {
10956 nfs_inode_attach_open_context(ctx);
10957 - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
10958 + if (read_seqretry(&sp->so_reclaim_seqlock, seq))
10959 nfs4_schedule_stateid_recovery(server, state);
10962 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/nfs4state.c linux-4.14/fs/nfs/nfs4state.c
10963 --- linux-4.14.orig/fs/nfs/nfs4state.c 2018-09-05 11:03:22.000000000 +0200
10964 +++ linux-4.14/fs/nfs/nfs4state.c 2018-09-05 11:05:07.000000000 +0200
10965 @@ -494,7 +494,7 @@
10966 nfs4_init_seqid_counter(&sp->so_seqid);
10967 atomic_set(&sp->so_count, 1);
10968 INIT_LIST_HEAD(&sp->so_lru);
10969 - seqcount_init(&sp->so_reclaim_seqcount);
10970 + seqlock_init(&sp->so_reclaim_seqlock);
10971 mutex_init(&sp->so_delegreturn_mutex);
10974 @@ -1519,8 +1519,12 @@
10975 * recovering after a network partition or a reboot from a
10976 * server that doesn't support a grace period.
10978 +#ifdef CONFIG_PREEMPT_RT_FULL
10979 + write_seqlock(&sp->so_reclaim_seqlock);
10981 + write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
10983 spin_lock(&sp->so_lock);
10984 - raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
10986 list_for_each_entry(state, &sp->so_states, open_states) {
10987 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
10988 @@ -1589,14 +1593,20 @@
10989 spin_lock(&sp->so_lock);
10992 - raw_write_seqcount_end(&sp->so_reclaim_seqcount);
10993 spin_unlock(&sp->so_lock);
10994 +#ifdef CONFIG_PREEMPT_RT_FULL
10995 + write_sequnlock(&sp->so_reclaim_seqlock);
10997 + write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
11001 nfs4_put_open_state(state);
11002 - spin_lock(&sp->so_lock);
11003 - raw_write_seqcount_end(&sp->so_reclaim_seqcount);
11004 - spin_unlock(&sp->so_lock);
11005 +#ifdef CONFIG_PREEMPT_RT_FULL
11006 + write_sequnlock(&sp->so_reclaim_seqlock);
11008 + write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
11013 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/unlink.c linux-4.14/fs/nfs/unlink.c
11014 --- linux-4.14.orig/fs/nfs/unlink.c 2017-11-12 19:46:13.000000000 +0100
11015 +++ linux-4.14/fs/nfs/unlink.c 2018-09-05 11:05:07.000000000 +0200
11017 #include <linux/sunrpc/clnt.h>
11018 #include <linux/nfs_fs.h>
11019 #include <linux/sched.h>
11020 -#include <linux/wait.h>
11021 +#include <linux/swait.h>
11022 #include <linux/namei.h>
11023 #include <linux/fsnotify.h>
11026 rpc_restart_call_prepare(task);
11029 +#ifdef CONFIG_PREEMPT_RT_BASE
11030 +static void nfs_down_anon(struct semaphore *sema)
11035 +static void nfs_up_anon(struct semaphore *sema)
11041 +static void nfs_down_anon(struct rw_semaphore *rwsem)
11043 + down_read_non_owner(rwsem);
11046 +static void nfs_up_anon(struct rw_semaphore *rwsem)
11048 + up_read_non_owner(rwsem);
11053 * nfs_async_unlink_release - Release the sillydelete data.
11054 * @task: rpc_task of the sillydelete
11056 struct dentry *dentry = data->dentry;
11057 struct super_block *sb = dentry->d_sb;
11059 - up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11060 + nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11061 d_lookup_done(dentry);
11062 nfs_free_unlinkdata(data);
11064 @@ -118,10 +141,10 @@
11065 struct inode *dir = d_inode(dentry->d_parent);
11066 struct dentry *alias;
11068 - down_read_non_owner(&NFS_I(dir)->rmdir_sem);
11069 + nfs_down_anon(&NFS_I(dir)->rmdir_sem);
11070 alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
11071 if (IS_ERR(alias)) {
11072 - up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11073 + nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11076 if (!d_in_lookup(alias)) {
11077 @@ -143,7 +166,7 @@
11079 spin_unlock(&alias->d_lock);
11081 - up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11082 + nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11084 * If we'd displaced old cached devname, free it. At that
11085 * point dentry is definitely not a root, so we won't need
11086 @@ -183,7 +206,7 @@
11087 goto out_free_name;
11089 data->res.dir_attr = &data->dir_attr;
11090 - init_waitqueue_head(&data->wq);
11091 + init_swait_queue_head(&data->wq);
11094 spin_lock(&dentry->d_lock);
11095 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/ntfs/aops.c linux-4.14/fs/ntfs/aops.c
11096 --- linux-4.14.orig/fs/ntfs/aops.c 2017-11-12 19:46:13.000000000 +0100
11097 +++ linux-4.14/fs/ntfs/aops.c 2018-09-05 11:05:07.000000000 +0200
11098 @@ -93,13 +93,13 @@
11100 if (file_ofs < init_size)
11101 ofs = init_size - file_ofs;
11102 - local_irq_save(flags);
11103 + local_irq_save_nort(flags);
11104 kaddr = kmap_atomic(page);
11105 memset(kaddr + bh_offset(bh) + ofs, 0,
11107 flush_dcache_page(page);
11108 kunmap_atomic(kaddr);
11109 - local_irq_restore(flags);
11110 + local_irq_restore_nort(flags);
11113 clear_buffer_uptodate(bh);
11114 @@ -108,8 +108,7 @@
11115 "0x%llx.", (unsigned long long)bh->b_blocknr);
11117 first = page_buffers(page);
11118 - local_irq_save(flags);
11119 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11120 + flags = bh_uptodate_lock_irqsave(first);
11121 clear_buffer_async_read(bh);
11124 @@ -124,8 +123,7 @@
11126 tmp = tmp->b_this_page;
11127 } while (tmp != bh);
11128 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11129 - local_irq_restore(flags);
11130 + bh_uptodate_unlock_irqrestore(first, flags);
11132 * If none of the buffers had errors then we can set the page uptodate,
11133 * but we first have to perform the post read mst fixups, if the
11134 @@ -146,13 +144,13 @@
11135 recs = PAGE_SIZE / rec_size;
11136 /* Should have been verified before we got here... */
11138 - local_irq_save(flags);
11139 + local_irq_save_nort(flags);
11140 kaddr = kmap_atomic(page);
11141 for (i = 0; i < recs; i++)
11142 post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11143 i * rec_size), rec_size);
11144 kunmap_atomic(kaddr);
11145 - local_irq_restore(flags);
11146 + local_irq_restore_nort(flags);
11147 flush_dcache_page(page);
11148 if (likely(page_uptodate && !PageError(page)))
11149 SetPageUptodate(page);
11150 @@ -160,9 +158,7 @@
11154 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11155 - local_irq_restore(flags);
11157 + bh_uptodate_unlock_irqrestore(first, flags);
11161 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/proc/array.c linux-4.14/fs/proc/array.c
11162 --- linux-4.14.orig/fs/proc/array.c 2018-09-05 11:03:22.000000000 +0200
11163 +++ linux-4.14/fs/proc/array.c 2018-09-05 11:05:07.000000000 +0200
11164 @@ -386,9 +386,9 @@
11165 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
11167 seq_printf(m, "Cpus_allowed:\t%*pb\n",
11168 - cpumask_pr_args(&task->cpus_allowed));
11169 + cpumask_pr_args(task->cpus_ptr));
11170 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
11171 - cpumask_pr_args(&task->cpus_allowed));
11172 + cpumask_pr_args(task->cpus_ptr));
11175 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
11176 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/proc/base.c linux-4.14/fs/proc/base.c
11177 --- linux-4.14.orig/fs/proc/base.c 2018-09-05 11:03:28.000000000 +0200
11178 +++ linux-4.14/fs/proc/base.c 2018-09-05 11:05:07.000000000 +0200
11179 @@ -1886,7 +1886,7 @@
11181 child = d_hash_and_lookup(dir, &qname);
11183 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11184 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11185 child = d_alloc_parallel(dir, &qname, &wq);
11187 goto end_instantiate;
11188 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/proc/proc_sysctl.c linux-4.14/fs/proc/proc_sysctl.c
11189 --- linux-4.14.orig/fs/proc/proc_sysctl.c 2018-09-05 11:03:22.000000000 +0200
11190 +++ linux-4.14/fs/proc/proc_sysctl.c 2018-09-05 11:05:07.000000000 +0200
11191 @@ -679,7 +679,7 @@
11193 child = d_lookup(dir, &qname);
11195 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11196 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11197 child = d_alloc_parallel(dir, &qname, &wq);
11200 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/timerfd.c linux-4.14/fs/timerfd.c
11201 --- linux-4.14.orig/fs/timerfd.c 2017-11-12 19:46:13.000000000 +0100
11202 +++ linux-4.14/fs/timerfd.c 2018-09-05 11:05:07.000000000 +0200
11203 @@ -471,7 +471,10 @@
11206 spin_unlock_irq(&ctx->wqh.lock);
11208 + if (isalarm(ctx))
11209 + hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11211 + hrtimer_wait_for_timer(&ctx->t.tmr);
11215 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/xfs/xfs_aops.c linux-4.14/fs/xfs/xfs_aops.c
11216 --- linux-4.14.orig/fs/xfs/xfs_aops.c 2018-09-05 11:03:22.000000000 +0200
11217 +++ linux-4.14/fs/xfs/xfs_aops.c 2018-09-05 11:05:07.000000000 +0200
11218 @@ -120,8 +120,7 @@
11219 ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
11220 ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
11222 - local_irq_save(flags);
11223 - bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
11224 + flags = bh_uptodate_lock_irqsave(head);
11226 if (off >= bvec->bv_offset &&
11227 off < bvec->bv_offset + bvec->bv_len) {
11228 @@ -143,8 +142,7 @@
11231 } while ((bh = bh->b_this_page) != head);
11232 - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
11233 - local_irq_restore(flags);
11234 + bh_uptodate_unlock_irqrestore(head, flags);
11237 end_page_writeback(bvec->bv_page);
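[Editor's note: the fs/buffer.c, ext4, ntfs and xfs hunks above all replace the local_irq_save() + bit_spin_lock(BH_Uptodate_Lock, ...) pair with the bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore() helpers that this series adds to <linux/buffer_head.h> (backed by the per-buffer b_uptodate_lock visible at the end of this excerpt). A sketch of an end_io-style walk using those helpers, assuming the patched header; my_end_buffer_async_read() is illustrative:]

#include <linux/buffer_head.h>

static void my_end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *first, *tmp;
	struct page *page = bh->b_page;
	unsigned long flags;
	int page_uptodate = 1;

	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);

	first = page_buffers(page);
	flags = bh_uptodate_lock_irqsave(first);	/* replaces irq-off + bit spinlock */
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	for (tmp = bh->b_this_page; tmp != bh; tmp = tmp->b_this_page) {
		if (!buffer_uptodate(tmp))
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {
			/* another buffer is still under I/O: done for now */
			bh_uptodate_unlock_irqrestore(first, flags);
			return;
		}
	}
	bh_uptodate_unlock_irqrestore(first, flags);

	if (page_uptodate && !PageError(page))
		SetPageUptodate(page);
	unlock_page(page);
}

[End of editor's note.]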
11238 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/acpi/platform/aclinux.h linux-4.14/include/acpi/platform/aclinux.h
11239 --- linux-4.14.orig/include/acpi/platform/aclinux.h 2017-11-12 19:46:13.000000000 +0100
11240 +++ linux-4.14/include/acpi/platform/aclinux.h 2018-09-05 11:05:07.000000000 +0200
11241 @@ -134,6 +134,7 @@
11243 #define acpi_cache_t struct kmem_cache
11244 #define acpi_spinlock spinlock_t *
11245 +#define acpi_raw_spinlock raw_spinlock_t *
11246 #define acpi_cpu_flags unsigned long
11248 /* Use native linux version of acpi_os_allocate_zeroed */
11249 @@ -152,6 +153,20 @@
11250 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11251 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11253 +#define acpi_os_create_raw_lock(__handle) \
11255 + raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \
11258 + *(__handle) = lock; \
11259 + raw_spin_lock_init(*(__handle)); \
11261 + lock ? AE_OK : AE_NO_MEMORY; \
11264 +#define acpi_os_delete_raw_lock(__handle) kfree(__handle)
11268 * OSL interfaces used by debugger/disassembler
11270 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/asm-generic/bug.h linux-4.14/include/asm-generic/bug.h
11271 --- linux-4.14.orig/include/asm-generic/bug.h 2018-09-05 11:03:22.000000000 +0200
11272 +++ linux-4.14/include/asm-generic/bug.h 2018-09-05 11:05:07.000000000 +0200
11273 @@ -234,6 +234,20 @@
11274 # define WARN_ON_SMP(x) ({0;})
11277 +#ifdef CONFIG_PREEMPT_RT_BASE
11278 +# define BUG_ON_RT(c) BUG_ON(c)
11279 +# define BUG_ON_NONRT(c) do { } while (0)
11280 +# define WARN_ON_RT(condition) WARN_ON(condition)
11281 +# define WARN_ON_NONRT(condition) do { } while (0)
11282 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
11284 +# define BUG_ON_RT(c) do { } while (0)
11285 +# define BUG_ON_NONRT(c) BUG_ON(c)
11286 +# define WARN_ON_RT(condition) do { } while (0)
11287 +# define WARN_ON_NONRT(condition) WARN_ON(condition)
11288 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
11291 #endif /* __ASSEMBLY__ */
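The asm-generic/bug.h hunk above adds RT-aware assertion variants: with CONFIG_PREEMPT_RT_BASE the *_NONRT checks compile away and the *_RT checks are active, and the reverse without it. A minimal sketch of the intended use, assuming a hypothetical helper whose irqs-off requirement only holds on !RT:

#include <linux/bug.h>
#include <linux/irqflags.h>

/* Hypothetical example: this path runs with hard interrupts disabled on a
 * non-RT kernel, but in preemptible task context on PREEMPT_RT, so the
 * assertion is only meaningful (and only compiled in) for !RT. */
static void example_update_stats(void)
{
        WARN_ON_NONRT(!irqs_disabled());
        /* ... update state that is irq-protected on !RT ... */
}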
11294 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/blkdev.h linux-4.14/include/linux/blkdev.h
11295 --- linux-4.14.orig/include/linux/blkdev.h 2018-09-05 11:03:22.000000000 +0200
11296 +++ linux-4.14/include/linux/blkdev.h 2018-09-05 11:05:07.000000000 +0200
11298 #include <linux/percpu-refcount.h>
11299 #include <linux/scatterlist.h>
11300 #include <linux/blkzoned.h>
11301 +#include <linux/swork.h>
11304 struct scsi_ioctl_command;
11305 @@ -134,6 +135,9 @@
11308 struct list_head queuelist;
11309 +#ifdef CONFIG_PREEMPT_RT_FULL
11310 + struct work_struct work;
11313 struct __call_single_data csd;
11315 @@ -596,6 +600,7 @@
11317 struct rcu_head rcu_head;
11318 wait_queue_head_t mq_freeze_wq;
11319 + struct swork_event mq_pcpu_wake;
11320 struct percpu_ref q_usage_counter;
11321 struct list_head all_q_node;
11323 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/blk-mq.h linux-4.14/include/linux/blk-mq.h
11324 --- linux-4.14.orig/include/linux/blk-mq.h 2017-11-12 19:46:13.000000000 +0100
11325 +++ linux-4.14/include/linux/blk-mq.h 2018-09-05 11:05:07.000000000 +0200
11326 @@ -226,7 +226,7 @@
11327 return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
11331 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
11332 int blk_mq_request_started(struct request *rq);
11333 void blk_mq_start_request(struct request *rq);
11334 void blk_mq_end_request(struct request *rq, blk_status_t error);
11335 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/bottom_half.h linux-4.14/include/linux/bottom_half.h
11336 --- linux-4.14.orig/include/linux/bottom_half.h 2017-11-12 19:46:13.000000000 +0100
11337 +++ linux-4.14/include/linux/bottom_half.h 2018-09-05 11:05:07.000000000 +0200
11340 #include <linux/preempt.h>
11342 +#ifdef CONFIG_PREEMPT_RT_FULL
11344 +extern void __local_bh_disable(void);
11345 +extern void _local_bh_enable(void);
11346 +extern void __local_bh_enable(void);
11348 +static inline void local_bh_disable(void)
11350 + __local_bh_disable();
11353 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
11355 + __local_bh_disable();
11358 +static inline void local_bh_enable(void)
11360 + __local_bh_enable();
11363 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
11365 + __local_bh_enable();
11368 +static inline void local_bh_enable_ip(unsigned long ip)
11370 + __local_bh_enable();
11375 #ifdef CONFIG_TRACE_IRQFLAGS
11376 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
11380 __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
11384 #endif /* _LINUX_BH_H */
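With CONFIG_PREEMPT_RT_FULL the bottom_half.h hunk above reroutes the local_bh_*() API to __local_bh_disable()/__local_bh_enable(), which are declared extern here and implemented elsewhere in this patch; callers keep using the unchanged interface. A sketch of such an unchanged caller (function name and the protected data are hypothetical):

#include <linux/bottom_half.h>

static void example_sync_with_softirq(void)
{
        local_bh_disable();     /* maps to __local_bh_disable() on RT */
        /* ... touch data also accessed from softirq context ... */
        local_bh_enable();      /* maps to __local_bh_enable() on RT */
}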
11385 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/buffer_head.h linux-4.14/include/linux/buffer_head.h
11386 --- linux-4.14.orig/include/linux/buffer_head.h 2017-11-12 19:46:13.000000000 +0100
11387 +++ linux-4.14/include/linux/buffer_head.h 2018-09-05 11:05:07.000000000 +0200
11389 struct address_space *b_assoc_map; /* mapping this buffer is
11391 atomic_t b_count; /* users using this buffer_head */
11392 +#ifdef CONFIG_PREEMPT_RT_BASE
11393 + spinlock_t b_uptodate_lock;
11394 +#if IS_ENABLED(CONFIG_JBD2)
11395 + spinlock_t b_state_lock;
11396 + spinlock_t b_journal_head_lock;
11401 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
11403 + unsigned long flags;
11405 +#ifndef CONFIG_PREEMPT_RT_BASE
11406 + local_irq_save(flags);
11407 + bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
11409 + spin_lock_irqsave(&bh->b_uptodate_lock, flags);
11414 +static inline void
11415 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
11417 +#ifndef CONFIG_PREEMPT_RT_BASE
11418 + bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
11419 + local_irq_restore(flags);
11421 + spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
11425 +static inline void buffer_head_init_locks(struct buffer_head *bh)
11427 +#ifdef CONFIG_PREEMPT_RT_BASE
11428 + spin_lock_init(&bh->b_uptodate_lock);
11429 +#if IS_ENABLED(CONFIG_JBD2)
11430 + spin_lock_init(&bh->b_state_lock);
11431 + spin_lock_init(&bh->b_journal_head_lock);
11437 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
11438 * and buffer_foo() functions.
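The bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore() helpers added above hide the RT/!RT split: a bit spinlock plus local_irq_save() on !RT, the new per-buffer b_uptodate_lock spinlock on RT (a sleeping lock there). The fs/xfs/xfs_aops.c hunk earlier in this patch converts its I/O completion path to exactly this pattern; a minimal sketch of the calling convention, with the surrounding completion logic purely illustrative:

#include <linux/buffer_head.h>

static void example_end_buffer_io(struct buffer_head *head)
{
        unsigned long flags;

        flags = bh_uptodate_lock_irqsave(head);
        /* ... walk the buffers on the page and update their state bits ... */
        bh_uptodate_unlock_irqrestore(head, flags);
}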
11439 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/cgroup-defs.h linux-4.14/include/linux/cgroup-defs.h
11440 --- linux-4.14.orig/include/linux/cgroup-defs.h 2018-09-05 11:03:22.000000000 +0200
11441 +++ linux-4.14/include/linux/cgroup-defs.h 2018-09-05 11:05:07.000000000 +0200
11443 #include <linux/percpu-rwsem.h>
11444 #include <linux/workqueue.h>
11445 #include <linux/bpf-cgroup.h>
11446 +#include <linux/swork.h>
11448 #ifdef CONFIG_CGROUPS
11450 @@ -152,6 +153,7 @@
11451 /* percpu_ref killing and RCU release */
11452 struct rcu_head rcu_head;
11453 struct work_struct destroy_work;
11454 + struct swork_event destroy_swork;
11457 * PI: the parent css. Placed here for cache proximity to following
11458 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/completion.h linux-4.14/include/linux/completion.h
11459 --- linux-4.14.orig/include/linux/completion.h 2017-11-12 19:46:13.000000000 +0100
11460 +++ linux-4.14/include/linux/completion.h 2018-09-05 11:05:07.000000000 +0200
11462 * See kernel/sched/completion.c for details.
11465 -#include <linux/wait.h>
11466 +#include <linux/swait.h>
11467 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11468 #include <linux/lockdep.h>
11472 struct completion {
11474 - wait_queue_head_t wait;
11475 + struct swait_queue_head wait;
11476 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11477 struct lockdep_map_cross map;
11479 @@ -67,11 +67,11 @@
11481 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11482 #define COMPLETION_INITIALIZER(work) \
11483 - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11484 + { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11485 STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
11487 #define COMPLETION_INITIALIZER(work) \
11488 - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11489 + { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11492 #define COMPLETION_INITIALIZER_ONSTACK(work) \
11493 @@ -117,7 +117,7 @@
11494 static inline void __init_completion(struct completion *x)
11497 - init_waitqueue_head(&x->wait);
11498 + init_swait_queue_head(&x->wait);
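The completion rework above only swaps the internal wait queue for a simple swait queue; the public API (DECLARE_COMPLETION(), init_completion(), complete(), wait_for_completion(), ...) is untouched, so users like the sketch below (names hypothetical) build unchanged:

#include <linux/completion.h>

static DECLARE_COMPLETION(example_done);

static void example_worker(void)
{
        /* ... do the work ... */
        complete(&example_done);        /* wakeup goes through the swait queue here */
}

static void example_wait(void)
{
        wait_for_completion(&example_done);
}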
11502 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/cpu.h linux-4.14/include/linux/cpu.h
11503 --- linux-4.14.orig/include/linux/cpu.h 2018-09-05 11:03:22.000000000 +0200
11504 +++ linux-4.14/include/linux/cpu.h 2018-09-05 11:05:07.000000000 +0200
11505 @@ -120,6 +120,8 @@
11506 extern void cpu_hotplug_enable(void);
11507 void clear_tasks_mm_cpumask(int cpu);
11508 int cpu_down(unsigned int cpu);
11509 +extern void pin_current_cpu(void);
11510 +extern void unpin_current_cpu(void);
11512 #else /* CONFIG_HOTPLUG_CPU */
11514 @@ -130,6 +132,9 @@
11515 static inline void lockdep_assert_cpus_held(void) { }
11516 static inline void cpu_hotplug_disable(void) { }
11517 static inline void cpu_hotplug_enable(void) { }
11518 +static inline void pin_current_cpu(void) { }
11519 +static inline void unpin_current_cpu(void) { }
11521 #endif /* !CONFIG_HOTPLUG_CPU */
11523 /* Wrappers which go away once all code is converted */
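pin_current_cpu()/unpin_current_cpu() are declared above for the RT core; their implementation lives elsewhere in this patch, the intent being to keep the current CPU from being hot-unplugged for a short section, while on !HOTPLUG_CPU they are empty inlines. A purely illustrative sketch of the bracketed pattern:

#include <linux/cpu.h>

static void example_short_percpu_section(void)
{
        pin_current_cpu();      /* block hot-unplug of this CPU (RT, HOTPLUG_CPU) */
        /* ... short work that must complete on this CPU ... */
        unpin_current_cpu();
}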
11524 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/dcache.h linux-4.14/include/linux/dcache.h
11525 --- linux-4.14.orig/include/linux/dcache.h 2018-09-05 11:03:22.000000000 +0200
11526 +++ linux-4.14/include/linux/dcache.h 2018-09-05 11:05:07.000000000 +0200
11527 @@ -107,7 +107,7 @@
11530 struct list_head d_lru; /* LRU list */
11531 - wait_queue_head_t *d_wait; /* in-lookup ones only */
11532 + struct swait_queue_head *d_wait; /* in-lookup ones only */
11534 struct list_head d_child; /* child of parent list */
11535 struct list_head d_subdirs; /* our children */
11536 @@ -238,7 +238,7 @@
11537 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
11538 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
11539 extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
11540 - wait_queue_head_t *);
11541 + struct swait_queue_head *);
11542 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
11543 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
11544 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
11545 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/delay.h linux-4.14/include/linux/delay.h
11546 --- linux-4.14.orig/include/linux/delay.h 2017-11-12 19:46:13.000000000 +0100
11547 +++ linux-4.14/include/linux/delay.h 2018-09-05 11:05:07.000000000 +0200
11549 msleep(seconds * 1000);
11552 +#ifdef CONFIG_PREEMPT_RT_FULL
11553 +extern void cpu_chill(void);
11555 +# define cpu_chill() cpu_relax()
11558 #endif /* defined(_LINUX_DELAY_H) */
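cpu_chill() added above is the RT-friendly replacement for busy retry loops: on PREEMPT_RT_FULL it is a real function (implemented elsewhere in this patch as a short sleep), on !RT it degrades to cpu_relax(). A sketch of the retry pattern it targets, with the polled lock being hypothetical:

#include <linux/delay.h>
#include <linux/spinlock.h>

static void example_poll_lock(spinlock_t *lock)
{
        while (!spin_trylock(lock))
                cpu_chill();    /* yields briefly on RT instead of spinning */

        /* ... critical section ... */
        spin_unlock(lock);
}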
11559 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/fs.h linux-4.14/include/linux/fs.h
11560 --- linux-4.14.orig/include/linux/fs.h 2018-09-05 11:03:29.000000000 +0200
11561 +++ linux-4.14/include/linux/fs.h 2018-09-05 11:05:07.000000000 +0200
11562 @@ -655,7 +655,7 @@
11563 struct block_device *i_bdev;
11564 struct cdev *i_cdev;
11566 - unsigned i_dir_seq;
11567 + unsigned __i_dir_seq;
11570 __u32 i_generation;
11571 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/highmem.h linux-4.14/include/linux/highmem.h
11572 --- linux-4.14.orig/include/linux/highmem.h 2017-11-12 19:46:13.000000000 +0100
11573 +++ linux-4.14/include/linux/highmem.h 2018-09-05 11:05:07.000000000 +0200
11575 #include <linux/mm.h>
11576 #include <linux/uaccess.h>
11577 #include <linux/hardirq.h>
11578 +#include <linux/sched.h>
11580 #include <asm/cacheflush.h>
11584 static inline void *kmap_atomic(struct page *page)
11586 - preempt_disable();
11587 + preempt_disable_nort();
11588 pagefault_disable();
11589 return page_address(page);
11592 static inline void __kunmap_atomic(void *addr)
11594 pagefault_enable();
11595 - preempt_enable();
11596 + preempt_enable_nort();
11599 #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
11600 @@ -87,32 +88,51 @@
11602 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
11604 +#ifndef CONFIG_PREEMPT_RT_FULL
11605 DECLARE_PER_CPU(int, __kmap_atomic_idx);
11608 static inline int kmap_atomic_idx_push(void)
11610 +#ifndef CONFIG_PREEMPT_RT_FULL
11611 int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
11613 -#ifdef CONFIG_DEBUG_HIGHMEM
11614 +# ifdef CONFIG_DEBUG_HIGHMEM
11615 WARN_ON_ONCE(in_irq() && !irqs_disabled());
11616 BUG_ON(idx >= KM_TYPE_NR);
11621 + current->kmap_idx++;
11622 + BUG_ON(current->kmap_idx > KM_TYPE_NR);
11623 + return current->kmap_idx - 1;
11627 static inline int kmap_atomic_idx(void)
11629 +#ifndef CONFIG_PREEMPT_RT_FULL
11630 return __this_cpu_read(__kmap_atomic_idx) - 1;
11632 + return current->kmap_idx - 1;
11636 static inline void kmap_atomic_idx_pop(void)
11638 -#ifdef CONFIG_DEBUG_HIGHMEM
11639 +#ifndef CONFIG_PREEMPT_RT_FULL
11640 +# ifdef CONFIG_DEBUG_HIGHMEM
11641 int idx = __this_cpu_dec_return(__kmap_atomic_idx);
11646 __this_cpu_dec(__kmap_atomic_idx);
11649 + current->kmap_idx--;
11650 +# ifdef CONFIG_DEBUG_HIGHMEM
11651 + BUG_ON(current->kmap_idx < 0);
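On PREEMPT_RT_FULL the kmap_atomic index bookkeeping above moves from the per-CPU __kmap_atomic_idx counter to current->kmap_idx, and the !HIGHMEM kmap_atomic()/__kunmap_atomic() inlines only disable preemption on !RT via preempt_disable_nort(). Callers keep the usual pattern; a sketch with a hypothetical helper name:

#include <linux/highmem.h>
#include <linux/string.h>

static void example_zero_page(struct page *page)
{
        void *addr = kmap_atomic(page); /* stays preemptible on RT */

        memset(addr, 0, PAGE_SIZE);
        kunmap_atomic(addr);
}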
11656 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/hrtimer.h linux-4.14/include/linux/hrtimer.h
11657 --- linux-4.14.orig/include/linux/hrtimer.h 2017-11-12 19:46:13.000000000 +0100
11658 +++ linux-4.14/include/linux/hrtimer.h 2018-09-05 11:05:07.000000000 +0200
11659 @@ -22,19 +22,42 @@
11660 #include <linux/percpu.h>
11661 #include <linux/timer.h>
11662 #include <linux/timerqueue.h>
11663 +#include <linux/wait.h>
11665 struct hrtimer_clock_base;
11666 struct hrtimer_cpu_base;
11669 * Mode arguments of xxx_hrtimer functions:
11671 + * HRTIMER_MODE_ABS - Time value is absolute
11672 + * HRTIMER_MODE_REL - Time value is relative to now
11673 + * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered
11674 + * when starting the timer)
11675 + * HRTIMER_MODE_SOFT - Timer callback function will be executed in
11676 + * soft irq context
11678 enum hrtimer_mode {
11679 - HRTIMER_MODE_ABS = 0x0, /* Time value is absolute */
11680 - HRTIMER_MODE_REL = 0x1, /* Time value is relative to now */
11681 - HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */
11682 - HRTIMER_MODE_ABS_PINNED = 0x02,
11683 - HRTIMER_MODE_REL_PINNED = 0x03,
11684 + HRTIMER_MODE_ABS = 0x00,
11685 + HRTIMER_MODE_REL = 0x01,
11686 + HRTIMER_MODE_PINNED = 0x02,
11687 + HRTIMER_MODE_SOFT = 0x04,
11688 + HRTIMER_MODE_HARD = 0x08,
11690 + HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
11691 + HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
11693 + HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT,
11694 + HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT,
11696 + HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
11697 + HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
11699 + HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
11700 + HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD,
11702 + HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
11703 + HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
11708 * @base: pointer to the timer base (per cpu and per clock)
11709 * @state: state information (See bit values above)
11710 * @is_rel: Set if the timer was armed relative
11711 + * @is_soft: Set if hrtimer will be expired in soft interrupt context.
11713 * The hrtimer structure must be initialized by hrtimer_init()
11716 struct hrtimer_clock_base *base;
11723 @@ -112,9 +137,9 @@
11726 #ifdef CONFIG_64BIT
11727 -# define HRTIMER_CLOCK_BASE_ALIGN 64
11728 +# define __hrtimer_clock_base_align ____cacheline_aligned
11730 -# define HRTIMER_CLOCK_BASE_ALIGN 32
11731 +# define __hrtimer_clock_base_align
11735 @@ -123,48 +148,57 @@
11736 * @index: clock type index for per_cpu support when moving a
11737 * timer to a base on another cpu.
11738 * @clockid: clock id for per_cpu support
11739 + * @seq: seqcount around __run_hrtimer
11740 + * @running: pointer to the currently running hrtimer
11741 * @active: red black tree root node for the active timers
11742 * @get_time: function to retrieve the current time of the clock
11743 * @offset: offset of this clock to the monotonic base
11745 struct hrtimer_clock_base {
11746 struct hrtimer_cpu_base *cpu_base;
11748 + unsigned int index;
11751 + struct hrtimer *running;
11752 struct timerqueue_head active;
11753 ktime_t (*get_time)(void);
11755 -} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
11756 +} __hrtimer_clock_base_align;
11758 enum hrtimer_base_type {
11759 HRTIMER_BASE_MONOTONIC,
11760 HRTIMER_BASE_REALTIME,
11761 HRTIMER_BASE_BOOTTIME,
11763 + HRTIMER_BASE_MONOTONIC_SOFT,
11764 + HRTIMER_BASE_REALTIME_SOFT,
11765 + HRTIMER_BASE_BOOTTIME_SOFT,
11766 + HRTIMER_BASE_TAI_SOFT,
11767 HRTIMER_MAX_CLOCK_BASES,
11772 * struct hrtimer_cpu_base - the per cpu clock bases
11773 * @lock: lock protecting the base and associated clock bases
11775 - * @seq: seqcount around __run_hrtimer
11776 - * @running: pointer to the currently running hrtimer
11778 * @active_bases: Bitfield to mark bases with active timers
11779 * @clock_was_set_seq: Sequence counter of clock was set events
11780 - * @migration_enabled: The migration of hrtimers to other cpus is enabled
11781 - * @nohz_active: The nohz functionality is enabled
11782 - * @expires_next: absolute time of the next event which was scheduled
11783 - * via clock_set_next_event()
11784 - * @next_timer: Pointer to the first expiring timer
11785 - * @in_hrtirq: hrtimer_interrupt() is currently executing
11786 * @hres_active: State of high resolution mode
11787 + * @in_hrtirq: hrtimer_interrupt() is currently executing
11788 * @hang_detected: The last hrtimer interrupt detected a hang
11789  + * @softirq_activated:	set if the softirq was raised; no further update of
11790  + *			the softirq related settings is required then.
11791 * @nr_events: Total number of hrtimer interrupt events
11792 * @nr_retries: Total number of hrtimer interrupt retries
11793 * @nr_hangs: Total number of hrtimer interrupt hangs
11794 * @max_hang_time: Maximum time spent in hrtimer_interrupt
11795 + * @expires_next: absolute time of the next event, is required for remote
11796 + * hrtimer enqueue; it is the total first expiry time (hard
11797 + * and soft hrtimer are taken into account)
11798 + * @next_timer: Pointer to the first expiring timer
11799 + * @softirq_expires_next: Time to check, if soft queues needs also to be expired
11800 + * @softirq_next_timer: Pointer to the first expiring softirq based timer
11801 * @clock_base: array of clock bases for this cpu
11803 * Note: next_timer is just an optimization for __remove_hrtimer().
11804 @@ -173,31 +207,31 @@
11806 struct hrtimer_cpu_base {
11807 raw_spinlock_t lock;
11809 - struct hrtimer *running;
11811 unsigned int active_bases;
11812 unsigned int clock_was_set_seq;
11813 - bool migration_enabled;
11814 - bool nohz_active;
11815 + unsigned int hres_active : 1,
11817 + hang_detected : 1,
11818 + softirq_activated : 1;
11819 #ifdef CONFIG_HIGH_RES_TIMERS
11820 - unsigned int in_hrtirq : 1,
11822 - hang_detected : 1;
11823 - ktime_t expires_next;
11824 - struct hrtimer *next_timer;
11825 unsigned int nr_events;
11826 - unsigned int nr_retries;
11827 - unsigned int nr_hangs;
11828 + unsigned short nr_retries;
11829 + unsigned short nr_hangs;
11830 unsigned int max_hang_time;
11832 + ktime_t expires_next;
11833 + struct hrtimer *next_timer;
11834 + ktime_t softirq_expires_next;
11835 +#ifdef CONFIG_PREEMPT_RT_BASE
11836 + wait_queue_head_t wait;
11838 + struct hrtimer *softirq_next_timer;
11839 struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
11840 } ____cacheline_aligned;
11842 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
11844 - BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN);
11846 timer->node.expires = time;
11847 timer->_softexpires = time;
11849 @@ -266,16 +300,17 @@
11850 return timer->base->get_time();
11853 +static inline int hrtimer_is_hres_active(struct hrtimer *timer)
11855 + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
11856 + timer->base->cpu_base->hres_active : 0;
11859 #ifdef CONFIG_HIGH_RES_TIMERS
11860 struct clock_event_device;
11862 extern void hrtimer_interrupt(struct clock_event_device *dev);
11864 -static inline int hrtimer_is_hres_active(struct hrtimer *timer)
11866 - return timer->base->cpu_base->hres_active;
11870 * The resolution of the clocks. The resolution value is returned in
11871 * the clock_getres() system call to give application programmers an
11872 @@ -298,11 +333,6 @@
11874 #define hrtimer_resolution (unsigned int)LOW_RES_NSEC
11876 -static inline int hrtimer_is_hres_active(struct hrtimer *timer)
11881 static inline void clock_was_set_delayed(void) { }
11884 @@ -344,10 +374,17 @@
11885 /* Initialize timers: */
11886 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
11887 enum hrtimer_mode mode);
11888 +extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
11889 + enum hrtimer_mode mode,
11890 + struct task_struct *task);
11892 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
11893 extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
11894 enum hrtimer_mode mode);
11895 +extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
11896 + clockid_t clock_id,
11897 + enum hrtimer_mode mode,
11898 + struct task_struct *task);
11900 extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
11902 @@ -357,6 +394,15 @@
11904 hrtimer_init(timer, which_clock, mode);
11907 +static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
11908 + clockid_t clock_id,
11909 + enum hrtimer_mode mode,
11910 + struct task_struct *task)
11912 + hrtimer_init_sleeper(sl, clock_id, mode, task);
11915 static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
11918 @@ -365,11 +411,12 @@
11919 u64 range_ns, const enum hrtimer_mode mode);
11922 - * hrtimer_start - (re)start an hrtimer on the current CPU
11923 + * hrtimer_start - (re)start an hrtimer
11924 * @timer: the timer to be added
11925 * @tim: expiry time
11926 - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
11927 - * relative (HRTIMER_MODE_REL)
11928 + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
11929 + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
11930 + * softirq based mode is considered for debug purpose only!
11932 static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
11933 const enum hrtimer_mode mode)
11934 @@ -396,6 +443,13 @@
11935 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
11938 +/* Softirq preemption could deadlock timer removal */
11939 +#ifdef CONFIG_PREEMPT_RT_BASE
11940 + extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
11942 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
11945 /* Query timers: */
11946 extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
11948 @@ -420,9 +474,9 @@
11949 * Helper function to check, whether the timer is running the callback
11952 -static inline int hrtimer_callback_running(struct hrtimer *timer)
11953 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
11955 - return timer->base->cpu_base->running == timer;
11956 + return timer->base->running == timer;
11959 /* Forward a hrtimer so it expires after now: */
11960 @@ -458,15 +512,12 @@
11961 const enum hrtimer_mode mode,
11962 const clockid_t clockid);
11964 -extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
11965 - struct task_struct *tsk);
11967 extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
11968 const enum hrtimer_mode mode);
11969 extern int schedule_hrtimeout_range_clock(ktime_t *expires,
11971 const enum hrtimer_mode mode,
11973 + clockid_t clock_id);
11974 extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
11976 /* Soft interrupt function to run the hrtimer queues: */
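The hrtimer changes above introduce explicit soft/hard expiry modes, move the 'running' pointer into the per-clock base, and add hrtimer_wait_for_timer() so RT code can wait for a running callback instead of busy-looping. A sketch of arming a softirq-expiring timer with the new mode bits; callback body and period are illustrative:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
        /* expires in softirq context because of HRTIMER_MODE_REL_SOFT */
        return HRTIMER_NORESTART;
}

static void example_arm_timer(void)
{
        hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
        example_timer.function = example_timer_fn;
        hrtimer_start(&example_timer, ms_to_ktime(10), HRTIMER_MODE_REL_SOFT);
}

static void example_stop_timer(void)
{
        hrtimer_cancel(&example_timer);
        /* callers that must wait for a running callback without spinning use
         * hrtimer_wait_for_timer(&example_timer) on RT, as in the fs/timerfd.c
         * hunk earlier in this patch */
}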
11977 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/idr.h linux-4.14/include/linux/idr.h
11978 --- linux-4.14.orig/include/linux/idr.h 2017-11-12 19:46:13.000000000 +0100
11979 +++ linux-4.14/include/linux/idr.h 2018-09-05 11:05:07.000000000 +0200
11980 @@ -167,10 +167,7 @@
11981 * Each idr_preload() should be matched with an invocation of this
11982 * function. See idr_preload() for details.
11984 -static inline void idr_preload_end(void)
11986 - preempt_enable();
11988 +void idr_preload_end(void);
11991 * idr_find - return pointer for given id
11992 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/init_task.h linux-4.14/include/linux/init_task.h
11993 --- linux-4.14.orig/include/linux/init_task.h 2017-11-12 19:46:13.000000000 +0100
11994 +++ linux-4.14/include/linux/init_task.h 2018-09-05 11:05:07.000000000 +0200
11995 @@ -163,6 +163,12 @@
11996 # define INIT_PERF_EVENTS(tsk)
11999 +#if defined(CONFIG_POSIX_TIMERS) && defined(CONFIG_PREEMPT_RT_BASE)
12000 +# define INIT_TIMER_LIST .posix_timer_list = NULL,
12002 +# define INIT_TIMER_LIST
12005 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12006 # define INIT_VTIME(tsk) \
12007 .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \
12008 @@ -234,7 +240,8 @@
12009 .static_prio = MAX_PRIO-20, \
12010 .normal_prio = MAX_PRIO-20, \
12011 .policy = SCHED_NORMAL, \
12012 - .cpus_allowed = CPU_MASK_ALL, \
12013 + .cpus_ptr = &tsk.cpus_mask, \
12014 + .cpus_mask = CPU_MASK_ALL, \
12015 .nr_cpus_allowed= NR_CPUS, \
12017 .active_mm = &init_mm, \
12018 @@ -276,6 +283,7 @@
12019 INIT_CPU_TIMERS(tsk) \
12020 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
12021 .timer_slack_ns = 50000, /* 50 usec default slack */ \
12022 + INIT_TIMER_LIST \
12024 [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
12025 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
12026 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/interrupt.h linux-4.14/include/linux/interrupt.h
12027 --- linux-4.14.orig/include/linux/interrupt.h 2018-09-05 11:03:22.000000000 +0200
12028 +++ linux-4.14/include/linux/interrupt.h 2018-09-05 11:05:07.000000000 +0200
12030 #include <linux/hrtimer.h>
12031 #include <linux/kref.h>
12032 #include <linux/workqueue.h>
12033 +#include <linux/swork.h>
12035 #include <linux/atomic.h>
12036 #include <asm/ptrace.h>
12038 * interrupt handler after suspending interrupts. For system
12039 * wakeup devices users need to implement wakeup detection in
12040 * their interrupt handlers.
12041 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12043 #define IRQF_SHARED 0x00000080
12044 #define IRQF_PROBE_SHARED 0x00000100
12046 #define IRQF_NO_THREAD 0x00010000
12047 #define IRQF_EARLY_RESUME 0x00020000
12048 #define IRQF_COND_SUSPEND 0x00040000
12049 +#define IRQF_NO_SOFTIRQ_CALL 0x00080000
12051 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12053 @@ -207,7 +210,7 @@
12054 #ifdef CONFIG_LOCKDEP
12055 # define local_irq_enable_in_hardirq() do { } while (0)
12057 -# define local_irq_enable_in_hardirq() local_irq_enable()
12058 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12061 extern void disable_irq_nosync(unsigned int irq);
12062 @@ -227,6 +230,7 @@
12063 * struct irq_affinity_notify - context for notification of IRQ affinity changes
12064 * @irq: Interrupt to which notification applies
12065 * @kref: Reference count, for internal use
12066 + * @swork: Swork item, for internal use
12067 * @work: Work item, for internal use
12068 * @notify: Function to be called on change. This will be
12069 * called in process context.
12070 @@ -238,7 +242,11 @@
12071 struct irq_affinity_notify {
12074 +#ifdef CONFIG_PREEMPT_RT_BASE
12075 + struct swork_event swork;
12077 struct work_struct work;
12079 void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12080 void (*release)(struct kref *ref);
12082 @@ -429,9 +437,13 @@
12085 #ifdef CONFIG_IRQ_FORCED_THREADING
12086 +# ifndef CONFIG_PREEMPT_RT_BASE
12087 extern bool force_irqthreads;
12089 +# define force_irqthreads (true)
12092 -#define force_irqthreads (0)
12093 +#define force_irqthreads (false)
12096 #ifndef __ARCH_SET_SOFTIRQ_PENDING
12097 @@ -488,9 +500,10 @@
12098 void (*action)(struct softirq_action *);
12101 +#ifndef CONFIG_PREEMPT_RT_FULL
12102 asmlinkage void do_softirq(void);
12103 asmlinkage void __do_softirq(void);
12105 +static inline void thread_do_softirq(void) { do_softirq(); }
12106 #ifdef __ARCH_HAS_DO_SOFTIRQ
12107 void do_softirq_own_stack(void);
12109 @@ -499,13 +512,25 @@
12114 +extern void thread_do_softirq(void);
12117 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12118 extern void softirq_init(void);
12119 extern void __raise_softirq_irqoff(unsigned int nr);
12120 +#ifdef CONFIG_PREEMPT_RT_FULL
12121 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12123 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12125 + __raise_softirq_irqoff(nr);
12129 extern void raise_softirq_irqoff(unsigned int nr);
12130 extern void raise_softirq(unsigned int nr);
12131 +extern void softirq_check_pending_idle(void);
12133 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12135 @@ -527,8 +552,9 @@
12136 to be executed on some cpu at least once after this.
12137 * If the tasklet is already scheduled, but its execution is still not
12138 started, it will be executed only once.
12139 - * If this tasklet is already running on another CPU (or schedule is called
12140 - from tasklet itself), it is rescheduled for later.
12141 + * If this tasklet is already running on another CPU, it is rescheduled
12143 + * Schedule must not be called from the tasklet itself (a lockup occurs)
12144 * Tasklet is strictly serialized wrt itself, but not
12145 wrt another tasklets. If client needs some intertask synchronization,
12146 he makes it with spinlocks.
12147 @@ -553,27 +579,36 @@
12150 TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
12151 - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
12152 + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */
12153 + TASKLET_STATE_PENDING /* Tasklet is pending */
12157 +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED)
12158 +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN)
12159 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12161 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12162 static inline int tasklet_trylock(struct tasklet_struct *t)
12164 return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12167 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
12169 + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12172 static inline void tasklet_unlock(struct tasklet_struct *t)
12174 smp_mb__before_atomic();
12175 clear_bit(TASKLET_STATE_RUN, &(t)->state);
12178 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12180 - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12182 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
12185 #define tasklet_trylock(t) 1
12186 +#define tasklet_tryunlock(t) 1
12187 #define tasklet_unlock_wait(t) do { } while (0)
12188 #define tasklet_unlock(t) do { } while (0)
12190 @@ -607,41 +642,17 @@
12194 -static inline void tasklet_enable(struct tasklet_struct *t)
12196 - smp_mb__before_atomic();
12197 - atomic_dec(&t->count);
12200 +extern void tasklet_enable(struct tasklet_struct *t);
12201 extern void tasklet_kill(struct tasklet_struct *t);
12202 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12203 extern void tasklet_init(struct tasklet_struct *t,
12204 void (*func)(unsigned long), unsigned long data);
12206 -struct tasklet_hrtimer {
12207 - struct hrtimer timer;
12208 - struct tasklet_struct tasklet;
12209 - enum hrtimer_restart (*function)(struct hrtimer *);
12213 -tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
12214 - enum hrtimer_restart (*function)(struct hrtimer *),
12215 - clockid_t which_clock, enum hrtimer_mode mode);
12218 -void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
12219 - const enum hrtimer_mode mode)
12221 - hrtimer_start(&ttimer->timer, time, mode);
12225 -void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12227 - hrtimer_cancel(&ttimer->timer);
12228 - tasklet_kill(&ttimer->tasklet);
12230 +#ifdef CONFIG_PREEMPT_RT_FULL
12231 +extern void softirq_early_init(void);
12233 +static inline void softirq_early_init(void) { }
12237 * Autoprobing for irqs:
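Among the interrupt.h changes above, IRQF_NO_SOFTIRQ_CALL lets a handler opt out of softirq processing in its irq-thread context on RT. A sketch of passing the flag at request time; irq number, handler and device cookie are hypothetical:

#include <linux/interrupt.h>

static irqreturn_t example_handler(int irq, void *dev_id)
{
        /* ... acknowledge the device ... */
        return IRQ_HANDLED;
}

static int example_request(unsigned int irq, void *dev)
{
        /* RT: do not run pending softirqs from this handler's irq thread */
        return request_irq(irq, example_handler, IRQF_NO_SOFTIRQ_CALL,
                           "example-dev", dev);
}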
12238 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irqdesc.h linux-4.14/include/linux/irqdesc.h
12239 --- linux-4.14.orig/include/linux/irqdesc.h 2017-11-12 19:46:13.000000000 +0100
12240 +++ linux-4.14/include/linux/irqdesc.h 2018-09-05 11:05:07.000000000 +0200
12242 unsigned int irqs_unhandled;
12243 atomic_t threads_handled;
12244 int threads_handled_last;
12246 raw_spinlock_t lock;
12247 struct cpumask *percpu_enabled;
12248 const struct cpumask *percpu_affinity;
12249 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irqflags.h linux-4.14/include/linux/irqflags.h
12250 --- linux-4.14.orig/include/linux/irqflags.h 2017-11-12 19:46:13.000000000 +0100
12251 +++ linux-4.14/include/linux/irqflags.h 2018-09-05 11:05:07.000000000 +0200
12253 current->hardirq_context--; \
12254 crossrelease_hist_end(XHLOCK_HARD); \
12256 -# define lockdep_softirq_enter() \
12258 - current->softirq_context++; \
12259 - crossrelease_hist_start(XHLOCK_SOFT); \
12261 -# define lockdep_softirq_exit() \
12263 - current->softirq_context--; \
12264 - crossrelease_hist_end(XHLOCK_SOFT); \
12266 # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
12268 # define trace_hardirqs_on() do { } while (0)
12270 # define trace_softirqs_enabled(p) 0
12271 # define trace_hardirq_enter() do { } while (0)
12272 # define trace_hardirq_exit() do { } while (0)
12273 +# define INIT_TRACE_IRQFLAGS
12276 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12277 +# define lockdep_softirq_enter() \
12279 + current->softirq_context++; \
12280 + crossrelease_hist_start(XHLOCK_SOFT); \
12282 +# define lockdep_softirq_exit() \
12284 + current->softirq_context--; \
12285 + crossrelease_hist_end(XHLOCK_SOFT); \
12288 # define lockdep_softirq_enter() do { } while (0)
12289 # define lockdep_softirq_exit() do { } while (0)
12290 -# define INIT_TRACE_IRQFLAGS
12293 #if defined(CONFIG_IRQSOFF_TRACER) || \
12294 @@ -165,4 +169,23 @@
12296 #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12299 + * local_irq* variants depending on RT/!RT
12301 +#ifdef CONFIG_PREEMPT_RT_FULL
12302 +# define local_irq_disable_nort() do { } while (0)
12303 +# define local_irq_enable_nort() do { } while (0)
12304 +# define local_irq_save_nort(flags) local_save_flags(flags)
12305 +# define local_irq_restore_nort(flags) (void)(flags)
12306 +# define local_irq_disable_rt() local_irq_disable()
12307 +# define local_irq_enable_rt() local_irq_enable()
12309 +# define local_irq_disable_nort() local_irq_disable()
12310 +# define local_irq_enable_nort() local_irq_enable()
12311 +# define local_irq_save_nort(flags) local_irq_save(flags)
12312 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
12313 +# define local_irq_disable_rt() do { } while (0)
12314 +# define local_irq_enable_rt() do { } while (0)
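The local_irq_*_nort()/_rt() macros above let code state whether an irq-disabled section is only needed on !RT (the _nort variants) or only on RT (the _rt variants) without an explicit #ifdef. A sketch using the _nort form, with the protected state hypothetical:

#include <linux/irqflags.h>

static void example_touch_state(void)
{
        unsigned long flags;

        local_irq_save_nort(flags);     /* real local_irq_save() only on !RT */
        /* ... state that is irq-protected on !RT and lock-protected on RT ... */
        local_irq_restore_nort(flags);
}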
12318 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irq.h linux-4.14/include/linux/irq.h
12319 --- linux-4.14.orig/include/linux/irq.h 2018-09-05 11:03:22.000000000 +0200
12320 +++ linux-4.14/include/linux/irq.h 2018-09-05 11:05:07.000000000 +0200
12322 * IRQ_IS_POLLED - Always polled by another interrupt. Exclude
12323 * it from the spurious interrupt detection
12324 * mechanism and from core side polling.
12325 + * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT)
12326 * IRQ_DISABLE_UNLAZY - Disable lazy irq disable
12329 @@ -101,13 +102,14 @@
12330 IRQ_PER_CPU_DEVID = (1 << 17),
12331 IRQ_IS_POLLED = (1 << 18),
12332 IRQ_DISABLE_UNLAZY = (1 << 19),
12333 + IRQ_NO_SOFTIRQ_CALL = (1 << 20),
12336 #define IRQF_MODIFY_MASK \
12337 (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12338 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12339 IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12340 - IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12341 + IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12343 #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
12345 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irq_work.h linux-4.14/include/linux/irq_work.h
12346 --- linux-4.14.orig/include/linux/irq_work.h 2017-11-12 19:46:13.000000000 +0100
12347 +++ linux-4.14/include/linux/irq_work.h 2018-09-05 11:05:07.000000000 +0200
12349 #define IRQ_WORK_BUSY 2UL
12350 #define IRQ_WORK_FLAGS 3UL
12351 #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */
12352 +#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */
12355 unsigned long flags;
12357 static inline void irq_work_run(void) { }
12360 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12361 +void irq_work_tick_soft(void);
12363 +static inline void irq_work_tick_soft(void) { }
12366 #endif /* _LINUX_IRQ_WORK_H */
12367 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/jbd2.h linux-4.14/include/linux/jbd2.h
12368 --- linux-4.14.orig/include/linux/jbd2.h 2018-09-05 11:03:22.000000000 +0200
12369 +++ linux-4.14/include/linux/jbd2.h 2018-09-05 11:05:07.000000000 +0200
12370 @@ -347,32 +347,56 @@
12372 static inline void jbd_lock_bh_state(struct buffer_head *bh)
12374 +#ifndef CONFIG_PREEMPT_RT_BASE
12375 bit_spin_lock(BH_State, &bh->b_state);
12377 + spin_lock(&bh->b_state_lock);
12381 static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12383 +#ifndef CONFIG_PREEMPT_RT_BASE
12384 return bit_spin_trylock(BH_State, &bh->b_state);
12386 + return spin_trylock(&bh->b_state_lock);
12390 static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12392 +#ifndef CONFIG_PREEMPT_RT_BASE
12393 return bit_spin_is_locked(BH_State, &bh->b_state);
12395 + return spin_is_locked(&bh->b_state_lock);
12399 static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12401 +#ifndef CONFIG_PREEMPT_RT_BASE
12402 bit_spin_unlock(BH_State, &bh->b_state);
12404 + spin_unlock(&bh->b_state_lock);
12408 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12410 +#ifndef CONFIG_PREEMPT_RT_BASE
12411 bit_spin_lock(BH_JournalHead, &bh->b_state);
12413 + spin_lock(&bh->b_journal_head_lock);
12417 static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12419 +#ifndef CONFIG_PREEMPT_RT_BASE
12420 bit_spin_unlock(BH_JournalHead, &bh->b_state);
12422 + spin_unlock(&bh->b_journal_head_lock);
12426 #define J_ASSERT(assert) BUG_ON(!(assert))
12427 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/kdb.h linux-4.14/include/linux/kdb.h
12428 --- linux-4.14.orig/include/linux/kdb.h 2017-11-12 19:46:13.000000000 +0100
12429 +++ linux-4.14/include/linux/kdb.h 2018-09-05 11:05:07.000000000 +0200
12430 @@ -167,6 +167,7 @@
12431 extern __printf(1, 2) int kdb_printf(const char *, ...);
12432 typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12434 +#define in_kdb_printk() (kdb_trap_printk)
12435 extern void kdb_init(int level);
12437 /* Access to kdb specific polling devices */
12438 @@ -201,6 +202,7 @@
12439 extern int kdb_unregister(char *);
12440 #else /* ! CONFIG_KGDB_KDB */
12441 static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12442 +#define in_kdb_printk() (0)
12443 static inline void kdb_init(int level) {}
12444 static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12445 char *help, short minlen) { return 0; }
12446 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/kernel.h linux-4.14/include/linux/kernel.h
12447 --- linux-4.14.orig/include/linux/kernel.h 2017-11-12 19:46:13.000000000 +0100
12448 +++ linux-4.14/include/linux/kernel.h 2018-09-05 11:05:07.000000000 +0200
12449 @@ -225,6 +225,9 @@
12451 # define might_sleep() \
12452 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12454 +# define might_sleep_no_state_check() \
12455 + do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12456 # define sched_annotate_sleep() (current->task_state_change = 0)
12458 static inline void ___might_sleep(const char *file, int line,
12459 @@ -232,6 +235,7 @@
12460 static inline void __might_sleep(const char *file, int line,
12461 int preempt_offset) { }
12462 # define might_sleep() do { might_resched(); } while (0)
12463 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
12464 # define sched_annotate_sleep() do { } while (0)
12467 @@ -531,6 +535,7 @@
12474 #define TAINT_PROPRIETARY_MODULE 0
12475 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/list_bl.h linux-4.14/include/linux/list_bl.h
12476 --- linux-4.14.orig/include/linux/list_bl.h 2017-11-12 19:46:13.000000000 +0100
12477 +++ linux-4.14/include/linux/list_bl.h 2018-09-05 11:05:07.000000000 +0200
12479 #define _LINUX_LIST_BL_H
12481 #include <linux/list.h>
12482 +#include <linux/spinlock.h>
12483 #include <linux/bit_spinlock.h>
12486 @@ -33,13 +34,24 @@
12488 struct hlist_bl_head {
12489 struct hlist_bl_node *first;
12490 +#ifdef CONFIG_PREEMPT_RT_BASE
12491 + raw_spinlock_t lock;
12495 struct hlist_bl_node {
12496 struct hlist_bl_node *next, **pprev;
12498 -#define INIT_HLIST_BL_HEAD(ptr) \
12499 - ((ptr)->first = NULL)
12501 +#ifdef CONFIG_PREEMPT_RT_BASE
12502 +#define INIT_HLIST_BL_HEAD(h) \
12504 + (h)->first = NULL; \
12505 + raw_spin_lock_init(&(h)->lock); \
12508 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
12511 static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
12513 @@ -119,12 +131,26 @@
12515 static inline void hlist_bl_lock(struct hlist_bl_head *b)
12517 +#ifndef CONFIG_PREEMPT_RT_BASE
12518 bit_spin_lock(0, (unsigned long *)b);
12520 + raw_spin_lock(&b->lock);
12521 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12522 + __set_bit(0, (unsigned long *)b);
12527 static inline void hlist_bl_unlock(struct hlist_bl_head *b)
12529 +#ifndef CONFIG_PREEMPT_RT_BASE
12530 __bit_spin_unlock(0, (unsigned long *)b);
12532 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12533 + __clear_bit(0, (unsigned long *)b);
12535 + raw_spin_unlock(&b->lock);
12539 static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
12540 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/locallock.h linux-4.14/include/linux/locallock.h
12541 --- linux-4.14.orig/include/linux/locallock.h 1970-01-01 01:00:00.000000000 +0100
12542 +++ linux-4.14/include/linux/locallock.h 2018-09-05 11:05:07.000000000 +0200
12544 +#ifndef _LINUX_LOCALLOCK_H
12545 +#define _LINUX_LOCALLOCK_H
12547 +#include <linux/percpu.h>
12548 +#include <linux/spinlock.h>
12550 +#ifdef CONFIG_PREEMPT_RT_BASE
12552 +#ifdef CONFIG_DEBUG_SPINLOCK
12553 +# define LL_WARN(cond) WARN_ON(cond)
12555 +# define LL_WARN(cond) do { } while (0)
12559 + * per cpu lock based substitute for local_irq_*()
12561 +struct local_irq_lock {
12563 + struct task_struct *owner;
12565 + unsigned long flags;
12568 +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
12569 + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
12570 + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
12572 +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
12573 + DECLARE_PER_CPU(struct local_irq_lock, lvar)
12575 +#define local_irq_lock_init(lvar) \
12578 + for_each_possible_cpu(__cpu) \
12579 + spin_lock_init(&per_cpu(lvar, __cpu).lock); \
12582 +static inline void __local_lock(struct local_irq_lock *lv)
12584 + if (lv->owner != current) {
12585 + spin_lock(&lv->lock);
12586 + LL_WARN(lv->owner);
12587 + LL_WARN(lv->nestcnt);
12588 + lv->owner = current;
12593 +#define local_lock(lvar) \
12594 + do { __local_lock(&get_local_var(lvar)); } while (0)
12596 +#define local_lock_on(lvar, cpu) \
12597 + do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
12599 +static inline int __local_trylock(struct local_irq_lock *lv)
12601 + if (lv->owner != current && spin_trylock(&lv->lock)) {
12602 + LL_WARN(lv->owner);
12603 + LL_WARN(lv->nestcnt);
12604 + lv->owner = current;
12607 + } else if (lv->owner == current) {
12614 +#define local_trylock(lvar) \
12617 + __locked = __local_trylock(&get_local_var(lvar)); \
12619 + put_local_var(lvar); \
12623 +static inline void __local_unlock(struct local_irq_lock *lv)
12625 + LL_WARN(lv->nestcnt == 0);
12626 + LL_WARN(lv->owner != current);
12627 + if (--lv->nestcnt)
12630 + lv->owner = NULL;
12631 + spin_unlock(&lv->lock);
12634 +#define local_unlock(lvar) \
12636 + __local_unlock(this_cpu_ptr(&lvar)); \
12637 + put_local_var(lvar); \
12640 +#define local_unlock_on(lvar, cpu) \
12641 + do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
12643 +static inline void __local_lock_irq(struct local_irq_lock *lv)
12645 + spin_lock_irqsave(&lv->lock, lv->flags);
12646 + LL_WARN(lv->owner);
12647 + LL_WARN(lv->nestcnt);
12648 + lv->owner = current;
12652 +#define local_lock_irq(lvar) \
12653 + do { __local_lock_irq(&get_local_var(lvar)); } while (0)
12655 +#define local_lock_irq_on(lvar, cpu) \
12656 + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
12658 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
12660 + LL_WARN(!lv->nestcnt);
12661 + LL_WARN(lv->owner != current);
12662 + lv->owner = NULL;
12664 + spin_unlock_irq(&lv->lock);
12667 +#define local_unlock_irq(lvar) \
12669 + __local_unlock_irq(this_cpu_ptr(&lvar)); \
12670 + put_local_var(lvar); \
12673 +#define local_unlock_irq_on(lvar, cpu) \
12675 + __local_unlock_irq(&per_cpu(lvar, cpu)); \
12678 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
12680 + if (lv->owner != current) {
12681 + __local_lock_irq(lv);
12689 +#define local_lock_irqsave(lvar, _flags) \
12691 + if (__local_lock_irqsave(&get_local_var(lvar))) \
12692 + put_local_var(lvar); \
12693 + _flags = __this_cpu_read(lvar.flags); \
12696 +#define local_lock_irqsave_on(lvar, _flags, cpu) \
12698 + __local_lock_irqsave(&per_cpu(lvar, cpu)); \
12699 + _flags = per_cpu(lvar, cpu).flags; \
12702 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
12703 + unsigned long flags)
12705 + LL_WARN(!lv->nestcnt);
12706 + LL_WARN(lv->owner != current);
12707 + if (--lv->nestcnt)
12710 + lv->owner = NULL;
12711 + spin_unlock_irqrestore(&lv->lock, lv->flags);
12715 +#define local_unlock_irqrestore(lvar, flags) \
12717 + if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
12718 + put_local_var(lvar); \
12721 +#define local_unlock_irqrestore_on(lvar, flags, cpu) \
12723 + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
12726 +#define local_spin_trylock_irq(lvar, lock) \
12729 + local_lock_irq(lvar); \
12730 + __locked = spin_trylock(lock); \
12732 + local_unlock_irq(lvar); \
12736 +#define local_spin_lock_irq(lvar, lock) \
12738 + local_lock_irq(lvar); \
12739 + spin_lock(lock); \
12742 +#define local_spin_unlock_irq(lvar, lock) \
12744 + spin_unlock(lock); \
12745 + local_unlock_irq(lvar); \
12748 +#define local_spin_lock_irqsave(lvar, lock, flags) \
12750 + local_lock_irqsave(lvar, flags); \
12751 + spin_lock(lock); \
12754 +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
12756 + spin_unlock(lock); \
12757 + local_unlock_irqrestore(lvar, flags); \
12760 +#define get_locked_var(lvar, var) \
12762 + local_lock(lvar); \
12763 + this_cpu_ptr(&var); \
12766 +#define put_locked_var(lvar, var) local_unlock(lvar);
12768 +#define local_lock_cpu(lvar) \
12770 + local_lock(lvar); \
12771 + smp_processor_id(); \
12774 +#define local_unlock_cpu(lvar) local_unlock(lvar)
12776 +#else /* PREEMPT_RT_BASE */
12778 +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
12779 +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
12781 +static inline void local_irq_lock_init(int lvar) { }
12783 +#define local_trylock(lvar) \
12785 + preempt_disable(); \
12789 +#define local_lock(lvar) preempt_disable()
12790 +#define local_unlock(lvar) preempt_enable()
12791 +#define local_lock_irq(lvar) local_irq_disable()
12792 +#define local_lock_irq_on(lvar, cpu) local_irq_disable()
12793 +#define local_unlock_irq(lvar) local_irq_enable()
12794 +#define local_unlock_irq_on(lvar, cpu) local_irq_enable()
12795 +#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
12796 +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
12798 +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
12799 +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
12800 +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
12801 +#define local_spin_lock_irqsave(lvar, lock, flags) \
12802 + spin_lock_irqsave(lock, flags)
12803 +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
12804 + spin_unlock_irqrestore(lock, flags)
12806 +#define get_locked_var(lvar, var) get_cpu_var(var)
12807 +#define put_locked_var(lvar, var) put_cpu_var(var)
12809 +#define local_lock_cpu(lvar) get_cpu()
12810 +#define local_unlock_cpu(lvar) put_cpu()
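locallock.h is a new header: on PREEMPT_RT_BASE a local lock is a per-CPU spinlock with owner and nesting tracking, while on !RT the same macros collapse to preempt/irq disabling. Later hunks in this patch (for example the x_tables one at the end of this section) use it to protect per-CPU data; a sketch of that pattern with hypothetical variable names:

#include <linux/locallock.h>
#include <linux/percpu.h>

struct example_pcpu {
        unsigned long count;
};

static DEFINE_PER_CPU(struct example_pcpu, example_data);
DEFINE_LOCAL_IRQ_LOCK(example_lock);

static void example_update(void)
{
        struct example_pcpu *p;
        unsigned long flags;

        local_lock_irqsave(example_lock, flags);
        p = this_cpu_ptr(&example_data);
        p->count++;
        local_unlock_irqrestore(example_lock, flags);
}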
12815 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/mm_types.h linux-4.14/include/linux/mm_types.h
12816 --- linux-4.14.orig/include/linux/mm_types.h 2018-09-05 11:03:28.000000000 +0200
12817 +++ linux-4.14/include/linux/mm_types.h 2018-09-05 11:05:07.000000000 +0200
12819 #include <linux/completion.h>
12820 #include <linux/cpumask.h>
12821 #include <linux/uprobes.h>
12822 +#include <linux/rcupdate.h>
12823 #include <linux/page-flags-layout.h>
12824 #include <linux/workqueue.h>
12826 @@ -498,6 +499,9 @@
12827 bool tlb_flush_batched;
12829 struct uprobes_state uprobes_state;
12830 +#ifdef CONFIG_PREEMPT_RT_BASE
12831 + struct rcu_head delayed_drop;
12833 #ifdef CONFIG_HUGETLB_PAGE
12834 atomic_long_t hugetlb_usage;
12836 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/mutex.h linux-4.14/include/linux/mutex.h
12837 --- linux-4.14.orig/include/linux/mutex.h 2017-11-12 19:46:13.000000000 +0100
12838 +++ linux-4.14/include/linux/mutex.h 2018-09-05 11:05:07.000000000 +0200
12841 struct ww_acquire_ctx;
12843 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12844 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
12845 + , .dep_map = { .name = #lockname }
12847 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
12850 +#ifdef CONFIG_PREEMPT_RT_FULL
12851 +# include <linux/mutex_rt.h>
12855 * Simple, straightforward mutexes with strict semantics:
12857 @@ -114,13 +125,6 @@
12858 __mutex_init((mutex), #mutex, &__key); \
12861 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
12862 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
12863 - , .dep_map = { .name = #lockname }
12865 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
12868 #define __MUTEX_INITIALIZER(lockname) \
12869 { .owner = ATOMIC_LONG_INIT(0) \
12870 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
12871 @@ -228,4 +232,6 @@
12872 return mutex_trylock(lock);
12875 +#endif /* !PREEMPT_RT_FULL */
12877 #endif /* __LINUX_MUTEX_H */
12878 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/mutex_rt.h linux-4.14/include/linux/mutex_rt.h
12879 --- linux-4.14.orig/include/linux/mutex_rt.h 1970-01-01 01:00:00.000000000 +0100
12880 +++ linux-4.14/include/linux/mutex_rt.h 2018-09-05 11:05:07.000000000 +0200
12882 +#ifndef __LINUX_MUTEX_RT_H
12883 +#define __LINUX_MUTEX_RT_H
12885 +#ifndef __LINUX_MUTEX_H
12886 +#error "Please include mutex.h"
12889 +#include <linux/rtmutex.h>
12891 +/* FIXME: Just for __lockfunc */
12892 +#include <linux/spinlock.h>
12895 + struct rt_mutex lock;
12896 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12897 + struct lockdep_map dep_map;
12901 +#define __MUTEX_INITIALIZER(mutexname) \
12903 + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
12904 + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
12907 +#define DEFINE_MUTEX(mutexname) \
12908 + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
12910 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
12911 +extern void __lockfunc _mutex_lock(struct mutex *lock);
12912 +extern void __lockfunc _mutex_lock_io(struct mutex *lock);
12913 +extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass);
12914 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
12915 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
12916 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
12917 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
12918 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
12919 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
12920 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
12921 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
12923 +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
12924 +#define mutex_lock(l) _mutex_lock(l)
12925 +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
12926 +#define mutex_lock_killable(l) _mutex_lock_killable(l)
12927 +#define mutex_trylock(l) _mutex_trylock(l)
12928 +#define mutex_unlock(l) _mutex_unlock(l)
12929 +#define mutex_lock_io(l) _mutex_lock_io(l);
12931 +#define __mutex_owner(l) ((l)->lock.owner)
12933 +#ifdef CONFIG_DEBUG_MUTEXES
12934 +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
12936 +static inline void mutex_destroy(struct mutex *lock) {}
12939 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12940 +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
12941 +# define mutex_lock_interruptible_nested(l, s) \
12942 + _mutex_lock_interruptible_nested(l, s)
12943 +# define mutex_lock_killable_nested(l, s) \
12944 + _mutex_lock_killable_nested(l, s)
12945 +# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s)
12947 +# define mutex_lock_nest_lock(lock, nest_lock) \
12949 + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
12950 + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
12954 +# define mutex_lock_nested(l, s) _mutex_lock(l)
12955 +# define mutex_lock_interruptible_nested(l, s) \
12956 + _mutex_lock_interruptible(l)
12957 +# define mutex_lock_killable_nested(l, s) \
12958 + _mutex_lock_killable(l)
12959 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
12960 +# define mutex_lock_io_nested(l, s) _mutex_lock_io(l)
12963 +# define mutex_init(mutex) \
12965 + static struct lock_class_key __key; \
12967 + rt_mutex_init(&(mutex)->lock); \
12968 + __mutex_do_init((mutex), #mutex, &__key); \
12971 +# define __mutex_init(mutex, name, key) \
12973 + rt_mutex_init(&(mutex)->lock); \
12974 + __mutex_do_init((mutex), name, key); \
12978 + * These values are chosen such that FAIL and SUCCESS match the
12979 + * values of the regular mutex_trylock().
12981 +enum mutex_trylock_recursive_enum {
12982 + MUTEX_TRYLOCK_FAILED = 0,
12983 + MUTEX_TRYLOCK_SUCCESS = 1,
12984 + MUTEX_TRYLOCK_RECURSIVE,
12987 + * mutex_trylock_recursive - trylock variant that allows recursive locking
12988 + * @lock: mutex to be locked
12990 + * This function should not be used, _ever_. It is purely for hysterical GEM
12991 + * raisins, and once those are gone this will be removed.
12994 + * MUTEX_TRYLOCK_FAILED - trylock failed,
12995 + * MUTEX_TRYLOCK_SUCCESS - lock acquired,
12996 + * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
12998 +int __rt_mutex_owner_current(struct rt_mutex *lock);
13000 +static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
13001 +mutex_trylock_recursive(struct mutex *lock)
13003 + if (unlikely(__rt_mutex_owner_current(&lock->lock)))
13004 + return MUTEX_TRYLOCK_RECURSIVE;
13006 + return mutex_trylock(lock);
13009 +extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
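mutex_rt.h is new as well: when PREEMPT_RT_FULL is set, mutex.h includes it and the whole struct mutex API is mapped onto rt_mutex while keeping the familiar names, so existing users compile unchanged and inherit rt_mutex priority inheritance. A sketch of such a user, names hypothetical:

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);

static void example_serialized_work(void)
{
        mutex_lock(&example_mutex);     /* _mutex_lock() -> rt_mutex on RT */
        /* ... work serialized against other users of example_mutex ... */
        mutex_unlock(&example_mutex);
}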
13012 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/netdevice.h linux-4.14/include/linux/netdevice.h
13013 --- linux-4.14.orig/include/linux/netdevice.h 2018-09-05 11:03:22.000000000 +0200
13014 +++ linux-4.14/include/linux/netdevice.h 2018-09-05 11:05:07.000000000 +0200
13015 @@ -409,7 +409,19 @@
13016 typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
13018 void __napi_schedule(struct napi_struct *n);
13021 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
13022 + * run as threads and may themselves be preempted (without PREEMPT_RT,
13023 + * interrupt threads cannot be preempted). This means that a call to
13024 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
13025 + * and can corrupt the napi->poll_list.
13027 +#ifdef CONFIG_PREEMPT_RT_FULL
13028 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
13030 void __napi_schedule_irqoff(struct napi_struct *n);
13033 static inline bool napi_disable_pending(struct napi_struct *n)
13035 @@ -571,7 +583,11 @@
13036 * write-mostly part
13038 spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
13039 +#ifdef CONFIG_PREEMPT_RT_FULL
13040 + struct task_struct *xmit_lock_owner;
13042 int xmit_lock_owner;
13045 * Time (in jiffies) of last Tx
13047 @@ -2433,14 +2449,53 @@
13048 void synchronize_net(void);
13049 int init_dummy_netdev(struct net_device *dev);
13051 -DECLARE_PER_CPU(int, xmit_recursion);
13052 #define XMIT_RECURSION_LIMIT 10
13053 +#ifdef CONFIG_PREEMPT_RT_FULL
13054 +static inline int dev_recursion_level(void)
13056 + return current->xmit_recursion;
13059 +static inline int xmit_rec_read(void)
13061 + return current->xmit_recursion;
13064 +static inline void xmit_rec_inc(void)
13066 + current->xmit_recursion++;
13069 +static inline void xmit_rec_dec(void)
13071 + current->xmit_recursion--;
13076 +DECLARE_PER_CPU(int, xmit_recursion);
13078 static inline int dev_recursion_level(void)
13080 return this_cpu_read(xmit_recursion);
13083 +static inline int xmit_rec_read(void)
13085 + return __this_cpu_read(xmit_recursion);
13088 +static inline void xmit_rec_inc(void)
13090 + __this_cpu_inc(xmit_recursion);
13093 +static inline void xmit_rec_dec(void)
13095 + __this_cpu_dec(xmit_recursion);
13099 struct net_device *dev_get_by_index(struct net *net, int ifindex);
13100 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13101 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
13102 @@ -2792,6 +2847,7 @@
13103 unsigned int dropped;
13104 struct sk_buff_head input_pkt_queue;
13105 struct napi_struct backlog;
13106 + struct sk_buff_head tofree_queue;
13110 @@ -3515,10 +3571,48 @@
13111 return (1 << debug_value) - 1;
13114 +#ifdef CONFIG_PREEMPT_RT_FULL
13115 +static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13117 + txq->xmit_lock_owner = current;
13120 +static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13122 + txq->xmit_lock_owner = NULL;
13125 +static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13127 + if (txq->xmit_lock_owner != NULL)
13134 +static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13136 + txq->xmit_lock_owner = cpu;
13139 +static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13141 + txq->xmit_lock_owner = -1;
13144 +static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13146 + if (txq->xmit_lock_owner != -1)
13152 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
13154 spin_lock(&txq->_xmit_lock);
13155 - txq->xmit_lock_owner = cpu;
13156 + netdev_queue_set_owner(txq, cpu);
13159 static inline bool __netif_tx_acquire(struct netdev_queue *txq)
13160 @@ -3535,32 +3629,32 @@
13161 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
13163 spin_lock_bh(&txq->_xmit_lock);
13164 - txq->xmit_lock_owner = smp_processor_id();
13165 + netdev_queue_set_owner(txq, smp_processor_id());
13168 static inline bool __netif_tx_trylock(struct netdev_queue *txq)
13170 bool ok = spin_trylock(&txq->_xmit_lock);
13172 - txq->xmit_lock_owner = smp_processor_id();
13173 + netdev_queue_set_owner(txq, smp_processor_id());
13177 static inline void __netif_tx_unlock(struct netdev_queue *txq)
13179 - txq->xmit_lock_owner = -1;
13180 + netdev_queue_clear_owner(txq);
13181 spin_unlock(&txq->_xmit_lock);
13184 static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
13186 - txq->xmit_lock_owner = -1;
13187 + netdev_queue_clear_owner(txq);
13188 spin_unlock_bh(&txq->_xmit_lock);
13191 static inline void txq_trans_update(struct netdev_queue *txq)
13193 - if (txq->xmit_lock_owner != -1)
13194 + if (netdev_queue_has_owner(txq))
13195 txq->trans_start = jiffies;
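
The xmit_rec_*() helpers above exist because on PREEMPT_RT_FULL the sending task can be preempted and migrated, so a per-CPU recursion counter is no longer meaningful; the count moves into task_struct instead (see the sched.h hunk later in this patch). A hedged sketch of the pattern these helpers support; my_xmit() and my_xmit_one() are illustrative stand-ins, not the real dev_queue_xmit() path.

#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int my_xmit_one(struct sk_buff *skb)     /* illustrative stand-in */
{
        return 0;
}

static int my_xmit(struct sk_buff *skb)
{
        int rc;

        if (xmit_rec_read() > XMIT_RECURSION_LIMIT)
                return -ELOOP;                  /* refuse to recurse further */

        xmit_rec_inc();
        rc = my_xmit_one(skb);
        xmit_rec_dec();

        return rc;
}
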
13198 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/netfilter/x_tables.h linux-4.14/include/linux/netfilter/x_tables.h
13199 --- linux-4.14.orig/include/linux/netfilter/x_tables.h 2018-09-05 11:03:22.000000000 +0200
13200 +++ linux-4.14/include/linux/netfilter/x_tables.h 2018-09-05 11:05:07.000000000 +0200
13202 #include <linux/netdevice.h>
13203 #include <linux/static_key.h>
13204 #include <linux/netfilter.h>
13205 +#include <linux/locallock.h>
13206 #include <uapi/linux/netfilter/x_tables.h>
13208 /* Test a struct->invflags and a boolean for inequality */
13209 @@ -341,6 +342,8 @@
13211 DECLARE_PER_CPU(seqcount_t, xt_recseq);
13213 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13215 /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13217 * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13218 @@ -361,6 +364,9 @@
13220 unsigned int addend;
13222 + /* RT protection */
13223 + local_lock(xt_write_lock);
13226 * Low order bit of sequence is set if we already
13227 * called xt_write_recseq_begin().
13228 @@ -391,6 +397,7 @@
13229 /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13231 __this_cpu_add(xt_recseq.sequence, addend);
13232 + local_unlock(xt_write_lock);
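
local_lock()/local_unlock() come from the RT locallock.h introduced elsewhere in this patch: on RT the lock is a per-CPU sleeping lock, on !RT it falls back to disabling preemption. A brief sketch of the usual pairing with per-CPU data; my_stats_lock, my_stats and my_stats_inc() are illustrative names.

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(my_stats_lock);    /* illustrative lock */
static DEFINE_PER_CPU(unsigned long, my_stats); /* illustrative counter */

static void my_stats_inc(void)
{
        /* Serializes this CPU's counter; sleeps on RT, disables
         * preemption on !RT. */
        local_lock(my_stats_lock);
        __this_cpu_inc(my_stats);
        local_unlock(my_stats_lock);
}
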
13236 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/nfs_fs.h linux-4.14/include/linux/nfs_fs.h
13237 --- linux-4.14.orig/include/linux/nfs_fs.h 2017-11-12 19:46:13.000000000 +0100
13238 +++ linux-4.14/include/linux/nfs_fs.h 2018-09-05 11:05:07.000000000 +0200
13239 @@ -162,7 +162,11 @@
13241 /* Readers: in-flight sillydelete RPC calls */
13242 /* Writers: rmdir */
13243 +#ifdef CONFIG_PREEMPT_RT_BASE
13244 + struct semaphore rmdir_sem;
13246 struct rw_semaphore rmdir_sem;
13248 struct mutex commit_mutex;
13250 #if IS_ENABLED(CONFIG_NFS_V4)
13251 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/nfs_xdr.h linux-4.14/include/linux/nfs_xdr.h
13252 --- linux-4.14.orig/include/linux/nfs_xdr.h 2017-11-12 19:46:13.000000000 +0100
13253 +++ linux-4.14/include/linux/nfs_xdr.h 2018-09-05 11:05:07.000000000 +0200
13254 @@ -1530,7 +1530,7 @@
13255 struct nfs_removeargs args;
13256 struct nfs_removeres res;
13257 struct dentry *dentry;
13258 - wait_queue_head_t wq;
13259 + struct swait_queue_head wq;
13260 struct rpc_cred *cred;
13261 struct nfs_fattr dir_attr;
13263 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/notifier.h linux-4.14/include/linux/notifier.h
13264 --- linux-4.14.orig/include/linux/notifier.h 2017-11-12 19:46:13.000000000 +0100
13265 +++ linux-4.14/include/linux/notifier.h 2018-09-05 11:05:07.000000000 +0200
13268 * Alan Cox <Alan.Cox@linux.org>
13272 #ifndef _LINUX_NOTIFIER_H
13273 #define _LINUX_NOTIFIER_H
13274 #include <linux/errno.h>
13276 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13277 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13278 * SRCU notifier chains should be used when the chain will be called very
13279 - * often but notifier_blocks will seldom be removed. Also, SRCU notifier
13280 - * chains are slightly more difficult to use because they require special
13281 - * runtime initialization.
13282 + * often but notifier_blocks will seldom be removed.
13285 struct notifier_block;
13287 (name)->head = NULL; \
13290 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13291 +/* srcu_notifier_heads must be cleaned up dynamically */
13292 extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13293 #define srcu_cleanup_notifier_head(name) \
13294 cleanup_srcu_struct(&(name)->srcu);
13295 @@ -104,7 +102,13 @@
13297 #define RAW_NOTIFIER_INIT(name) { \
13299 -/* srcu_notifier_heads cannot be initialized statically */
13301 +#define SRCU_NOTIFIER_INIT(name, pcpu) \
13303 + .mutex = __MUTEX_INITIALIZER(name.mutex), \
13305 + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \
13308 #define ATOMIC_NOTIFIER_HEAD(name) \
13309 struct atomic_notifier_head name = \
13310 @@ -116,6 +120,26 @@
13311 struct raw_notifier_head name = \
13312 RAW_NOTIFIER_INIT(name)
13314 +#ifdef CONFIG_TREE_SRCU
13315 +#define _SRCU_NOTIFIER_HEAD(name, mod) \
13316 + static DEFINE_PER_CPU(struct srcu_data, \
13317 + name##_head_srcu_data); \
13318 + mod struct srcu_notifier_head name = \
13319 + SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)
13322 +#define _SRCU_NOTIFIER_HEAD(name, mod) \
13323 + mod struct srcu_notifier_head name = \
13324 + SRCU_NOTIFIER_INIT(name, name)
13328 +#define SRCU_NOTIFIER_HEAD(name) \
13329 + _SRCU_NOTIFIER_HEAD(name, )
13331 +#define SRCU_NOTIFIER_HEAD_STATIC(name) \
13332 + _SRCU_NOTIFIER_HEAD(name, static)
13336 extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13337 @@ -185,12 +209,12 @@
13340 * Declared notifiers so far. I can imagine quite a few more chains
13341 - * over time (eg laptop power reset chains, reboot chain (to clean
13342 + * over time (eg laptop power reset chains, reboot chain (to clean
13343 * device units up), device [un]mount chain, module load/unload chain,
13344 - * low memory chain, screenblank chain (for plug in modular screenblankers)
13345 + * low memory chain, screenblank chain (for plug in modular screenblankers)
13346 * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13350 /* CPU notifiers are defined in include/linux/cpu.h. */
13352 /* netdevice notifiers are defined in include/linux/netdevice.h */
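
With the initializers above, an SRCU notifier head can now be instantiated at build time instead of requiring srcu_init_notifier_head() at runtime. A small sketch of registering and invoking such a chain; my_chain, my_nb, my_event_cb() and my_chain_init() are illustrative names.

#include <linux/init.h>
#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD_STATIC(my_chain);            /* illustrative chain */

static int my_event_cb(struct notifier_block *nb,
                       unsigned long action, void *data)
{
        return NOTIFY_OK;
}

static struct notifier_block my_nb = {
        .notifier_call = my_event_cb,
};

static int __init my_chain_init(void)
{
        srcu_notifier_chain_register(&my_chain, &my_nb);
        /* Callouts run under SRCU; this may block. */
        srcu_notifier_call_chain(&my_chain, 0, NULL);
        return 0;
}
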
13353 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/percpu.h linux-4.14/include/linux/percpu.h
13354 --- linux-4.14.orig/include/linux/percpu.h 2017-11-12 19:46:13.000000000 +0100
13355 +++ linux-4.14/include/linux/percpu.h 2018-09-05 11:05:07.000000000 +0200
13357 #define PERCPU_MODULE_RESERVE 0
13360 +#ifdef CONFIG_PREEMPT_RT_FULL
13362 +#define get_local_var(var) (*({ \
13363 + migrate_disable(); \
13364 + this_cpu_ptr(&var); }))
13366 +#define put_local_var(var) do { \
13368 + migrate_enable(); \
13371 +# define get_local_ptr(var) ({ \
13372 + migrate_disable(); \
13373 + this_cpu_ptr(var); })
13375 +# define put_local_ptr(var) do { \
13377 + migrate_enable(); \
13382 +#define get_local_var(var) get_cpu_var(var)
13383 +#define put_local_var(var) put_cpu_var(var)
13384 +#define get_local_ptr(var) get_cpu_ptr(var)
13385 +#define put_local_ptr(var) put_cpu_ptr(var)
13389 /* minimum unit size, also is the maximum supported allocation size */
13390 #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
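
get_local_var()/put_local_var() are the RT counterparts of get_cpu_var()/put_cpu_var(): on RT they only disable migration, so the task stays on its CPU but remains preemptible; on !RT they behave exactly like the get_cpu_var() pair. A sketch under that assumption; my_counter and my_counter_read() are illustrative names.

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_counter);         /* illustrative variable */

static int my_counter_read(void)
{
        int val;

        /* !RT: preemption disabled; RT: only migration disabled, so the
         * task can still be preempted here and real users need their own
         * serialization (e.g. a local lock) for read-modify-write. */
        val = get_local_var(my_counter);
        put_local_var(my_counter);

        return val;
}
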
13392 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/percpu-rwsem.h linux-4.14/include/linux/percpu-rwsem.h
13393 --- linux-4.14.orig/include/linux/percpu-rwsem.h 2018-09-05 11:03:22.000000000 +0200
13394 +++ linux-4.14/include/linux/percpu-rwsem.h 2018-09-05 11:05:07.000000000 +0200
13396 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
13397 extern void __percpu_up_read(struct percpu_rw_semaphore *);
13399 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
13400 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13404 @@ -47,16 +47,10 @@
13405 __this_cpu_inc(*sem->read_count);
13406 if (unlikely(!rcu_sync_is_idle(&sem->rss)))
13407 __percpu_down_read(sem, false); /* Unconditional memory barrier */
13410 - * The barrier() prevents the compiler from
13411 + * The preempt_enable() prevents the compiler from
13412 * bleeding the critical section out.
13416 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13418 - percpu_down_read_preempt_disable(sem);
13426 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
13427 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13430 - * The barrier() prevents the compiler from
13431 - * bleeding the critical section out.
13434 + preempt_disable();
13436 * Same as in percpu_down_read().
13438 @@ -102,12 +92,6 @@
13439 rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
13442 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13444 - preempt_disable();
13445 - percpu_up_read_preempt_enable(sem);
13448 extern void percpu_down_write(struct percpu_rw_semaphore *);
13449 extern void percpu_up_write(struct percpu_rw_semaphore *);
13451 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/pid.h linux-4.14/include/linux/pid.h
13452 --- linux-4.14.orig/include/linux/pid.h 2017-11-12 19:46:13.000000000 +0100
13453 +++ linux-4.14/include/linux/pid.h 2018-09-05 11:05:07.000000000 +0200
13455 #define _LINUX_PID_H
13457 #include <linux/rculist.h>
13458 +#include <linux/atomic.h>
13462 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/posix-timers.h linux-4.14/include/linux/posix-timers.h
13463 --- linux-4.14.orig/include/linux/posix-timers.h 2017-11-12 19:46:13.000000000 +0100
13464 +++ linux-4.14/include/linux/posix-timers.h 2018-09-05 11:05:07.000000000 +0200
13465 @@ -101,8 +101,8 @@
13467 struct alarm alarmtimer;
13469 - struct rcu_head rcu;
13471 + struct rcu_head rcu;
13474 void run_posix_cpu_timers(struct task_struct *task);
13475 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/preempt.h linux-4.14/include/linux/preempt.h
13476 --- linux-4.14.orig/include/linux/preempt.h 2017-11-12 19:46:13.000000000 +0100
13477 +++ linux-4.14/include/linux/preempt.h 2018-09-05 11:05:07.000000000 +0200
13479 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13480 #define NMI_OFFSET (1UL << NMI_SHIFT)
13482 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13483 +#ifndef CONFIG_PREEMPT_RT_FULL
13484 +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13486 +# define SOFTIRQ_DISABLE_OFFSET (0)
13489 /* We use the MSB mostly because its available */
13490 #define PREEMPT_NEED_RESCHED 0x80000000
13492 #include <asm/preempt.h>
13494 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
13495 -#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13496 #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13498 +#ifndef CONFIG_PREEMPT_RT_FULL
13499 +# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13500 +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
13502 +# define softirq_count() (0UL)
13503 +extern int in_serving_softirq(void);
13507 * Are we doing bottom half or hardware interrupt processing?
13508 @@ -101,7 +111,6 @@
13509 #define in_irq() (hardirq_count())
13510 #define in_softirq() (softirq_count())
13511 #define in_interrupt() (irq_count())
13512 -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
13513 #define in_nmi() (preempt_count() & NMI_MASK)
13514 #define in_task() (!(preempt_count() & \
13515 (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
13516 @@ -118,7 +127,11 @@
13518 * The preempt_count offset after spin_lock()
13520 +#if !defined(CONFIG_PREEMPT_RT_FULL)
13521 #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
13523 +#define PREEMPT_LOCK_OFFSET 0
13527 * The preempt_count offset needed for things like:
13528 @@ -167,6 +180,20 @@
13529 #define preempt_count_inc() preempt_count_add(1)
13530 #define preempt_count_dec() preempt_count_sub(1)
13532 +#ifdef CONFIG_PREEMPT_LAZY
13533 +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
13534 +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
13535 +#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
13536 +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
13537 +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
13539 +#define add_preempt_lazy_count(val) do { } while (0)
13540 +#define sub_preempt_lazy_count(val) do { } while (0)
13541 +#define inc_preempt_lazy_count() do { } while (0)
13542 +#define dec_preempt_lazy_count() do { } while (0)
13543 +#define preempt_lazy_count() (0)
13546 #ifdef CONFIG_PREEMPT_COUNT
13548 #define preempt_disable() \
13549 @@ -175,16 +202,53 @@
13553 +#define preempt_lazy_disable() \
13555 + inc_preempt_lazy_count(); \
13559 #define sched_preempt_enable_no_resched() \
13562 preempt_count_dec(); \
13565 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13566 +#ifdef CONFIG_PREEMPT_RT_BASE
13567 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13568 +# define preempt_check_resched_rt() preempt_check_resched()
13570 +# define preempt_enable_no_resched() preempt_enable()
13571 +# define preempt_check_resched_rt() barrier();
13574 #define preemptible() (preempt_count() == 0 && !irqs_disabled())
13578 +extern void migrate_disable(void);
13579 +extern void migrate_enable(void);
13581 +int __migrate_disabled(struct task_struct *p);
13583 +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
13585 +extern void migrate_disable(void);
13586 +extern void migrate_enable(void);
13587 +static inline int __migrate_disabled(struct task_struct *p)
13593 +#define migrate_disable() barrier()
13594 +#define migrate_enable() barrier()
13595 +static inline int __migrate_disabled(struct task_struct *p)
13601 #ifdef CONFIG_PREEMPT
13602 #define preempt_enable() \
13604 @@ -206,6 +270,13 @@
13605 __preempt_schedule(); \
13608 +#define preempt_lazy_enable() \
13610 + dec_preempt_lazy_count(); \
13612 + preempt_check_resched(); \
13615 #else /* !CONFIG_PREEMPT */
13616 #define preempt_enable() \
13618 @@ -213,6 +284,12 @@
13619 preempt_count_dec(); \
13622 +#define preempt_lazy_enable() \
13624 + dec_preempt_lazy_count(); \
13628 #define preempt_enable_notrace() \
13631 @@ -251,8 +328,16 @@
13632 #define preempt_disable_notrace() barrier()
13633 #define preempt_enable_no_resched_notrace() barrier()
13634 #define preempt_enable_notrace() barrier()
13635 +#define preempt_check_resched_rt() barrier()
13636 #define preemptible() 0
13638 +#define migrate_disable() barrier()
13639 +#define migrate_enable() barrier()
13641 +static inline int __migrate_disabled(struct task_struct *p)
13645 #endif /* CONFIG_PREEMPT_COUNT */
13648 @@ -271,10 +356,22 @@
13650 #define preempt_fold_need_resched() \
13652 - if (tif_need_resched()) \
13653 + if (tif_need_resched_now()) \
13654 set_preempt_need_resched(); \
13657 +#ifdef CONFIG_PREEMPT_RT_FULL
13658 +# define preempt_disable_rt() preempt_disable()
13659 +# define preempt_enable_rt() preempt_enable()
13660 +# define preempt_disable_nort() barrier()
13661 +# define preempt_enable_nort() barrier()
13663 +# define preempt_disable_rt() barrier()
13664 +# define preempt_enable_rt() barrier()
13665 +# define preempt_disable_nort() preempt_disable()
13666 +# define preempt_enable_nort() preempt_enable()
13669 #ifdef CONFIG_PREEMPT_NOTIFIERS
13671 struct preempt_notifier;
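
preempt_disable_nort()/preempt_enable_nort() and the _rt() pair annotate sections that need preemption disabled on only one of the two configurations; typically the _nort() form marks code where RT protects the data with a sleeping (local) lock instead. A minimal, hedged sketch of the annotation; the function name is illustrative.

#include <linux/preempt.h>

static void my_percpu_section(void)
{
        /*
         * !RT: preemption is disabled around the per-CPU access.
         * RT: this is a no-op; the data is assumed to be protected by a
         * sleeping lock (e.g. a local lock) taken by the caller.
         */
        preempt_disable_nort();
        /* ... touch per-CPU state ... */
        preempt_enable_nort();
}
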
13672 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/printk.h linux-4.14/include/linux/printk.h
13673 --- linux-4.14.orig/include/linux/printk.h 2017-11-12 19:46:13.000000000 +0100
13674 +++ linux-4.14/include/linux/printk.h 2018-09-05 11:05:07.000000000 +0200
13675 @@ -142,9 +142,11 @@
13676 #ifdef CONFIG_EARLY_PRINTK
13677 extern asmlinkage __printf(1, 2)
13678 void early_printk(const char *fmt, ...);
13679 +extern void printk_kill(void);
13681 static inline __printf(1, 2) __cold
13682 void early_printk(const char *s, ...) { }
13683 +static inline void printk_kill(void) { }
13686 #ifdef CONFIG_PRINTK_NMI
13687 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/radix-tree.h linux-4.14/include/linux/radix-tree.h
13688 --- linux-4.14.orig/include/linux/radix-tree.h 2017-11-12 19:46:13.000000000 +0100
13689 +++ linux-4.14/include/linux/radix-tree.h 2018-09-05 11:05:07.000000000 +0200
13690 @@ -328,6 +328,8 @@
13691 int radix_tree_preload(gfp_t gfp_mask);
13692 int radix_tree_maybe_preload(gfp_t gfp_mask);
13693 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
13694 +void radix_tree_preload_end(void);
13696 void radix_tree_init(void);
13697 void *radix_tree_tag_set(struct radix_tree_root *,
13698 unsigned long index, unsigned int tag);
13699 @@ -347,11 +349,6 @@
13700 unsigned int max_items, unsigned int tag);
13701 int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
13703 -static inline void radix_tree_preload_end(void)
13705 - preempt_enable();
13708 int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
13709 int radix_tree_split(struct radix_tree_root *, unsigned long index,
13710 unsigned new_order);
13711 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/random.h linux-4.14/include/linux/random.h
13712 --- linux-4.14.orig/include/linux/random.h 2017-11-12 19:46:13.000000000 +0100
13713 +++ linux-4.14/include/linux/random.h 2018-09-05 11:05:07.000000000 +0200
13716 extern void add_input_randomness(unsigned int type, unsigned int code,
13717 unsigned int value) __latent_entropy;
13718 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
13719 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
13721 extern void get_random_bytes(void *buf, int nbytes);
13722 extern int wait_for_random_bytes(void);
13723 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rbtree_augmented.h linux-4.14/include/linux/rbtree_augmented.h
13724 --- linux-4.14.orig/include/linux/rbtree_augmented.h 2017-11-12 19:46:13.000000000 +0100
13725 +++ linux-4.14/include/linux/rbtree_augmented.h 2018-09-05 11:05:07.000000000 +0200
13728 #include <linux/compiler.h>
13729 #include <linux/rbtree.h>
13730 +#include <linux/rcupdate.h>
13733 * Please note - only struct rb_augment_callbacks and the prototypes for
13734 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rbtree.h linux-4.14/include/linux/rbtree.h
13735 --- linux-4.14.orig/include/linux/rbtree.h 2017-11-12 19:46:13.000000000 +0100
13736 +++ linux-4.14/include/linux/rbtree.h 2018-09-05 11:05:07.000000000 +0200
13739 #include <linux/kernel.h>
13740 #include <linux/stddef.h>
13741 -#include <linux/rcupdate.h>
13742 +#include <linux/rcu_assign_pointer.h>
13745 unsigned long __rb_parent_color;
13746 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rbtree_latch.h linux-4.14/include/linux/rbtree_latch.h
13747 --- linux-4.14.orig/include/linux/rbtree_latch.h 2017-11-12 19:46:13.000000000 +0100
13748 +++ linux-4.14/include/linux/rbtree_latch.h 2018-09-05 11:05:07.000000000 +0200
13751 #include <linux/rbtree.h>
13752 #include <linux/seqlock.h>
13753 +#include <linux/rcupdate.h>
13755 struct latch_tree_node {
13756 struct rb_node node[2];
13757 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rcu_assign_pointer.h linux-4.14/include/linux/rcu_assign_pointer.h
13758 --- linux-4.14.orig/include/linux/rcu_assign_pointer.h 1970-01-01 01:00:00.000000000 +0100
13759 +++ linux-4.14/include/linux/rcu_assign_pointer.h 2018-09-05 11:05:07.000000000 +0200
13761 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
13762 +#define __LINUX_RCU_ASSIGN_POINTER_H__
13763 +#include <linux/compiler.h>
13764 +#include <asm/barrier.h>
13767 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
13768 + * @v: The value to statically initialize with.
13770 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
13773 + * rcu_assign_pointer() - assign to RCU-protected pointer
13774 + * @p: pointer to assign to
13775 + * @v: value to assign (publish)
13777 + * Assigns the specified value to the specified RCU-protected
13778 + * pointer, ensuring that any concurrent RCU readers will see
13779 + * any prior initialization.
13781 + * Inserts memory barriers on architectures that require them
13782 + * (which is most of them), and also prevents the compiler from
13783 + * reordering the code that initializes the structure after the pointer
13784 + * assignment. More importantly, this call documents which pointers
13785 + * will be dereferenced by RCU read-side code.
13787 + * In some special cases, you may use RCU_INIT_POINTER() instead
13788 + * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
13789 + * to the fact that it does not constrain either the CPU or the compiler.
13790 + * That said, using RCU_INIT_POINTER() when you should have used
13791 + * rcu_assign_pointer() is a very bad thing that results in
13792 + * impossible-to-diagnose memory corruption. So please be careful.
13793 + * See the RCU_INIT_POINTER() comment header for details.
13795 + * Note that rcu_assign_pointer() evaluates each of its arguments only
13796 + * once, appearances notwithstanding. One of the "extra" evaluations
13797 + * is in typeof() and the other visible only to sparse (__CHECKER__),
13798 + * neither of which actually execute the argument. As with most cpp
13799 + * macros, this execute-arguments-only-once property is important, so
13800 + * please be careful when making changes to rcu_assign_pointer() and the
13801 + * other macros that it invokes.
13803 +#define rcu_assign_pointer(p, v) \
13805 + uintptr_t _r_a_p__v = (uintptr_t)(v); \
13807 + if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
13808 + WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
13810 + smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
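
Moving the macro into its own header does not change how it is used; the classic publish/subscribe pairing with rcu_dereference() still applies. A short, self-contained sketch; struct my_foo, my_gp and the two helpers are illustrative names only.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_foo {
        int a;
};

static struct my_foo __rcu *my_gp;              /* illustrative global */

static int my_publish(int a)
{
        struct my_foo *p = kzalloc(sizeof(*p), GFP_KERNEL);

        if (!p)
                return -ENOMEM;
        p->a = a;
        /* Orders the initialization above before the pointer becomes
         * visible to readers (replacing an old pointer would also need
         * call_rcu()/kfree_rcu(), omitted here). */
        rcu_assign_pointer(my_gp, p);
        return 0;
}

static int my_read(void)
{
        struct my_foo *p;
        int a = -1;

        rcu_read_lock();
        p = rcu_dereference(my_gp);
        if (p)
                a = p->a;
        rcu_read_unlock();
        return a;
}
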
13815 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rcupdate.h linux-4.14/include/linux/rcupdate.h
13816 --- linux-4.14.orig/include/linux/rcupdate.h 2018-09-05 11:03:22.000000000 +0200
13817 +++ linux-4.14/include/linux/rcupdate.h 2018-09-05 11:05:07.000000000 +0200
13819 #include <linux/lockdep.h>
13820 #include <asm/processor.h>
13821 #include <linux/cpumask.h>
13822 +#include <linux/rcu_assign_pointer.h>
13824 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
13825 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
13827 #define call_rcu call_rcu_sched
13828 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13830 +#ifdef CONFIG_PREEMPT_RT_FULL
13831 +#define call_rcu_bh call_rcu
13833 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
13835 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
13836 void synchronize_sched(void);
13837 void rcu_barrier_tasks(void);
13839 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
13841 #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
13842 +#ifndef CONFIG_PREEMPT_RT_FULL
13843 +#define sched_rcu_preempt_depth() rcu_preempt_depth()
13845 +static inline int sched_rcu_preempt_depth(void) { return 0; }
13848 #else /* #ifdef CONFIG_PREEMPT_RCU */
13854 +#define sched_rcu_preempt_depth() rcu_preempt_depth()
13856 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13858 /* Internal to kernel */
13859 @@ -255,7 +267,14 @@
13860 extern struct lockdep_map rcu_callback_map;
13861 int debug_lockdep_rcu_enabled(void);
13862 int rcu_read_lock_held(void);
13863 +#ifdef CONFIG_PREEMPT_RT_FULL
13864 +static inline int rcu_read_lock_bh_held(void)
13866 + return rcu_read_lock_held();
13869 int rcu_read_lock_bh_held(void);
13871 int rcu_read_lock_sched_held(void);
13873 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
13874 @@ -365,54 +384,6 @@
13878 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
13879 - * @v: The value to statically initialize with.
13881 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
13884 - * rcu_assign_pointer() - assign to RCU-protected pointer
13885 - * @p: pointer to assign to
13886 - * @v: value to assign (publish)
13888 - * Assigns the specified value to the specified RCU-protected
13889 - * pointer, ensuring that any concurrent RCU readers will see
13890 - * any prior initialization.
13892 - * Inserts memory barriers on architectures that require them
13893 - * (which is most of them), and also prevents the compiler from
13894 - * reordering the code that initializes the structure after the pointer
13895 - * assignment. More importantly, this call documents which pointers
13896 - * will be dereferenced by RCU read-side code.
13898 - * In some special cases, you may use RCU_INIT_POINTER() instead
13899 - * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
13900 - * to the fact that it does not constrain either the CPU or the compiler.
13901 - * That said, using RCU_INIT_POINTER() when you should have used
13902 - * rcu_assign_pointer() is a very bad thing that results in
13903 - * impossible-to-diagnose memory corruption. So please be careful.
13904 - * See the RCU_INIT_POINTER() comment header for details.
13906 - * Note that rcu_assign_pointer() evaluates each of its arguments only
13907 - * once, appearances notwithstanding. One of the "extra" evaluations
13908 - * is in typeof() and the other visible only to sparse (__CHECKER__),
13909 - * neither of which actually execute the argument. As with most cpp
13910 - * macros, this execute-arguments-only-once property is important, so
13911 - * please be careful when making changes to rcu_assign_pointer() and the
13912 - * other macros that it invokes.
13914 -#define rcu_assign_pointer(p, v) \
13916 - uintptr_t _r_a_p__v = (uintptr_t)(v); \
13918 - if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
13919 - WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
13921 - smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
13926 * rcu_swap_protected() - swap an RCU and a regular pointer
13927 * @rcu_ptr: RCU pointer
13928 * @ptr: regular pointer
13929 @@ -707,10 +678,14 @@
13930 static inline void rcu_read_lock_bh(void)
13932 local_bh_disable();
13933 +#ifdef CONFIG_PREEMPT_RT_FULL
13937 rcu_lock_acquire(&rcu_bh_lock_map);
13938 RCU_LOCKDEP_WARN(!rcu_is_watching(),
13939 "rcu_read_lock_bh() used illegally while idle");
13944 @@ -720,10 +695,14 @@
13946 static inline void rcu_read_unlock_bh(void)
13948 +#ifdef CONFIG_PREEMPT_RT_FULL
13949 + rcu_read_unlock();
13951 RCU_LOCKDEP_WARN(!rcu_is_watching(),
13952 "rcu_read_unlock_bh() used illegally while idle");
13953 rcu_lock_release(&rcu_bh_lock_map);
13959 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rcutree.h linux-4.14/include/linux/rcutree.h
13960 --- linux-4.14.orig/include/linux/rcutree.h 2017-11-12 19:46:13.000000000 +0100
13961 +++ linux-4.14/include/linux/rcutree.h 2018-09-05 11:05:07.000000000 +0200
13963 rcu_note_context_switch(false);
13966 +#ifdef CONFIG_PREEMPT_RT_FULL
13967 +# define synchronize_rcu_bh synchronize_rcu
13969 void synchronize_rcu_bh(void);
13971 void synchronize_sched_expedited(void);
13972 void synchronize_rcu_expedited(void);
13977 void rcu_barrier(void);
13978 +#ifdef CONFIG_PREEMPT_RT_FULL
13979 +# define rcu_barrier_bh rcu_barrier
13981 void rcu_barrier_bh(void);
13983 void rcu_barrier_sched(void);
13984 unsigned long get_state_synchronize_rcu(void);
13985 void cond_synchronize_rcu(unsigned long oldstate);
13986 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/ring_buffer.h linux-4.14/include/linux/ring_buffer.h
13987 --- linux-4.14.orig/include/linux/ring_buffer.h 2018-09-05 11:03:22.000000000 +0200
13988 +++ linux-4.14/include/linux/ring_buffer.h 2018-09-05 11:05:07.000000000 +0200
13989 @@ -34,10 +34,12 @@
13990 * array[0] = time delta (28 .. 59)
13993 - * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock
13994 - * array[0] = tv_nsec
13995 - * array[1..2] = tv_sec
13996 - * size = 16 bytes
13997 + * @RINGBUF_TYPE_TIME_STAMP: Absolute timestamp
13998 + * Same format as TIME_EXTEND except that the
13999 + * value is an absolute timestamp, not a delta
14000 + * event.time_delta contains bottom 27 bits
14001 + * array[0] = top (28 .. 59) bits
14004 * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
14006 @@ -54,12 +56,12 @@
14007 RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
14008 RINGBUF_TYPE_PADDING,
14009 RINGBUF_TYPE_TIME_EXTEND,
14010 - /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
14011 RINGBUF_TYPE_TIME_STAMP,
14014 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
14015 void *ring_buffer_event_data(struct ring_buffer_event *event);
14016 +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event);
14019 * ring_buffer_discard_commit will remove an event that has not
14020 @@ -115,6 +117,9 @@
14021 int ring_buffer_write(struct ring_buffer *buffer,
14022 unsigned long length, void *data);
14024 +void ring_buffer_nest_start(struct ring_buffer *buffer);
14025 +void ring_buffer_nest_end(struct ring_buffer *buffer);
14027 struct ring_buffer_event *
14028 ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
14029 unsigned long *lost_events);
14030 @@ -179,6 +184,8 @@
14032 void ring_buffer_set_clock(struct ring_buffer *buffer,
14033 u64 (*clock)(void));
14034 +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs);
14035 +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer);
14037 size_t ring_buffer_page_len(void *page);
14039 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rtmutex.h linux-4.14/include/linux/rtmutex.h
14040 --- linux-4.14.orig/include/linux/rtmutex.h 2017-11-12 19:46:13.000000000 +0100
14041 +++ linux-4.14/include/linux/rtmutex.h 2018-09-05 11:05:07.000000000 +0200
14042 @@ -14,11 +14,15 @@
14043 #define __LINUX_RT_MUTEX_H
14045 #include <linux/linkage.h>
14046 +#include <linux/spinlock_types_raw.h>
14047 #include <linux/rbtree.h>
14048 -#include <linux/spinlock_types.h>
14050 extern int max_lock_depth; /* for sysctl */
14052 +#ifdef CONFIG_DEBUG_MUTEXES
14053 +#include <linux/debug_locks.h>
14057 * The rt_mutex structure
14060 raw_spinlock_t wait_lock;
14061 struct rb_root_cached waiters;
14062 struct task_struct *owner;
14063 -#ifdef CONFIG_DEBUG_RT_MUTEXES
14065 +#ifdef CONFIG_DEBUG_RT_MUTEXES
14066 const char *name, *file;
14069 @@ -82,16 +86,23 @@
14070 #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14073 -#define __RT_MUTEX_INITIALIZER(mutexname) \
14074 - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14075 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14076 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14077 , .waiters = RB_ROOT_CACHED \
14079 __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
14080 - __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
14081 + __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14083 +#define __RT_MUTEX_INITIALIZER(mutexname) \
14084 + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
14086 #define DEFINE_RT_MUTEX(mutexname) \
14087 struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
14089 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
14090 + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14091 + , .save_state = 1 }
14094 * rt_mutex_is_locked - is the mutex locked
14095 * @lock: the mutex to be queried
14096 @@ -108,6 +119,7 @@
14098 extern void rt_mutex_lock(struct rt_mutex *lock);
14099 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
14100 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
14101 extern int rt_mutex_timed_lock(struct rt_mutex *lock,
14102 struct hrtimer_sleeper *timeout);
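
rt_mutex_lock_killable(), newly declared here, blocks like rt_mutex_lock() but lets a fatal signal abort the wait. A hedged sketch of the calling convention; my_rtm, my_op() and the empty critical section are illustrative.

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(my_rtm);                 /* illustrative lock */

static int my_op(void)
{
        int ret;

        ret = rt_mutex_lock_killable(&my_rtm);
        if (ret)
                return ret;                     /* fatal signal while blocked */

        /* ... critical section ... */

        rt_mutex_unlock(&my_rtm);
        return 0;
}
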
14104 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwlock_rt.h linux-4.14/include/linux/rwlock_rt.h
14105 --- linux-4.14.orig/include/linux/rwlock_rt.h 1970-01-01 01:00:00.000000000 +0100
14106 +++ linux-4.14/include/linux/rwlock_rt.h 2018-09-05 11:05:07.000000000 +0200
14108 +#ifndef __LINUX_RWLOCK_RT_H
14109 +#define __LINUX_RWLOCK_RT_H
14111 +#ifndef __LINUX_SPINLOCK_H
14112 +#error Do not include directly. Use spinlock.h
14115 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
14116 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
14117 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
14118 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
14119 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
14120 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
14121 +extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock);
14122 +extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock);
14123 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
14125 +#define read_can_lock(rwlock) rt_read_can_lock(rwlock)
14126 +#define write_can_lock(rwlock) rt_write_can_lock(rwlock)
14128 +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
14129 +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
14131 +static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags)
14133 + /* XXX ARCH_IRQ_ENABLED */
14135 + return rt_write_trylock(lock);
14138 +#define write_trylock_irqsave(lock, flags) \
14139 + __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags)))
14141 +#define read_lock_irqsave(lock, flags) \
14143 + typecheck(unsigned long, flags); \
14144 + rt_read_lock(lock); \
14148 +#define write_lock_irqsave(lock, flags) \
14150 + typecheck(unsigned long, flags); \
14151 + rt_write_lock(lock); \
14155 +#define read_lock(lock) rt_read_lock(lock)
14157 +#define read_lock_bh(lock) \
14159 + local_bh_disable(); \
14160 + rt_read_lock(lock); \
14163 +#define read_lock_irq(lock) read_lock(lock)
14165 +#define write_lock(lock) rt_write_lock(lock)
14167 +#define write_lock_bh(lock) \
14169 + local_bh_disable(); \
14170 + rt_write_lock(lock); \
14173 +#define write_lock_irq(lock) write_lock(lock)
14175 +#define read_unlock(lock) rt_read_unlock(lock)
14177 +#define read_unlock_bh(lock) \
14179 + rt_read_unlock(lock); \
14180 + local_bh_enable(); \
14183 +#define read_unlock_irq(lock) read_unlock(lock)
14185 +#define write_unlock(lock) rt_write_unlock(lock)
14187 +#define write_unlock_bh(lock) \
14189 + rt_write_unlock(lock); \
14190 + local_bh_enable(); \
14193 +#define write_unlock_irq(lock) write_unlock(lock)
14195 +#define read_unlock_irqrestore(lock, flags) \
14197 + typecheck(unsigned long, flags); \
14199 + rt_read_unlock(lock); \
14202 +#define write_unlock_irqrestore(lock, flags) \
14204 + typecheck(unsigned long, flags); \
14206 + rt_write_unlock(lock); \
14209 +#define rwlock_init(rwl) \
14211 + static struct lock_class_key __key; \
14213 + __rt_rwlock_init(rwl, #rwl, &__key); \
14217 + * Internal functions made global for CPU pinning
14219 +void __read_rt_lock(struct rt_rw_lock *lock);
14220 +int __read_rt_trylock(struct rt_rw_lock *lock);
14221 +void __write_rt_lock(struct rt_rw_lock *lock);
14222 +int __write_rt_trylock(struct rt_rw_lock *lock);
14223 +void __read_rt_unlock(struct rt_rw_lock *lock);
14224 +void __write_rt_unlock(struct rt_rw_lock *lock);
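
This header keeps the regular rwlock API; only the implementation behind rwlock_t changes to the reader-biased rt_rw_lock defined below. Existing callers therefore compile unchanged, as in this illustrative sketch (my_rwlock, my_value, my_get() and my_set() are made-up names).

#include <linux/spinlock.h>

static DEFINE_RWLOCK(my_rwlock);                /* illustrative lock */
static int my_value;

static int my_get(void)
{
        int v;

        read_lock(&my_rwlock);
        v = my_value;
        read_unlock(&my_rwlock);
        return v;
}

static void my_set(int v)
{
        write_lock(&my_rwlock);
        my_value = v;
        write_unlock(&my_rwlock);
}
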
14227 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwlock_types.h linux-4.14/include/linux/rwlock_types.h
14228 --- linux-4.14.orig/include/linux/rwlock_types.h 2017-11-12 19:46:13.000000000 +0100
14229 +++ linux-4.14/include/linux/rwlock_types.h 2018-09-05 11:05:07.000000000 +0200
14231 #ifndef __LINUX_RWLOCK_TYPES_H
14232 #define __LINUX_RWLOCK_TYPES_H
14234 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
14235 +# error "Do not include directly, include spinlock_types.h"
14239 * include/linux/rwlock_types.h - generic rwlock type definitions
14241 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwlock_types_rt.h linux-4.14/include/linux/rwlock_types_rt.h
14242 --- linux-4.14.orig/include/linux/rwlock_types_rt.h 1970-01-01 01:00:00.000000000 +0100
14243 +++ linux-4.14/include/linux/rwlock_types_rt.h 2018-09-05 11:05:07.000000000 +0200
14245 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
14246 +#define __LINUX_RWLOCK_TYPES_RT_H
14248 +#ifndef __LINUX_SPINLOCK_TYPES_H
14249 +#error "Do not include directly. Include spinlock_types.h instead"
14252 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14253 +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
14255 +# define RW_DEP_MAP_INIT(lockname)
14258 +typedef struct rt_rw_lock rwlock_t;
14260 +#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name)
14262 +#define DEFINE_RWLOCK(name) \
14263 + rwlock_t name = __RW_LOCK_UNLOCKED(name)
14266 + * A reader-biased implementation primarily for CPU pinning.
14268 + * Can be selected as a general replacement for the single-reader RT rwlock
14271 +struct rt_rw_lock {
14272 + struct rt_mutex rtmutex;
14273 + atomic_t readers;
14274 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14275 + struct lockdep_map dep_map;
14279 +#define READER_BIAS (1U << 31)
14280 +#define WRITER_BIAS (1U << 30)
14282 +#define __RWLOCK_RT_INITIALIZER(name) \
14284 + .readers = ATOMIC_INIT(READER_BIAS), \
14285 + .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex), \
14286 + RW_DEP_MAP_INIT(name) \
14289 +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
14290 + struct lock_class_key *key);
14292 +#define rwlock_biased_rt_init(rwlock) \
14294 + static struct lock_class_key __key; \
14296 + __rwlock_biased_rt_init((rwlock), #rwlock, &__key); \
14300 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwsem.h linux-4.14/include/linux/rwsem.h
14301 --- linux-4.14.orig/include/linux/rwsem.h 2018-09-05 11:03:22.000000000 +0200
14302 +++ linux-4.14/include/linux/rwsem.h 2018-09-05 11:05:07.000000000 +0200
14304 #include <linux/osq_lock.h>
14307 +#ifdef CONFIG_PREEMPT_RT_FULL
14308 +#include <linux/rwsem_rt.h>
14309 +#else /* PREEMPT_RT_FULL */
14311 struct rw_semaphore;
14313 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14314 @@ -114,6 +118,13 @@
14315 return !list_empty(&sem->wait_list);
14318 +#endif /* !PREEMPT_RT_FULL */
14321 + * The functions below are the same for all rwsem implementations including
14322 + * the RT specific variant.
14328 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwsem_rt.h linux-4.14/include/linux/rwsem_rt.h
14329 --- linux-4.14.orig/include/linux/rwsem_rt.h 1970-01-01 01:00:00.000000000 +0100
14330 +++ linux-4.14/include/linux/rwsem_rt.h 2018-09-05 11:05:07.000000000 +0200
14332 +#ifndef _LINUX_RWSEM_RT_H
14333 +#define _LINUX_RWSEM_RT_H
14335 +#ifndef _LINUX_RWSEM_H
14336 +#error "Include rwsem.h"
14339 +#include <linux/rtmutex.h>
14340 +#include <linux/swait.h>
14342 +#define READER_BIAS (1U << 31)
14343 +#define WRITER_BIAS (1U << 30)
14345 +struct rw_semaphore {
14346 + atomic_t readers;
14347 + struct rt_mutex rtmutex;
14348 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14349 + struct lockdep_map dep_map;
14353 +#define __RWSEM_INITIALIZER(name) \
14355 + .readers = ATOMIC_INIT(READER_BIAS), \
14356 + .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \
14357 + RW_DEP_MAP_INIT(name) \
14360 +#define DECLARE_RWSEM(lockname) \
14361 + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14363 +extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name,
14364 + struct lock_class_key *key);
14366 +#define __init_rwsem(sem, name, key) \
14368 + rt_mutex_init(&(sem)->rtmutex); \
14369 + __rwsem_init((sem), (name), (key)); \
14372 +#define init_rwsem(sem) \
14374 + static struct lock_class_key __key; \
14376 + __init_rwsem((sem), #sem, &__key); \
14379 +static inline int rwsem_is_locked(struct rw_semaphore *sem)
14381 + return atomic_read(&sem->readers) != READER_BIAS;
14384 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
14386 + return atomic_read(&sem->readers) > 0;
14389 +extern void __down_read(struct rw_semaphore *sem);
14390 +extern int __down_read_trylock(struct rw_semaphore *sem);
14391 +extern void __down_write(struct rw_semaphore *sem);
14392 +extern int __must_check __down_write_killable(struct rw_semaphore *sem);
14393 +extern int __down_write_trylock(struct rw_semaphore *sem);
14394 +extern void __up_read(struct rw_semaphore *sem);
14395 +extern void __up_write(struct rw_semaphore *sem);
14396 +extern void __downgrade_write(struct rw_semaphore *sem);
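
As with the rwlock, the RT rw_semaphore preserves the public API, so the usual down_read()/down_write() pairs keep working on top of the rtmutex-based type. A short sketch under that assumption; my_sem, my_data and the two helpers are illustrative.

#include <linux/rwsem.h>

static DECLARE_RWSEM(my_sem);                   /* illustrative semaphore */
static int my_data;

static int my_read_data(void)
{
        int v;

        down_read(&my_sem);
        v = my_data;
        up_read(&my_sem);
        return v;
}

static void my_write_data(int v)
{
        down_write(&my_sem);
        my_data = v;
        up_write(&my_sem);
}
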
14399 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched/mm.h linux-4.14/include/linux/sched/mm.h
14400 --- linux-4.14.orig/include/linux/sched/mm.h 2017-11-12 19:46:13.000000000 +0100
14401 +++ linux-4.14/include/linux/sched/mm.h 2018-09-05 11:05:07.000000000 +0200
14406 +#ifdef CONFIG_PREEMPT_RT_BASE
14407 +extern void __mmdrop_delayed(struct rcu_head *rhp);
14408 +static inline void mmdrop_delayed(struct mm_struct *mm)
14410 + if (atomic_dec_and_test(&mm->mm_count))
14411 + call_rcu(&mm->delayed_drop, __mmdrop_delayed);
14414 +# define mmdrop_delayed(mm) mmdrop(mm)
14417 static inline void mmdrop_async_fn(struct work_struct *work)
14419 struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
14420 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched/task.h linux-4.14/include/linux/sched/task.h
14421 --- linux-4.14.orig/include/linux/sched/task.h 2018-09-05 11:03:22.000000000 +0200
14422 +++ linux-4.14/include/linux/sched/task.h 2018-09-05 11:05:07.000000000 +0200
14425 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
14427 +#ifdef CONFIG_PREEMPT_RT_BASE
14428 +extern void __put_task_struct_cb(struct rcu_head *rhp);
14430 +static inline void put_task_struct(struct task_struct *t)
14432 + if (atomic_dec_and_test(&t->usage))
14433 + call_rcu(&t->put_rcu, __put_task_struct_cb);
14436 extern void __put_task_struct(struct task_struct *t);
14438 static inline void put_task_struct(struct task_struct *t)
14440 if (atomic_dec_and_test(&t->usage))
14441 __put_task_struct(t);
14445 struct task_struct *task_rcu_dereference(struct task_struct **ptask);
14447 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
14448 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched/wake_q.h linux-4.14/include/linux/sched/wake_q.h
14449 --- linux-4.14.orig/include/linux/sched/wake_q.h 2017-11-12 19:46:13.000000000 +0100
14450 +++ linux-4.14/include/linux/sched/wake_q.h 2018-09-05 11:05:07.000000000 +0200
14452 head->lastp = &head->first;
14455 -extern void wake_q_add(struct wake_q_head *head,
14456 - struct task_struct *task);
14457 -extern void wake_up_q(struct wake_q_head *head);
14458 +extern void __wake_q_add(struct wake_q_head *head,
14459 + struct task_struct *task, bool sleeper);
14460 +static inline void wake_q_add(struct wake_q_head *head,
14461 + struct task_struct *task)
14463 + __wake_q_add(head, task, false);
14466 +static inline void wake_q_add_sleeper(struct wake_q_head *head,
14467 + struct task_struct *task)
14469 + __wake_q_add(head, task, true);
14472 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
14473 +static inline void wake_up_q(struct wake_q_head *head)
14475 + __wake_up_q(head, false);
14478 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
14480 + __wake_up_q(head, true);
14483 #endif /* _LINUX_SCHED_WAKE_Q_H */
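
The split into __wake_q_add()/__wake_up_q() adds a "sleeper" flavour used internally by the rtmutex code for tasks blocked on sleeping spinlocks; ordinary users keep the existing wake_q interface unchanged, as sketched below (my_wake_waiter() is an illustrative caller).

#include <linux/sched.h>
#include <linux/sched/wake_q.h>

static void my_wake_waiter(struct task_struct *waiter)
{
        DEFINE_WAKE_Q(wq);

        /* Typically done while holding the lock that protects 'waiter'. */
        wake_q_add(&wq, waiter);

        /* Issue the actual wakeup after the lock has been dropped. */
        wake_up_q(&wq);
}
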
14484 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched.h linux-4.14/include/linux/sched.h
14485 --- linux-4.14.orig/include/linux/sched.h 2018-09-05 11:03:22.000000000 +0200
14486 +++ linux-4.14/include/linux/sched.h 2018-09-05 11:05:07.000000000 +0200
14488 #include <linux/signal_types.h>
14489 #include <linux/mm_types_task.h>
14490 #include <linux/task_io_accounting.h>
14491 +#include <asm/kmap_types.h>
14493 /* task_struct member predeclarations (sorted alphabetically): */
14494 struct audit_context;
14497 /* Convenience macros for the sake of wake_up(): */
14498 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
14499 -#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
14501 /* get_task_state(): */
14502 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
14503 @@ -101,12 +101,8 @@
14504 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
14507 -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
14509 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
14511 -#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14513 #define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14514 (task->flags & PF_FROZEN) == 0 && \
14515 (task->state & TASK_NOLOAD) == 0)
14516 @@ -134,6 +130,11 @@
14517 smp_store_mb(current->state, (state_value)); \
14520 +#define __set_current_state_no_track(state_value) \
14521 + current->state = (state_value);
14522 +#define set_current_state_no_track(state_value) \
14523 + smp_store_mb(current->state, (state_value));
14525 #define set_special_state(state_value) \
14527 unsigned long flags; /* may shadow */ \
14528 @@ -187,6 +188,9 @@
14529 #define set_current_state(state_value) \
14530 smp_store_mb(current->state, (state_value))
14532 +#define __set_current_state_no_track(state_value) __set_current_state(state_value)
14533 +#define set_current_state_no_track(state_value) set_current_state(state_value)
14536 * set_special_state() should be used for those states when the blocking task
14537 * can not use the regular condition based wait-loop. In that case we must
14538 @@ -566,6 +570,8 @@
14540 /* -1 unrunnable, 0 runnable, >0 stopped: */
14541 volatile long state;
14542 + /* saved state for "spinlock sleepers" */
14543 + volatile long saved_state;
14546 * This begins the randomizable portion of task_struct. Only
14547 @@ -618,7 +624,25 @@
14549 unsigned int policy;
14550 int nr_cpus_allowed;
14551 - cpumask_t cpus_allowed;
14552 + const cpumask_t *cpus_ptr;
14553 + cpumask_t cpus_mask;
14554 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
14555 + int migrate_disable;
14556 + int migrate_disable_update;
14557 + int pinned_on_cpu;
14558 +# ifdef CONFIG_SCHED_DEBUG
14559 + int migrate_disable_atomic;
14562 +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14563 + int migrate_disable;
14564 +# ifdef CONFIG_SCHED_DEBUG
14565 + int migrate_disable_atomic;
14568 +#ifdef CONFIG_PREEMPT_RT_FULL
14569 + int sleeping_lock;
14572 #ifdef CONFIG_PREEMPT_RCU
14573 int rcu_read_lock_nesting;
14574 @@ -777,6 +801,9 @@
14575 #ifdef CONFIG_POSIX_TIMERS
14576 struct task_cputime cputime_expires;
14577 struct list_head cpu_timers[3];
14578 +#ifdef CONFIG_PREEMPT_RT_BASE
14579 + struct task_struct *posix_timer_list;
14583 /* Process credentials: */
14584 @@ -820,11 +847,17 @@
14585 /* Signal handlers: */
14586 struct signal_struct *signal;
14587 struct sighand_struct *sighand;
14588 + struct sigqueue *sigqueue_cache;
14591 sigset_t real_blocked;
14592 /* Restored if set_restore_sigmask() was used: */
14593 sigset_t saved_sigmask;
14594 struct sigpending pending;
14595 +#ifdef CONFIG_PREEMPT_RT_FULL
14596 + /* TODO: move me into ->restart_block ? */
14597 + struct siginfo forced_info;
14599 unsigned long sas_ss_sp;
14600 size_t sas_ss_size;
14601 unsigned int sas_ss_flags;
14602 @@ -849,6 +882,7 @@
14603 raw_spinlock_t pi_lock;
14605 struct wake_q_node wake_q;
14606 + struct wake_q_node wake_q_sleeper;
14608 #ifdef CONFIG_RT_MUTEXES
14609 /* PI waiters blocked on a rt_mutex held by this task: */
14610 @@ -1116,9 +1150,23 @@
14611 unsigned int sequential_io;
14612 unsigned int sequential_io_avg;
14614 +#ifdef CONFIG_PREEMPT_RT_BASE
14615 + struct rcu_head put_rcu;
14616 + int softirq_nestcnt;
14617 + unsigned int softirqs_raised;
14619 +#ifdef CONFIG_PREEMPT_RT_FULL
14620 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
14622 + pte_t kmap_pte[KM_TYPE_NR];
14625 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
14626 unsigned long task_state_change;
14628 +#ifdef CONFIG_PREEMPT_RT_FULL
14629 + int xmit_recursion;
14631 int pagefault_disabled;
14633 struct task_struct *oom_reaper_list;
14634 @@ -1332,6 +1380,7 @@
14636 * Per process flags
14638 +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */
14639 #define PF_IDLE 0x00000002 /* I am an IDLE thread */
14640 #define PF_EXITING 0x00000004 /* Getting shut down */
14641 #define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */
14642 @@ -1355,7 +1404,7 @@
14643 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
14644 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
14645 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
14646 -#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
14647 +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
14648 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
14649 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
14650 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
14651 @@ -1535,6 +1584,7 @@
14653 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
14654 extern int wake_up_process(struct task_struct *tsk);
14655 +extern int wake_up_lock_sleeper(struct task_struct *tsk);
14656 extern void wake_up_new_task(struct task_struct *tsk);
14659 @@ -1611,6 +1661,89 @@
14660 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
14663 +#ifdef CONFIG_PREEMPT_LAZY
14664 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
14666 + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14669 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
14671 + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14674 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
14676 + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
14679 +static inline int need_resched_lazy(void)
14681 + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
14684 +static inline int need_resched_now(void)
14686 + return test_thread_flag(TIF_NEED_RESCHED);
14690 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
14691 +static inline int need_resched_lazy(void) { return 0; }
14693 +static inline int need_resched_now(void)
14695 + return test_thread_flag(TIF_NEED_RESCHED);
14701 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
14703 + if (task->state & (__TASK_STOPPED | __TASK_TRACED))
14705 +#ifdef CONFIG_PREEMPT_RT_FULL
14706 + if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
14712 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
14714 + bool traced_stopped;
14716 +#ifdef CONFIG_PREEMPT_RT_FULL
14717 + unsigned long flags;
14719 + raw_spin_lock_irqsave(&task->pi_lock, flags);
14720 + traced_stopped = __task_is_stopped_or_traced(task);
14721 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14723 + traced_stopped = __task_is_stopped_or_traced(task);
14725 + return traced_stopped;
14728 +static inline bool task_is_traced(struct task_struct *task)
14730 + bool traced = false;
14732 + if (task->state & __TASK_TRACED)
14734 +#ifdef CONFIG_PREEMPT_RT_FULL
14735 + /* in case the task is sleeping on tasklist_lock */
14736 + raw_spin_lock_irq(&task->pi_lock);
14737 + if (task->state & __TASK_TRACED)
14739 + else if (task->saved_state & __TASK_TRACED)
14741 + raw_spin_unlock_irq(&task->pi_lock);
14747 * cond_resched() and cond_resched_lock(): latency reduction via
14748 * explicit rescheduling in places that are safe. The return
14749 @@ -1636,12 +1769,16 @@
14750 __cond_resched_lock(lock); \
14753 +#ifndef CONFIG_PREEMPT_RT_FULL
14754 extern int __cond_resched_softirq(void);
14756 #define cond_resched_softirq() ({ \
14757 ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
14758 __cond_resched_softirq(); \
14761 +# define cond_resched_softirq() cond_resched()
14764 static inline void cond_resched_rcu(void)
14766 @@ -1671,6 +1808,23 @@
14767 return unlikely(tif_need_resched());
14770 +#ifdef CONFIG_PREEMPT_RT_FULL
14771 +static inline void sleeping_lock_inc(void)
14773 + current->sleeping_lock++;
14776 +static inline void sleeping_lock_dec(void)
14778 + current->sleeping_lock--;
14783 +static inline void sleeping_lock_inc(void) { }
14784 +static inline void sleeping_lock_dec(void) { }
14788 * Wrappers for p->thread_info->cpu access. No-op on UP.
14790 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/seqlock.h linux-4.14/include/linux/seqlock.h
14791 --- linux-4.14.orig/include/linux/seqlock.h 2017-11-12 19:46:13.000000000 +0100
14792 +++ linux-4.14/include/linux/seqlock.h 2018-09-05 11:05:07.000000000 +0200
14793 @@ -221,20 +221,30 @@
14794 return __read_seqcount_retry(s, start);
14799 -static inline void raw_write_seqcount_begin(seqcount_t *s)
14800 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
14806 -static inline void raw_write_seqcount_end(seqcount_t *s)
14807 +static inline void raw_write_seqcount_begin(seqcount_t *s)
14809 + preempt_disable_rt();
14810 + __raw_write_seqcount_begin(s);
14813 +static inline void __raw_write_seqcount_end(seqcount_t *s)
14819 +static inline void raw_write_seqcount_end(seqcount_t *s)
14821 + __raw_write_seqcount_end(s);
14822 + preempt_enable_rt();
14826 * raw_write_seqcount_barrier - do a seq write barrier
14827 * @s: pointer to seqcount_t
14828 @@ -429,10 +439,32 @@
14830 * Read side functions for starting and finalizing a read side section.
14832 +#ifndef CONFIG_PREEMPT_RT_FULL
14833 static inline unsigned read_seqbegin(const seqlock_t *sl)
14835 return read_seqcount_begin(&sl->seqcount);
14839 + * Starvation safe read side for RT
14841 +static inline unsigned read_seqbegin(seqlock_t *sl)
14846 + ret = ACCESS_ONCE(sl->seqcount.sequence);
14847 + if (unlikely(ret & 1)) {
14849 + * Take the lock and let the writer proceed (i.e. possibly
14850 + * boost it), otherwise we could loop here forever.
14852 + spin_unlock_wait(&sl->lock);
14859 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
14861 @@ -447,36 +479,45 @@
14862 static inline void write_seqlock(seqlock_t *sl)
14864 spin_lock(&sl->lock);
14865 - write_seqcount_begin(&sl->seqcount);
14866 + __raw_write_seqcount_begin(&sl->seqcount);
14869 +static inline int try_write_seqlock(seqlock_t *sl)
14871 + if (spin_trylock(&sl->lock)) {
14872 + __raw_write_seqcount_begin(&sl->seqcount);
14878 static inline void write_sequnlock(seqlock_t *sl)
14880 - write_seqcount_end(&sl->seqcount);
14881 + __raw_write_seqcount_end(&sl->seqcount);
14882 spin_unlock(&sl->lock);
14885 static inline void write_seqlock_bh(seqlock_t *sl)
14887 spin_lock_bh(&sl->lock);
14888 - write_seqcount_begin(&sl->seqcount);
14889 + __raw_write_seqcount_begin(&sl->seqcount);
14892 static inline void write_sequnlock_bh(seqlock_t *sl)
14894 - write_seqcount_end(&sl->seqcount);
14895 + __raw_write_seqcount_end(&sl->seqcount);
14896 spin_unlock_bh(&sl->lock);
14899 static inline void write_seqlock_irq(seqlock_t *sl)
14901 spin_lock_irq(&sl->lock);
14902 - write_seqcount_begin(&sl->seqcount);
14903 + __raw_write_seqcount_begin(&sl->seqcount);
14906 static inline void write_sequnlock_irq(seqlock_t *sl)
14908 - write_seqcount_end(&sl->seqcount);
14909 + __raw_write_seqcount_end(&sl->seqcount);
14910 spin_unlock_irq(&sl->lock);
14913 @@ -485,7 +526,7 @@
14914 unsigned long flags;
14916 spin_lock_irqsave(&sl->lock, flags);
14917 - write_seqcount_begin(&sl->seqcount);
14918 + __raw_write_seqcount_begin(&sl->seqcount);
14922 @@ -495,7 +536,7 @@
14924 write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
14926 - write_seqcount_end(&sl->seqcount);
14927 + __raw_write_seqcount_end(&sl->seqcount);
14928 spin_unlock_irqrestore(&sl->lock, flags);
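The RT read side above lets a reader briefly block on the writer's lock instead of spinning when it observes an odd sequence count, so a preempted writer can be boosted rather than starving the reader. A minimal reader sketch, assuming a hypothetical statistic protected by a seqlock_t (not part of the patch); the same source works on RT and non-RT kernels:

static u64 example_read_stat(seqlock_t *sl, const u64 *stat)
{
        unsigned int seq;
        u64 val;

        do {
                /* On RT this may block briefly on sl->lock while a
                 * writer is active; on !RT it spins as usual. */
                seq = read_seqbegin(sl);
                val = *stat;
        } while (read_seqretry(sl, seq));

        return val;
}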
14931 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/signal.h linux-4.14/include/linux/signal.h
14932 --- linux-4.14.orig/include/linux/signal.h 2017-11-12 19:46:13.000000000 +0100
14933 +++ linux-4.14/include/linux/signal.h 2018-09-05 11:05:07.000000000 +0200
14934 @@ -243,6 +243,7 @@
14937 extern void flush_sigqueue(struct sigpending *queue);
14938 +extern void flush_task_sigqueue(struct task_struct *tsk);
14940 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
14941 static inline int valid_signal(unsigned long sig)
14942 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/skbuff.h linux-4.14/include/linux/skbuff.h
14943 --- linux-4.14.orig/include/linux/skbuff.h 2018-09-05 11:03:22.000000000 +0200
14944 +++ linux-4.14/include/linux/skbuff.h 2018-09-05 11:05:07.000000000 +0200
14945 @@ -287,6 +287,7 @@
14949 + raw_spinlock_t raw_lock;
14953 @@ -1667,6 +1668,12 @@
14954 __skb_queue_head_init(list);
14957 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
14959 + raw_spin_lock_init(&list->raw_lock);
14960 + __skb_queue_head_init(list);
14963 static inline void skb_queue_head_init_class(struct sk_buff_head *list,
14964 struct lock_class_key *class)
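The extra raw_lock gives sk_buff_head users a lock that still spins on RT. A hedged sketch of the intended pattern (queue and function names are hypothetical): initialise with skb_queue_head_init_raw() and manipulate the queue under the raw lock with the __skb_queue_*() helpers from contexts that must not sleep:

static struct sk_buff_head example_q;

static void example_queue_init(void)
{
        skb_queue_head_init_raw(&example_q);
}

static void example_enqueue(struct sk_buff *skb)
{
        unsigned long flags;

        /* raw_lock remains a true spinning lock on RT, so this is
         * usable from hard interrupt context. */
        raw_spin_lock_irqsave(&example_q.raw_lock, flags);
        __skb_queue_tail(&example_q, skb);
        raw_spin_unlock_irqrestore(&example_q.raw_lock, flags);
}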
14966 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/smp.h linux-4.14/include/linux/smp.h
14967 --- linux-4.14.orig/include/linux/smp.h 2017-11-12 19:46:13.000000000 +0100
14968 +++ linux-4.14/include/linux/smp.h 2018-09-05 11:05:07.000000000 +0200
14969 @@ -202,6 +202,9 @@
14970 #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
14971 #define put_cpu() preempt_enable()
14973 +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
14974 +#define put_cpu_light() migrate_enable()
14977 * Callback to arch code if there's nosmp or maxcpus=0 on the
14978 * boot command line:
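get_cpu_light()/put_cpu_light() are the RT-friendly counterparts of get_cpu()/put_cpu(): they disable migration but leave preemption enabled, so the section in between may sleep (for instance on a spinlock_t). A sketch for code that only needs a stable CPU number (function name is hypothetical):

static void example_log_cpu(void)
{
        int cpu = get_cpu_light();      /* stay on this CPU */

        pr_debug("servicing request on CPU %d\n", cpu);
        put_cpu_light();                /* allow migration again */
}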
14979 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_api_smp.h linux-4.14/include/linux/spinlock_api_smp.h
14980 --- linux-4.14.orig/include/linux/spinlock_api_smp.h 2017-11-12 19:46:13.000000000 +0100
14981 +++ linux-4.14/include/linux/spinlock_api_smp.h 2018-09-05 11:05:07.000000000 +0200
14982 @@ -187,6 +187,8 @@
14986 -#include <linux/rwlock_api_smp.h>
14987 +#ifndef CONFIG_PREEMPT_RT_FULL
14988 +# include <linux/rwlock_api_smp.h>
14991 #endif /* __LINUX_SPINLOCK_API_SMP_H */
14992 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock.h linux-4.14/include/linux/spinlock.h
14993 --- linux-4.14.orig/include/linux/spinlock.h 2017-11-12 19:46:13.000000000 +0100
14994 +++ linux-4.14/include/linux/spinlock.h 2018-09-05 11:05:07.000000000 +0200
14995 @@ -286,7 +286,11 @@
14996 #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock))
14998 /* Include rwlock functions */
14999 -#include <linux/rwlock.h>
15000 +#ifdef CONFIG_PREEMPT_RT_FULL
15001 +# include <linux/rwlock_rt.h>
15003 +# include <linux/rwlock.h>
15007 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
15008 @@ -297,6 +301,10 @@
15009 # include <linux/spinlock_api_up.h>
15012 +#ifdef CONFIG_PREEMPT_RT_FULL
15013 +# include <linux/spinlock_rt.h>
15014 +#else /* PREEMPT_RT_FULL */
15017 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
15019 @@ -421,4 +429,6 @@
15020 #define atomic_dec_and_lock(atomic, lock) \
15021 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
15023 +#endif /* !PREEMPT_RT_FULL */
15025 #endif /* __LINUX_SPINLOCK_H */
15026 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_rt.h linux-4.14/include/linux/spinlock_rt.h
15027 --- linux-4.14.orig/include/linux/spinlock_rt.h 1970-01-01 01:00:00.000000000 +0100
15028 +++ linux-4.14/include/linux/spinlock_rt.h 2018-09-05 11:05:07.000000000 +0200
15030 +#ifndef __LINUX_SPINLOCK_RT_H
15031 +#define __LINUX_SPINLOCK_RT_H
15033 +#ifndef __LINUX_SPINLOCK_H
15034 +#error Do not include directly. Use spinlock.h
15037 +#include <linux/bug.h>
15040 +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key);
15042 +#define spin_lock_init(slock) \
15044 + static struct lock_class_key __key; \
15046 + rt_mutex_init(&(slock)->lock); \
15047 + __rt_spin_lock_init(slock, #slock, &__key); \
15050 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
15051 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
15052 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
15053 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
15054 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
15055 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
15056 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
15057 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
15058 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
15061 + * lockdep-less calls, for derived types like rwlock:
15062 + * (for trylock they can use rt_mutex_trylock() directly).
15063 + * Migrate disable handling must be done at the call site.
15065 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
15066 +extern void __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
15067 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
15069 +#define spin_lock(lock) rt_spin_lock(lock)
15071 +#define spin_lock_bh(lock) \
15073 + local_bh_disable(); \
15074 + rt_spin_lock(lock); \
15077 +#define spin_lock_irq(lock) spin_lock(lock)
15079 +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
15081 +#define spin_trylock(lock) \
15084 + __locked = spin_do_trylock(lock); \
15088 +#ifdef CONFIG_LOCKDEP
15089 +# define spin_lock_nested(lock, subclass) \
15091 + rt_spin_lock_nested(lock, subclass); \
15094 +#define spin_lock_bh_nested(lock, subclass) \
15096 + local_bh_disable(); \
15097 + rt_spin_lock_nested(lock, subclass); \
15100 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15102 + typecheck(unsigned long, flags); \
15104 + rt_spin_lock_nested(lock, subclass); \
15107 +# define spin_lock_nested(lock, subclass) spin_lock(lock)
15108 +# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(lock)
15110 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15112 + typecheck(unsigned long, flags); \
15114 + spin_lock(lock); \
15118 +#define spin_lock_irqsave(lock, flags) \
15120 + typecheck(unsigned long, flags); \
15122 + spin_lock(lock); \
15125 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
15127 + unsigned long flags = 0;
15128 +#ifdef CONFIG_TRACE_IRQFLAGS
15129 + flags = rt_spin_lock_trace_flags(lock);
15131 + spin_lock(lock); /* lock_local */
15136 +/* FIXME: we need rt_spin_lock_nest_lock */
15137 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15139 +#define spin_unlock(lock) rt_spin_unlock(lock)
15141 +#define spin_unlock_bh(lock) \
15143 + rt_spin_unlock(lock); \
15144 + local_bh_enable(); \
15147 +#define spin_unlock_irq(lock) spin_unlock(lock)
15149 +#define spin_unlock_irqrestore(lock, flags) \
15151 + typecheck(unsigned long, flags); \
15153 + spin_unlock(lock); \
15156 +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
15157 +#define spin_trylock_irq(lock) spin_trylock(lock)
15159 +#define spin_trylock_irqsave(lock, flags) \
15160 + rt_spin_trylock_irqsave(lock, &(flags))
15162 +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
15164 +#ifdef CONFIG_GENERIC_LOCKBREAK
15165 +# define spin_is_contended(lock) ((lock)->break_lock)
15167 +# define spin_is_contended(lock) (((void)(lock), 0))
15170 +static inline int spin_can_lock(spinlock_t *lock)
15172 + return !rt_mutex_is_locked(&lock->lock);
15175 +static inline int spin_is_locked(spinlock_t *lock)
15177 + return rt_mutex_is_locked(&lock->lock);
15180 +static inline void assert_spin_locked(spinlock_t *lock)
15182 + BUG_ON(!spin_is_locked(lock));
15185 +#define atomic_dec_and_lock(atomic, lock) \
15186 + atomic_dec_and_spin_lock(atomic, lock)
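With this header in place, spinlock_t users compile unchanged on PREEMPT_RT_FULL; spin_lock()/spin_unlock() simply resolve to the rtmutex-backed rt_spin_lock()/rt_spin_unlock(). An illustrative (hypothetical) user:

static DEFINE_SPINLOCK(example_lock);
static unsigned long example_count;

static void example_inc(void)
{
        unsigned long flags;

        /* On RT this acquires a sleeping rtmutex-backed lock and the
         * flags are not actually used; on !RT it is the usual spinlock
         * with interrupts disabled. */
        spin_lock_irqsave(&example_lock, flags);
        example_count++;
        spin_unlock_irqrestore(&example_lock, flags);
}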
15189 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types.h linux-4.14/include/linux/spinlock_types.h
15190 --- linux-4.14.orig/include/linux/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
15191 +++ linux-4.14/include/linux/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
15193 * Released under the General Public License (GPL).
15196 -#if defined(CONFIG_SMP)
15197 -# include <asm/spinlock_types.h>
15199 -# include <linux/spinlock_types_up.h>
15202 -#include <linux/lockdep.h>
15204 -typedef struct raw_spinlock {
15205 - arch_spinlock_t raw_lock;
15206 -#ifdef CONFIG_GENERIC_LOCKBREAK
15207 - unsigned int break_lock;
15209 -#ifdef CONFIG_DEBUG_SPINLOCK
15210 - unsigned int magic, owner_cpu;
15213 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15214 - struct lockdep_map dep_map;
15218 -#define SPINLOCK_MAGIC 0xdead4ead
15220 -#define SPINLOCK_OWNER_INIT ((void *)-1L)
15222 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15223 -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
15225 -# define SPIN_DEP_MAP_INIT(lockname)
15227 +#include <linux/spinlock_types_raw.h>
15229 -#ifdef CONFIG_DEBUG_SPINLOCK
15230 -# define SPIN_DEBUG_INIT(lockname) \
15231 - .magic = SPINLOCK_MAGIC, \
15232 - .owner_cpu = -1, \
15233 - .owner = SPINLOCK_OWNER_INIT,
15234 +#ifndef CONFIG_PREEMPT_RT_FULL
15235 +# include <linux/spinlock_types_nort.h>
15236 +# include <linux/rwlock_types.h>
15238 -# define SPIN_DEBUG_INIT(lockname)
15241 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15243 - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15244 - SPIN_DEBUG_INIT(lockname) \
15245 - SPIN_DEP_MAP_INIT(lockname) }
15247 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15248 - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15250 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15252 -typedef struct spinlock {
15254 - struct raw_spinlock rlock;
15256 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15257 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15259 - u8 __padding[LOCK_PADSIZE];
15260 - struct lockdep_map dep_map;
15262 +# include <linux/rtmutex.h>
15263 +# include <linux/spinlock_types_rt.h>
15264 +# include <linux/rwlock_types_rt.h>
15269 -#define __SPIN_LOCK_INITIALIZER(lockname) \
15270 - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15272 -#define __SPIN_LOCK_UNLOCKED(lockname) \
15273 - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15275 -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15277 -#include <linux/rwlock_types.h>
15279 #endif /* __LINUX_SPINLOCK_TYPES_H */
15280 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_nort.h linux-4.14/include/linux/spinlock_types_nort.h
15281 --- linux-4.14.orig/include/linux/spinlock_types_nort.h 1970-01-01 01:00:00.000000000 +0100
15282 +++ linux-4.14/include/linux/spinlock_types_nort.h 2018-09-05 11:05:07.000000000 +0200
15284 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15285 +#define __LINUX_SPINLOCK_TYPES_NORT_H
15287 +#ifndef __LINUX_SPINLOCK_TYPES_H
15288 +#error "Do not include directly. Include spinlock_types.h instead"
15292 + * The non-RT version maps spinlocks to raw_spinlocks
15294 +typedef struct spinlock {
15296 + struct raw_spinlock rlock;
15298 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15299 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15301 + u8 __padding[LOCK_PADSIZE];
15302 + struct lockdep_map dep_map;
15308 +#define __SPIN_LOCK_INITIALIZER(lockname) \
15309 + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15311 +#define __SPIN_LOCK_UNLOCKED(lockname) \
15312 + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15314 +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15317 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_raw.h linux-4.14/include/linux/spinlock_types_raw.h
15318 --- linux-4.14.orig/include/linux/spinlock_types_raw.h 1970-01-01 01:00:00.000000000 +0100
15319 +++ linux-4.14/include/linux/spinlock_types_raw.h 2018-09-05 11:05:07.000000000 +0200
15321 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15322 +#define __LINUX_SPINLOCK_TYPES_RAW_H
15324 +#include <linux/types.h>
15326 +#if defined(CONFIG_SMP)
15327 +# include <asm/spinlock_types.h>
15329 +# include <linux/spinlock_types_up.h>
15332 +#include <linux/lockdep.h>
15334 +typedef struct raw_spinlock {
15335 + arch_spinlock_t raw_lock;
15336 +#ifdef CONFIG_GENERIC_LOCKBREAK
15337 + unsigned int break_lock;
15339 +#ifdef CONFIG_DEBUG_SPINLOCK
15340 + unsigned int magic, owner_cpu;
15343 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15344 + struct lockdep_map dep_map;
15348 +#define SPINLOCK_MAGIC 0xdead4ead
15350 +#define SPINLOCK_OWNER_INIT ((void *)-1L)
15352 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15353 +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
15355 +# define SPIN_DEP_MAP_INIT(lockname)
15358 +#ifdef CONFIG_DEBUG_SPINLOCK
15359 +# define SPIN_DEBUG_INIT(lockname) \
15360 + .magic = SPINLOCK_MAGIC, \
15361 + .owner_cpu = -1, \
15362 + .owner = SPINLOCK_OWNER_INIT,
15364 +# define SPIN_DEBUG_INIT(lockname)
15367 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15369 + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15370 + SPIN_DEBUG_INIT(lockname) \
15371 + SPIN_DEP_MAP_INIT(lockname) }
15373 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15374 + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15376 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15379 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_rt.h linux-4.14/include/linux/spinlock_types_rt.h
15380 --- linux-4.14.orig/include/linux/spinlock_types_rt.h 1970-01-01 01:00:00.000000000 +0100
15381 +++ linux-4.14/include/linux/spinlock_types_rt.h 2018-09-05 11:05:07.000000000 +0200
15383 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15384 +#define __LINUX_SPINLOCK_TYPES_RT_H
15386 +#ifndef __LINUX_SPINLOCK_TYPES_H
15387 +#error "Do not include directly. Include spinlock_types.h instead"
15390 +#include <linux/cache.h>
15393 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15395 +typedef struct spinlock {
15396 + struct rt_mutex lock;
15397 + unsigned int break_lock;
15398 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15399 + struct lockdep_map dep_map;
15403 +#ifdef CONFIG_DEBUG_RT_MUTEXES
15404 +# define __RT_SPIN_INITIALIZER(name) \
15406 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15407 + .save_state = 1, \
15408 + .file = __FILE__, \
15409 + .line = __LINE__ , \
15412 +# define __RT_SPIN_INITIALIZER(name) \
15414 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15415 + .save_state = 1, \
15420 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15423 +#define __SPIN_LOCK_UNLOCKED(name) \
15424 + { .lock = __RT_SPIN_INITIALIZER(name.lock), \
15425 + SPIN_DEP_MAP_INIT(name) }
15427 +#define DEFINE_SPINLOCK(name) \
15428 + spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15431 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_up.h linux-4.14/include/linux/spinlock_types_up.h
15432 --- linux-4.14.orig/include/linux/spinlock_types_up.h 2017-11-12 19:46:13.000000000 +0100
15433 +++ linux-4.14/include/linux/spinlock_types_up.h 2018-09-05 11:05:07.000000000 +0200
15435 #ifndef __LINUX_SPINLOCK_TYPES_UP_H
15436 #define __LINUX_SPINLOCK_TYPES_UP_H
15438 -#ifndef __LINUX_SPINLOCK_TYPES_H
15439 -# error "please don't include this file directly"
15443 * include/linux/spinlock_types_up.h - spinlock type definitions for UP
15445 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/srcutiny.h linux-4.14/include/linux/srcutiny.h
15446 --- linux-4.14.orig/include/linux/srcutiny.h 2017-11-12 19:46:13.000000000 +0100
15447 +++ linux-4.14/include/linux/srcutiny.h 2018-09-05 11:05:07.000000000 +0200
15450 void srcu_drive_gp(struct work_struct *wp);
15452 -#define __SRCU_STRUCT_INIT(name) \
15453 +#define __SRCU_STRUCT_INIT(name, __ignored) \
15455 .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
15456 .srcu_cb_tail = &name.srcu_cb_head, \
15458 * Tree SRCU, which needs some per-CPU data.
15460 #define DEFINE_SRCU(name) \
15461 - struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15462 + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
15463 #define DEFINE_STATIC_SRCU(name) \
15464 - static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15465 + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
15467 void synchronize_srcu(struct srcu_struct *sp);
15469 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/srcutree.h linux-4.14/include/linux/srcutree.h
15470 --- linux-4.14.orig/include/linux/srcutree.h 2017-11-12 19:46:13.000000000 +0100
15471 +++ linux-4.14/include/linux/srcutree.h 2018-09-05 11:05:07.000000000 +0200
15473 unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
15475 /* Update-side state. */
15476 - raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp;
15477 + spinlock_t __private lock ____cacheline_internodealigned_in_smp;
15478 struct rcu_segcblist srcu_cblist; /* List of callbacks.*/
15479 unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */
15480 unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */
15482 * Node in SRCU combining tree, similar in function to rcu_data.
15485 - raw_spinlock_t __private lock;
15486 + spinlock_t __private lock;
15487 unsigned long srcu_have_cbs[4]; /* GP seq for children */
15488 /* having CBs, but only */
15489 /* is > ->srcu_gq_seq. */
15491 struct srcu_node *level[RCU_NUM_LVLS + 1];
15492 /* First node at each level. */
15493 struct mutex srcu_cb_mutex; /* Serialize CB preparation. */
15494 - raw_spinlock_t __private lock; /* Protect counters */
15495 + spinlock_t __private lock; /* Protect counters */
15496 struct mutex srcu_gp_mutex; /* Serialize GP work. */
15497 unsigned int srcu_idx; /* Current rdr array element. */
15498 unsigned long srcu_gp_seq; /* Grace-period seq #. */
15499 @@ -104,10 +104,10 @@
15500 #define SRCU_STATE_SCAN1 1
15501 #define SRCU_STATE_SCAN2 2
15503 -#define __SRCU_STRUCT_INIT(name) \
15504 +#define __SRCU_STRUCT_INIT(name, pcpu_name) \
15506 - .sda = &name##_srcu_data, \
15507 - .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
15508 + .sda = &pcpu_name, \
15509 + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
15510 .srcu_gp_seq_needed = 0 - 1, \
15511 __SRCU_DEP_MAP_INIT(name) \
15513 @@ -133,7 +133,7 @@
15515 #define __DEFINE_SRCU(name, is_static) \
15516 static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
15517 - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15518 + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data)
15519 #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
15520 #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
15522 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/suspend.h linux-4.14/include/linux/suspend.h
15523 --- linux-4.14.orig/include/linux/suspend.h 2018-09-05 11:03:22.000000000 +0200
15524 +++ linux-4.14/include/linux/suspend.h 2018-09-05 11:05:07.000000000 +0200
15525 @@ -196,6 +196,12 @@
15529 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
15530 +extern bool pm_in_action;
15532 +# define pm_in_action false
15535 #ifdef CONFIG_SUSPEND
15536 extern suspend_state_t mem_sleep_current;
15537 extern suspend_state_t mem_sleep_default;
15538 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/swait.h linux-4.14/include/linux/swait.h
15539 --- linux-4.14.orig/include/linux/swait.h 2017-11-12 19:46:13.000000000 +0100
15540 +++ linux-4.14/include/linux/swait.h 2018-09-05 11:05:07.000000000 +0200
15542 #include <linux/list.h>
15543 #include <linux/stddef.h>
15544 #include <linux/spinlock.h>
15545 +#include <linux/wait.h>
15546 #include <asm/current.h>
15549 @@ -147,6 +148,7 @@
15550 extern void swake_up(struct swait_queue_head *q);
15551 extern void swake_up_all(struct swait_queue_head *q);
15552 extern void swake_up_locked(struct swait_queue_head *q);
15553 +extern void swake_up_all_locked(struct swait_queue_head *q);
15555 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
15556 extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
15557 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/swap.h linux-4.14/include/linux/swap.h
15558 --- linux-4.14.orig/include/linux/swap.h 2017-11-12 19:46:13.000000000 +0100
15559 +++ linux-4.14/include/linux/swap.h 2018-09-05 11:05:07.000000000 +0200
15561 #include <linux/fs.h>
15562 #include <linux/atomic.h>
15563 #include <linux/page-flags.h>
15564 +#include <linux/locallock.h>
15565 #include <asm/page.h>
15567 struct notifier_block;
15568 @@ -297,7 +298,8 @@
15569 void *workingset_eviction(struct address_space *mapping, struct page *page);
15570 bool workingset_refault(void *shadow);
15571 void workingset_activation(struct page *page);
15572 -void workingset_update_node(struct radix_tree_node *node, void *private);
15573 +void __workingset_update_node(struct radix_tree_node *node, void *private);
15574 +DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
15576 /* linux/mm/page_alloc.c */
15577 extern unsigned long totalram_pages;
15578 @@ -310,6 +312,7 @@
15581 /* linux/mm/swap.c */
15582 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
15583 extern void lru_cache_add(struct page *);
15584 extern void lru_cache_add_anon(struct page *page);
15585 extern void lru_cache_add_file(struct page *page);
15586 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/swork.h linux-4.14/include/linux/swork.h
15587 --- linux-4.14.orig/include/linux/swork.h 1970-01-01 01:00:00.000000000 +0100
15588 +++ linux-4.14/include/linux/swork.h 2018-09-05 11:05:07.000000000 +0200
15590 +#ifndef _LINUX_SWORK_H
15591 +#define _LINUX_SWORK_H
15593 +#include <linux/list.h>
15595 +struct swork_event {
15596 + struct list_head item;
15597 + unsigned long flags;
15598 + void (*func)(struct swork_event *);
15601 +static inline void INIT_SWORK(struct swork_event *event,
15602 + void (*func)(struct swork_event *))
15604 + event->flags = 0;
15605 + event->func = func;
15608 +bool swork_queue(struct swork_event *sev);
15610 +int swork_get(void);
15611 +void swork_put(void);
15613 +#endif /* _LINUX_SWORK_H */
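A sketch of how the simple-work API declared above is meant to be used; the handler, event and init function below are hypothetical:

static void example_swork_fn(struct swork_event *sev)
{
        pr_info("deferred work executed\n");
}

static struct swork_event example_event;

static int example_init(void)
{
        int ret = swork_get();          /* ensure the worker thread exists */

        if (ret)
                return ret;
        INIT_SWORK(&example_event, example_swork_fn);
        swork_queue(&example_event);    /* example_swork_fn() runs later */
        return 0;
}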
15614 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/thread_info.h linux-4.14/include/linux/thread_info.h
15615 --- linux-4.14.orig/include/linux/thread_info.h 2018-09-05 11:03:22.000000000 +0200
15616 +++ linux-4.14/include/linux/thread_info.h 2018-09-05 11:05:07.000000000 +0200
15618 #define test_thread_flag(flag) \
15619 test_ti_thread_flag(current_thread_info(), flag)
15621 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
15622 +#ifdef CONFIG_PREEMPT_LAZY
15623 +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
15624 + test_thread_flag(TIF_NEED_RESCHED_LAZY))
15625 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
15626 +#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY))
15629 +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
15630 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
15631 +#define tif_need_resched_lazy() 0
15634 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
15635 static inline int arch_within_stack_frames(const void * const stack,
15636 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/timer.h linux-4.14/include/linux/timer.h
15637 --- linux-4.14.orig/include/linux/timer.h 2018-09-05 11:03:22.000000000 +0200
15638 +++ linux-4.14/include/linux/timer.h 2018-09-05 11:05:07.000000000 +0200
15639 @@ -213,7 +213,7 @@
15641 extern int try_to_del_timer_sync(struct timer_list *timer);
15644 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
15645 extern int del_timer_sync(struct timer_list *timer);
15647 # define del_timer_sync(t) del_timer(t)
15648 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/trace_events.h linux-4.14/include/linux/trace_events.h
15649 --- linux-4.14.orig/include/linux/trace_events.h 2017-11-12 19:46:13.000000000 +0100
15650 +++ linux-4.14/include/linux/trace_events.h 2018-09-05 11:05:07.000000000 +0200
15652 unsigned char flags;
15653 unsigned char preempt_count;
15655 + unsigned short migrate_disable;
15656 + unsigned short padding;
15657 + unsigned char preempt_lazy_count;
15660 #define TRACE_EVENT_TYPE_MAX \
15661 @@ -402,11 +405,13 @@
15663 extern int filter_match_preds(struct event_filter *filter, void *rec);
15665 -extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
15667 -extern void event_triggers_post_call(struct trace_event_file *file,
15668 - enum event_trigger_type tt,
15670 +extern enum event_trigger_type
15671 +event_triggers_call(struct trace_event_file *file, void *rec,
15672 + struct ring_buffer_event *event);
15674 +event_triggers_post_call(struct trace_event_file *file,
15675 + enum event_trigger_type tt,
15676 + void *rec, struct ring_buffer_event *event);
15678 bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);
15680 @@ -426,7 +431,7 @@
15682 if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
15683 if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
15684 - event_triggers_call(file, NULL);
15685 + event_triggers_call(file, NULL, NULL);
15686 if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
15688 if (eflags & EVENT_FILE_FL_PID_FILTER)
15689 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/uaccess.h linux-4.14/include/linux/uaccess.h
15690 --- linux-4.14.orig/include/linux/uaccess.h 2017-11-12 19:46:13.000000000 +0100
15691 +++ linux-4.14/include/linux/uaccess.h 2018-09-05 11:05:07.000000000 +0200
15692 @@ -185,6 +185,7 @@
15694 static inline void pagefault_disable(void)
15696 + migrate_disable();
15697 pagefault_disabled_inc();
15699 * make sure to have issued the store before a pagefault
15700 @@ -201,6 +202,7 @@
15703 pagefault_disabled_dec();
15704 + migrate_enable();
15708 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/vmstat.h linux-4.14/include/linux/vmstat.h
15709 --- linux-4.14.orig/include/linux/vmstat.h 2017-11-12 19:46:13.000000000 +0100
15710 +++ linux-4.14/include/linux/vmstat.h 2018-09-05 11:05:07.000000000 +0200
15713 static inline void __count_vm_event(enum vm_event_item item)
15715 + preempt_disable_rt();
15716 raw_cpu_inc(vm_event_states.event[item]);
15717 + preempt_enable_rt();
15720 static inline void count_vm_event(enum vm_event_item item)
15723 static inline void __count_vm_events(enum vm_event_item item, long delta)
15725 + preempt_disable_rt();
15726 raw_cpu_add(vm_event_states.event[item], delta);
15727 + preempt_enable_rt();
15730 static inline void count_vm_events(enum vm_event_item item, long delta)
15731 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/wait.h linux-4.14/include/linux/wait.h
15732 --- linux-4.14.orig/include/linux/wait.h 2017-11-12 19:46:13.000000000 +0100
15733 +++ linux-4.14/include/linux/wait.h 2018-09-05 11:05:07.000000000 +0200
15736 #include <asm/current.h>
15737 #include <uapi/linux/wait.h>
15738 +#include <linux/atomic.h>
15740 typedef struct wait_queue_entry wait_queue_entry_t;
15742 @@ -486,8 +487,8 @@
15744 struct hrtimer_sleeper __t; \
15746 - hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \
15747 - hrtimer_init_sleeper(&__t, current); \
15748 + hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, HRTIMER_MODE_REL, \
15750 if ((timeout) != KTIME_MAX) \
15751 hrtimer_start_range_ns(&__t.timer, timeout, \
15752 current->timer_slack_ns, \
15753 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/gen_stats.h linux-4.14/include/net/gen_stats.h
15754 --- linux-4.14.orig/include/net/gen_stats.h 2017-11-12 19:46:13.000000000 +0100
15755 +++ linux-4.14/include/net/gen_stats.h 2018-09-05 11:05:07.000000000 +0200
15757 #include <linux/socket.h>
15758 #include <linux/rtnetlink.h>
15759 #include <linux/pkt_sched.h>
15760 +#include <net/net_seq_lock.h>
15762 struct gnet_stats_basic_cpu {
15763 struct gnet_stats_basic_packed bstats;
15764 @@ -36,11 +37,11 @@
15765 spinlock_t *lock, struct gnet_dump *d,
15768 -int gnet_stats_copy_basic(const seqcount_t *running,
15769 +int gnet_stats_copy_basic(net_seqlock_t *running,
15770 struct gnet_dump *d,
15771 struct gnet_stats_basic_cpu __percpu *cpu,
15772 struct gnet_stats_basic_packed *b);
15773 -void __gnet_stats_copy_basic(const seqcount_t *running,
15774 +void __gnet_stats_copy_basic(net_seqlock_t *running,
15775 struct gnet_stats_basic_packed *bstats,
15776 struct gnet_stats_basic_cpu __percpu *cpu,
15777 struct gnet_stats_basic_packed *b);
15778 @@ -57,13 +58,13 @@
15779 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
15780 struct net_rate_estimator __rcu **rate_est,
15781 spinlock_t *stats_lock,
15782 - seqcount_t *running, struct nlattr *opt);
15783 + net_seqlock_t *running, struct nlattr *opt);
15784 void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
15785 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
15786 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
15787 struct net_rate_estimator __rcu **ptr,
15788 spinlock_t *stats_lock,
15789 - seqcount_t *running, struct nlattr *opt);
15790 + net_seqlock_t *running, struct nlattr *opt);
15791 bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
15792 bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
15793 struct gnet_stats_rate_est64 *sample);
15794 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/neighbour.h linux-4.14/include/net/neighbour.h
15795 --- linux-4.14.orig/include/net/neighbour.h 2017-11-12 19:46:13.000000000 +0100
15796 +++ linux-4.14/include/net/neighbour.h 2018-09-05 11:05:07.000000000 +0200
15797 @@ -450,7 +450,7 @@
15801 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
15802 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
15805 unsigned int hh_len;
15806 @@ -474,7 +474,7 @@
15808 static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
15810 - const struct hh_cache *hh = &n->hh;
15811 + struct hh_cache *hh = &n->hh;
15813 if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
15814 return neigh_hh_output(hh, skb);
15815 @@ -515,7 +515,7 @@
15817 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
15819 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
15820 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
15821 const struct net_device *dev)
15824 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/net_seq_lock.h linux-4.14/include/net/net_seq_lock.h
15825 --- linux-4.14.orig/include/net/net_seq_lock.h 1970-01-01 01:00:00.000000000 +0100
15826 +++ linux-4.14/include/net/net_seq_lock.h 2018-09-05 11:05:07.000000000 +0200
15828 +#ifndef __NET_NET_SEQ_LOCK_H__
15829 +#define __NET_NET_SEQ_LOCK_H__
15831 +#ifdef CONFIG_PREEMPT_RT_BASE
15832 +# define net_seqlock_t seqlock_t
15833 +# define net_seq_begin(__r) read_seqbegin(__r)
15834 +# define net_seq_retry(__r, __s) read_seqretry(__r, __s)
15837 +# define net_seqlock_t seqcount_t
15838 +# define net_seq_begin(__r) read_seqcount_begin(__r)
15839 +# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s)
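Code written against these wrappers works with either representation: on PREEMPT_RT_BASE the qdisc "running" state is a seqlock_t, otherwise a plain seqcount_t. A hedged reader sketch (function name is hypothetical):

static u64 example_read_bytes(net_seqlock_t *running,
                              const struct gnet_stats_basic_packed *b)
{
        unsigned int seq;
        u64 bytes;

        do {
                seq = net_seq_begin(running);
                bytes = b->bytes;
        } while (net_seq_retry(running, seq));

        return bytes;
}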
15843 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/sch_generic.h linux-4.14/include/net/sch_generic.h
15844 --- linux-4.14.orig/include/net/sch_generic.h 2018-09-05 11:03:22.000000000 +0200
15845 +++ linux-4.14/include/net/sch_generic.h 2018-09-05 11:05:07.000000000 +0200
15847 #include <linux/percpu.h>
15848 #include <linux/dynamic_queue_limits.h>
15849 #include <linux/list.h>
15850 +#include <net/net_seq_lock.h>
15851 #include <linux/refcount.h>
15852 #include <linux/workqueue.h>
15853 #include <net/gen_stats.h>
15855 struct sk_buff *gso_skb ____cacheline_aligned_in_smp;
15856 struct qdisc_skb_head q;
15857 struct gnet_stats_basic_packed bstats;
15858 - seqcount_t running;
15859 + net_seqlock_t running;
15860 struct gnet_stats_queue qstats;
15861 unsigned long state;
15862 struct Qdisc *next_sched;
15863 @@ -109,13 +110,22 @@
15864 refcount_inc(&qdisc->refcnt);
15867 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
15868 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
15870 +#ifdef CONFIG_PREEMPT_RT_BASE
15871 + return spin_is_locked(&qdisc->running.lock) ? true : false;
15873 return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
15877 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
15879 +#ifdef CONFIG_PREEMPT_RT_BASE
15880 + if (try_write_seqlock(&qdisc->running))
15884 if (qdisc_is_running(qdisc))
15886 /* Variant of write_seqcount_begin() telling lockdep a trylock
15887 @@ -124,11 +134,16 @@
15888 raw_write_seqcount_begin(&qdisc->running);
15889 seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
15894 static inline void qdisc_run_end(struct Qdisc *qdisc)
15896 +#ifdef CONFIG_PREEMPT_RT_BASE
15897 + write_sequnlock(&qdisc->running);
15899 write_seqcount_end(&qdisc->running);
15903 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
15904 @@ -337,7 +352,7 @@
15905 return qdisc_lock(root);
15908 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
15909 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
15911 struct Qdisc *root = qdisc_root_sleeping(qdisc);
15913 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/xfrm.h linux-4.14/include/net/xfrm.h
15914 --- linux-4.14.orig/include/net/xfrm.h 2018-09-05 11:03:22.000000000 +0200
15915 +++ linux-4.14/include/net/xfrm.h 2018-09-05 11:05:07.000000000 +0200
15916 @@ -217,7 +217,7 @@
15917 struct xfrm_stats stats;
15919 struct xfrm_lifetime_cur curlft;
15920 - struct tasklet_hrtimer mtimer;
15921 + struct hrtimer mtimer;
15923 struct xfrm_state_offload xso;
15925 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/trace/events/timer.h linux-4.14/include/trace/events/timer.h
15926 --- linux-4.14.orig/include/trace/events/timer.h 2018-09-05 11:03:22.000000000 +0200
15927 +++ linux-4.14/include/trace/events/timer.h 2018-09-05 11:05:07.000000000 +0200
15928 @@ -148,7 +148,11 @@
15929 { HRTIMER_MODE_ABS, "ABS" }, \
15930 { HRTIMER_MODE_REL, "REL" }, \
15931 { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \
15932 - { HRTIMER_MODE_REL_PINNED, "REL|PINNED" })
15933 + { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \
15934 + { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \
15935 + { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \
15936 + { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \
15937 + { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" })
15940 * hrtimer_init - called when the hrtimer is initialized
15941 @@ -186,15 +190,16 @@
15943 TRACE_EVENT(hrtimer_start,
15945 - TP_PROTO(struct hrtimer *hrtimer),
15946 + TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),
15948 - TP_ARGS(hrtimer),
15949 + TP_ARGS(hrtimer, mode),
15952 __field( void *, hrtimer )
15953 __field( void *, function )
15954 __field( s64, expires )
15955 __field( s64, softexpires )
15956 + __field( enum hrtimer_mode, mode )
15960 @@ -202,12 +207,14 @@
15961 __entry->function = hrtimer->function;
15962 __entry->expires = hrtimer_get_expires(hrtimer);
15963 __entry->softexpires = hrtimer_get_softexpires(hrtimer);
15964 + __entry->mode = mode;
15967 - TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu",
15968 - __entry->hrtimer, __entry->function,
15969 + TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu "
15970 + "mode=%s", __entry->hrtimer, __entry->function,
15971 (unsigned long long) __entry->expires,
15972 - (unsigned long long) __entry->softexpires)
15973 + (unsigned long long) __entry->softexpires,
15974 + decode_hrtimer_mode(__entry->mode))
15978 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/init/Kconfig linux-4.14/init/Kconfig
15979 --- linux-4.14.orig/init/Kconfig 2018-09-05 11:03:22.000000000 +0200
15980 +++ linux-4.14/init/Kconfig 2018-09-05 11:05:07.000000000 +0200
15981 @@ -744,6 +744,7 @@
15982 config RT_GROUP_SCHED
15983 bool "Group scheduling for SCHED_RR/FIFO"
15984 depends on CGROUP_SCHED
15985 + depends on !PREEMPT_RT_FULL
15988 This feature lets you explicitly allocate real CPU bandwidth
15989 @@ -1533,6 +1534,7 @@
15993 + depends on !PREEMPT_RT_FULL
15994 select HAVE_HARDENED_USERCOPY_ALLOCATOR
15996 The regular slab allocator that is established and known to work
15997 @@ -1553,6 +1555,7 @@
16000 bool "SLOB (Simple Allocator)"
16001 + depends on !PREEMPT_RT_FULL
16003 SLOB replaces the stock allocator with a drastically simpler
16004 allocator. SLOB is generally more space efficient but
16005 @@ -1594,7 +1597,7 @@
16007 config SLUB_CPU_PARTIAL
16009 - depends on SLUB && SMP
16010 + depends on SLUB && SMP && !PREEMPT_RT_FULL
16011 bool "SLUB per cpu partial cache"
16013 Per cpu partial caches accelerate object allocation and freeing
16014 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/init/main.c linux-4.14/init/main.c
16015 --- linux-4.14.orig/init/main.c 2018-09-05 11:03:22.000000000 +0200
16016 +++ linux-4.14/init/main.c 2018-09-05 11:05:07.000000000 +0200
16017 @@ -543,6 +543,7 @@
16018 setup_command_line(command_line);
16019 setup_nr_cpu_ids();
16020 setup_per_cpu_areas();
16021 + softirq_early_init();
16022 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16023 boot_cpu_hotplug_init();
16025 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/init/Makefile linux-4.14/init/Makefile
16026 --- linux-4.14.orig/init/Makefile 2017-11-12 19:46:13.000000000 +0100
16027 +++ linux-4.14/init/Makefile 2018-09-05 11:05:07.000000000 +0200
16029 include/generated/compile.h: FORCE
16030 @$($(quiet)chk_compile.h)
16031 $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16032 - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16033 + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16034 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/cgroup/cgroup.c linux-4.14/kernel/cgroup/cgroup.c
16035 --- linux-4.14.orig/kernel/cgroup/cgroup.c 2018-09-05 11:03:22.000000000 +0200
16036 +++ linux-4.14/kernel/cgroup/cgroup.c 2018-09-05 11:05:07.000000000 +0200
16037 @@ -4508,10 +4508,10 @@
16038 queue_work(cgroup_destroy_wq, &css->destroy_work);
16041 -static void css_release_work_fn(struct work_struct *work)
16042 +static void css_release_work_fn(struct swork_event *sev)
16044 struct cgroup_subsys_state *css =
16045 - container_of(work, struct cgroup_subsys_state, destroy_work);
16046 + container_of(sev, struct cgroup_subsys_state, destroy_swork);
16047 struct cgroup_subsys *ss = css->ss;
16048 struct cgroup *cgrp = css->cgroup;
16050 @@ -4562,8 +4562,8 @@
16051 struct cgroup_subsys_state *css =
16052 container_of(ref, struct cgroup_subsys_state, refcnt);
16054 - INIT_WORK(&css->destroy_work, css_release_work_fn);
16055 - queue_work(cgroup_destroy_wq, &css->destroy_work);
16056 + INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16057 + swork_queue(&css->destroy_swork);
16060 static void init_and_link_css(struct cgroup_subsys_state *css,
16061 @@ -5269,6 +5269,7 @@
16063 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16064 BUG_ON(!cgroup_destroy_wq);
16065 + BUG_ON(swork_get());
16068 core_initcall(cgroup_wq_init);
16069 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/cgroup/cpuset.c linux-4.14/kernel/cgroup/cpuset.c
16070 --- linux-4.14.orig/kernel/cgroup/cpuset.c 2017-11-12 19:46:13.000000000 +0100
16071 +++ linux-4.14/kernel/cgroup/cpuset.c 2018-09-05 11:05:07.000000000 +0200
16072 @@ -288,7 +288,7 @@
16075 static DEFINE_MUTEX(cpuset_mutex);
16076 -static DEFINE_SPINLOCK(callback_lock);
16077 +static DEFINE_RAW_SPINLOCK(callback_lock);
16079 static struct workqueue_struct *cpuset_migrate_mm_wq;
16081 @@ -926,9 +926,9 @@
16085 - spin_lock_irq(&callback_lock);
16086 + raw_spin_lock_irq(&callback_lock);
16087 cpumask_copy(cp->effective_cpus, new_cpus);
16088 - spin_unlock_irq(&callback_lock);
16089 + raw_spin_unlock_irq(&callback_lock);
16091 WARN_ON(!is_in_v2_mode() &&
16092 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
16093 @@ -993,9 +993,9 @@
16097 - spin_lock_irq(&callback_lock);
16098 + raw_spin_lock_irq(&callback_lock);
16099 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
16100 - spin_unlock_irq(&callback_lock);
16101 + raw_spin_unlock_irq(&callback_lock);
16103 /* use trialcs->cpus_allowed as a temp variable */
16104 update_cpumasks_hier(cs, trialcs->cpus_allowed);
16105 @@ -1179,9 +1179,9 @@
16109 - spin_lock_irq(&callback_lock);
16110 + raw_spin_lock_irq(&callback_lock);
16111 cp->effective_mems = *new_mems;
16112 - spin_unlock_irq(&callback_lock);
16113 + raw_spin_unlock_irq(&callback_lock);
16115 WARN_ON(!is_in_v2_mode() &&
16116 !nodes_equal(cp->mems_allowed, cp->effective_mems));
16117 @@ -1249,9 +1249,9 @@
16121 - spin_lock_irq(&callback_lock);
16122 + raw_spin_lock_irq(&callback_lock);
16123 cs->mems_allowed = trialcs->mems_allowed;
16124 - spin_unlock_irq(&callback_lock);
16125 + raw_spin_unlock_irq(&callback_lock);
16127 /* use trialcs->mems_allowed as a temp variable */
16128 update_nodemasks_hier(cs, &trialcs->mems_allowed);
16129 @@ -1342,9 +1342,9 @@
16130 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
16131 || (is_spread_page(cs) != is_spread_page(trialcs)));
16133 - spin_lock_irq(&callback_lock);
16134 + raw_spin_lock_irq(&callback_lock);
16135 cs->flags = trialcs->flags;
16136 - spin_unlock_irq(&callback_lock);
16137 + raw_spin_unlock_irq(&callback_lock);
16139 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
16140 rebuild_sched_domains_locked();
16141 @@ -1759,7 +1759,7 @@
16142 cpuset_filetype_t type = seq_cft(sf)->private;
16145 - spin_lock_irq(&callback_lock);
16146 + raw_spin_lock_irq(&callback_lock);
16150 @@ -1778,7 +1778,7 @@
16154 - spin_unlock_irq(&callback_lock);
16155 + raw_spin_unlock_irq(&callback_lock);
16159 @@ -1993,12 +1993,12 @@
16163 - spin_lock_irq(&callback_lock);
16164 + raw_spin_lock_irq(&callback_lock);
16165 if (is_in_v2_mode()) {
16166 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
16167 cs->effective_mems = parent->effective_mems;
16169 - spin_unlock_irq(&callback_lock);
16170 + raw_spin_unlock_irq(&callback_lock);
16172 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
16174 @@ -2025,12 +2025,12 @@
16178 - spin_lock_irq(&callback_lock);
16179 + raw_spin_lock_irq(&callback_lock);
16180 cs->mems_allowed = parent->mems_allowed;
16181 cs->effective_mems = parent->mems_allowed;
16182 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
16183 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
16184 - spin_unlock_irq(&callback_lock);
16185 + raw_spin_unlock_irq(&callback_lock);
16187 mutex_unlock(&cpuset_mutex);
16189 @@ -2069,7 +2069,7 @@
16190 static void cpuset_bind(struct cgroup_subsys_state *root_css)
16192 mutex_lock(&cpuset_mutex);
16193 - spin_lock_irq(&callback_lock);
16194 + raw_spin_lock_irq(&callback_lock);
16196 if (is_in_v2_mode()) {
16197 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
16198 @@ -2080,7 +2080,7 @@
16199 top_cpuset.mems_allowed = top_cpuset.effective_mems;
16202 - spin_unlock_irq(&callback_lock);
16203 + raw_spin_unlock_irq(&callback_lock);
16204 mutex_unlock(&cpuset_mutex);
16207 @@ -2094,7 +2094,7 @@
16208 if (task_css_is_root(task, cpuset_cgrp_id))
16211 - set_cpus_allowed_ptr(task, &current->cpus_allowed);
16212 + set_cpus_allowed_ptr(task, current->cpus_ptr);
16213 task->mems_allowed = current->mems_allowed;
16216 @@ -2178,12 +2178,12 @@
16220 - spin_lock_irq(&callback_lock);
16221 + raw_spin_lock_irq(&callback_lock);
16222 cpumask_copy(cs->cpus_allowed, new_cpus);
16223 cpumask_copy(cs->effective_cpus, new_cpus);
16224 cs->mems_allowed = *new_mems;
16225 cs->effective_mems = *new_mems;
16226 - spin_unlock_irq(&callback_lock);
16227 + raw_spin_unlock_irq(&callback_lock);
16230 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
16231 @@ -2220,10 +2220,10 @@
16232 if (nodes_empty(*new_mems))
16233 *new_mems = parent_cs(cs)->effective_mems;
16235 - spin_lock_irq(&callback_lock);
16236 + raw_spin_lock_irq(&callback_lock);
16237 cpumask_copy(cs->effective_cpus, new_cpus);
16238 cs->effective_mems = *new_mems;
16239 - spin_unlock_irq(&callback_lock);
16240 + raw_spin_unlock_irq(&callback_lock);
16243 update_tasks_cpumask(cs);
16244 @@ -2316,21 +2316,21 @@
16246 /* synchronize cpus_allowed to cpu_active_mask */
16247 if (cpus_updated) {
16248 - spin_lock_irq(&callback_lock);
16249 + raw_spin_lock_irq(&callback_lock);
16251 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
16252 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
16253 - spin_unlock_irq(&callback_lock);
16254 + raw_spin_unlock_irq(&callback_lock);
16255 /* we don't mess with cpumasks of tasks in top_cpuset */
16258 /* synchronize mems_allowed to N_MEMORY */
16259 if (mems_updated) {
16260 - spin_lock_irq(&callback_lock);
16261 + raw_spin_lock_irq(&callback_lock);
16263 top_cpuset.mems_allowed = new_mems;
16264 top_cpuset.effective_mems = new_mems;
16265 - spin_unlock_irq(&callback_lock);
16266 + raw_spin_unlock_irq(&callback_lock);
16267 update_tasks_nodemask(&top_cpuset);
16270 @@ -2429,11 +2429,11 @@
16272 unsigned long flags;
16274 - spin_lock_irqsave(&callback_lock, flags);
16275 + raw_spin_lock_irqsave(&callback_lock, flags);
16277 guarantee_online_cpus(task_cs(tsk), pmask);
16279 - spin_unlock_irqrestore(&callback_lock, flags);
16280 + raw_spin_unlock_irqrestore(&callback_lock, flags);
16283 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
16284 @@ -2481,11 +2481,11 @@
16286 unsigned long flags;
16288 - spin_lock_irqsave(&callback_lock, flags);
16289 + raw_spin_lock_irqsave(&callback_lock, flags);
16291 guarantee_online_mems(task_cs(tsk), &mask);
16293 - spin_unlock_irqrestore(&callback_lock, flags);
16294 + raw_spin_unlock_irqrestore(&callback_lock, flags);
16298 @@ -2577,14 +2577,14 @@
16301 /* Not hardwall and node outside mems_allowed: scan up cpusets */
16302 - spin_lock_irqsave(&callback_lock, flags);
16303 + raw_spin_lock_irqsave(&callback_lock, flags);
16306 cs = nearest_hardwall_ancestor(task_cs(current));
16307 allowed = node_isset(node, cs->mems_allowed);
16310 - spin_unlock_irqrestore(&callback_lock, flags);
16311 + raw_spin_unlock_irqrestore(&callback_lock, flags);
16315 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/cpu.c linux-4.14/kernel/cpu.c
16316 --- linux-4.14.orig/kernel/cpu.c 2018-09-05 11:03:22.000000000 +0200
16317 +++ linux-4.14/kernel/cpu.c 2018-09-05 11:05:07.000000000 +0200
16319 .fail = CPUHP_INVALID,
16322 +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL)
16323 +static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \
16324 + __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock);
16327 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
16328 static struct lockdep_map cpuhp_state_up_map =
16329 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
16330 @@ -287,6 +292,55 @@
16332 #ifdef CONFIG_HOTPLUG_CPU
16335 + * pin_current_cpu - Prevent the current cpu from being unplugged
16337 +void pin_current_cpu(void)
16339 +#ifdef CONFIG_PREEMPT_RT_FULL
16340 + struct rt_rw_lock *cpuhp_pin;
16341 + unsigned int cpu;
16345 + cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
16346 + ret = __read_rt_trylock(cpuhp_pin);
16348 + current->pinned_on_cpu = smp_processor_id();
16351 + cpu = smp_processor_id();
16352 + preempt_lazy_enable();
16353 + preempt_enable();
16355 + __read_rt_lock(cpuhp_pin);
16357 + preempt_disable();
16358 + preempt_lazy_disable();
16359 + if (cpu != smp_processor_id()) {
16360 + __read_rt_unlock(cpuhp_pin);
16363 + current->pinned_on_cpu = cpu;
16368 + * unpin_current_cpu - Allow unplug of current cpu
16370 +void unpin_current_cpu(void)
16372 +#ifdef CONFIG_PREEMPT_RT_FULL
16373 + struct rt_rw_lock *cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
16375 + if (WARN_ON(current->pinned_on_cpu != smp_processor_id()))
16376 + cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, current->pinned_on_cpu);
16378 + current->pinned_on_cpu = -1;
16379 + __read_rt_unlock(cpuhp_pin);
16383 DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
16385 void cpus_read_lock(void)
16386 @@ -843,6 +897,9 @@
16388 static int takedown_cpu(unsigned int cpu)
16390 +#ifdef CONFIG_PREEMPT_RT_FULL
16391 + struct rt_rw_lock *cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, cpu);
16393 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
16396 @@ -855,11 +912,18 @@
16400 +#ifdef CONFIG_PREEMPT_RT_FULL
16401 + __write_rt_lock(cpuhp_pin);
16405 * So now all preempt/rcu users must observe !cpu_active().
16407 err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
16409 +#ifdef CONFIG_PREEMPT_RT_FULL
16410 + __write_rt_unlock(cpuhp_pin);
16412 /* CPU refused to die */
16413 irq_unlock_sparse();
16414 /* Unpark the hotplug thread so we can rollback there */
16415 @@ -878,6 +942,9 @@
16416 wait_for_ap_thread(st, false);
16417 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
16419 +#ifdef CONFIG_PREEMPT_RT_FULL
16420 + __write_rt_unlock(cpuhp_pin);
16422 /* Interrupts are moved away from the dying cpu, reenable alloc/free */
16423 irq_unlock_sparse();
16425 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/debug/kdb/kdb_io.c linux-4.14/kernel/debug/kdb/kdb_io.c
16426 --- linux-4.14.orig/kernel/debug/kdb/kdb_io.c 2018-09-05 11:03:22.000000000 +0200
16427 +++ linux-4.14/kernel/debug/kdb/kdb_io.c 2018-09-05 11:05:07.000000000 +0200
16428 @@ -854,9 +854,11 @@
16432 + kdb_trap_printk++;
16434 r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
16436 + kdb_trap_printk--;
16440 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/events/core.c linux-4.14/kernel/events/core.c
16441 --- linux-4.14.orig/kernel/events/core.c 2018-09-05 11:03:22.000000000 +0200
16442 +++ linux-4.14/kernel/events/core.c 2018-09-05 11:05:07.000000000 +0200
16443 @@ -1065,7 +1065,7 @@
16444 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
16446 raw_spin_lock_init(&cpuctx->hrtimer_lock);
16447 - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
16448 + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
16449 timer->function = perf_mux_hrtimer_handler;
16452 @@ -8750,7 +8750,7 @@
16453 if (!is_sampling_event(event))
16456 - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
16457 + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
16458 hwc->hrtimer.function = perf_swevent_hrtimer;
16461 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/exit.c linux-4.14/kernel/exit.c
16462 --- linux-4.14.orig/kernel/exit.c 2018-09-05 11:03:22.000000000 +0200
16463 +++ linux-4.14/kernel/exit.c 2018-09-05 11:05:07.000000000 +0200
16464 @@ -159,7 +159,7 @@
16465 * Do this under ->siglock, we can race with another thread
16466 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
16468 - flush_sigqueue(&tsk->pending);
16469 + flush_task_sigqueue(tsk);
16470 tsk->sighand = NULL;
16471 spin_unlock(&sighand->siglock);
16473 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/fork.c linux-4.14/kernel/fork.c
16474 --- linux-4.14.orig/kernel/fork.c 2018-09-05 11:03:28.000000000 +0200
16475 +++ linux-4.14/kernel/fork.c 2018-09-05 11:05:07.000000000 +0200
16477 #include <linux/hmm.h>
16478 #include <linux/fs.h>
16479 #include <linux/mm.h>
16480 +#include <linux/kprobes.h>
16481 #include <linux/vmacache.h>
16482 #include <linux/nsproxy.h>
16483 #include <linux/capability.h>
16484 @@ -407,13 +408,24 @@
16485 if (atomic_dec_and_test(&sig->sigcnt))
16486 free_signal_struct(sig);
16489 +#ifdef CONFIG_PREEMPT_RT_BASE
16492 void __put_task_struct(struct task_struct *tsk)
16494 WARN_ON(!tsk->exit_state);
16495 WARN_ON(atomic_read(&tsk->usage));
16496 WARN_ON(tsk == current);
16499 + * Remove function-return probe instances associated with this
16500 + * task and put them back on the free list.
16502 + kprobe_flush_task(tsk);
16504 + /* Task is done with its stack. */
16505 + put_task_stack(tsk);
16508 task_numa_free(tsk);
16509 security_task_free(tsk);
16510 @@ -424,7 +436,18 @@
16511 if (!profile_handoff_task(tsk))
16514 +#ifndef CONFIG_PREEMPT_RT_BASE
16515 EXPORT_SYMBOL_GPL(__put_task_struct);
16517 +void __put_task_struct_cb(struct rcu_head *rhp)
16519 + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
16521 + __put_task_struct(tsk);
16524 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
16527 void __init __weak arch_task_cache_init(void) { }
16529 @@ -563,7 +586,8 @@
16530 #ifdef CONFIG_CC_STACKPROTECTOR
16531 tsk->stack_canary = get_random_canary();
16534 + if (orig->cpus_ptr == &orig->cpus_mask)
16535 + tsk->cpus_ptr = &tsk->cpus_mask;
16537 * One for us, one for whoever does the "release_task()" (usually
16539 @@ -575,6 +599,7 @@
16540 tsk->splice_pipe = NULL;
16541 tsk->task_frag.page = NULL;
16542 tsk->wake_q.next = NULL;
16543 + tsk->wake_q_sleeper.next = NULL;
16545 account_kernel_stack(tsk, 1);
16547 @@ -915,6 +940,19 @@
16549 EXPORT_SYMBOL_GPL(__mmdrop);
16551 +#ifdef CONFIG_PREEMPT_RT_BASE
16553 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
16554 + * want another facility to make this work.
16556 +void __mmdrop_delayed(struct rcu_head *rhp)
16558 + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
16564 static inline void __mmput(struct mm_struct *mm)
16566 VM_BUG_ON(atomic_read(&mm->mm_users));
16567 @@ -1494,6 +1532,9 @@
16569 static void posix_cpu_timers_init(struct task_struct *tsk)
16571 +#ifdef CONFIG_PREEMPT_RT_BASE
16572 + tsk->posix_timer_list = NULL;
16574 tsk->cputime_expires.prof_exp = 0;
16575 tsk->cputime_expires.virt_exp = 0;
16576 tsk->cputime_expires.sched_exp = 0;
16577 @@ -1646,6 +1687,7 @@
16578 spin_lock_init(&p->alloc_lock);
16580 init_sigpending(&p->pending);
16581 + p->sigqueue_cache = NULL;
16583 p->utime = p->stime = p->gtime = 0;
16584 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
16585 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/futex.c linux-4.14/kernel/futex.c
16586 --- linux-4.14.orig/kernel/futex.c 2018-09-05 11:03:22.000000000 +0200
16587 +++ linux-4.14/kernel/futex.c 2018-09-05 11:05:07.000000000 +0200
16588 @@ -936,7 +936,9 @@
16589 if (head->next != next) {
16590 /* retain curr->pi_lock for the loop invariant */
16591 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
16592 + raw_spin_unlock_irq(&curr->pi_lock);
16593 spin_unlock(&hb->lock);
16594 + raw_spin_lock_irq(&curr->pi_lock);
16595 put_pi_state(pi_state);
16598 @@ -1430,6 +1432,7 @@
16599 struct task_struct *new_owner;
16600 bool postunlock = false;
16601 DEFINE_WAKE_Q(wake_q);
16602 + DEFINE_WAKE_Q(wake_sleeper_q);
16605 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
16606 @@ -1491,13 +1494,13 @@
16607 pi_state->owner = new_owner;
16608 raw_spin_unlock(&new_owner->pi_lock);
16610 - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
16612 + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
16613 + &wake_sleeper_q);
16615 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
16618 - rt_mutex_postunlock(&wake_q);
16619 + rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
16623 @@ -2104,6 +2107,16 @@
16624 requeue_pi_wake_futex(this, &key2, hb2);
16627 + } else if (ret == -EAGAIN) {
16629 + * Waiter was woken by timeout or
16630 + * signal and has set pi_blocked_on to
16631 + * PI_WAKEUP_INPROGRESS before we
16632 + * tried to enqueue it on the rtmutex.
16634 + this->pi_state = NULL;
16635 + put_pi_state(pi_state);
16639 * rt_mutex_start_proxy_lock() detected a
16640 @@ -2642,10 +2655,9 @@
16644 - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
16645 - CLOCK_REALTIME : CLOCK_MONOTONIC,
16646 - HRTIMER_MODE_ABS);
16647 - hrtimer_init_sleeper(to, current);
16648 + hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
16649 + CLOCK_REALTIME : CLOCK_MONOTONIC,
16650 + HRTIMER_MODE_ABS, current);
16651 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
16652 current->timer_slack_ns);
16654 @@ -2744,9 +2756,8 @@
16658 - hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
16659 - HRTIMER_MODE_ABS);
16660 - hrtimer_init_sleeper(to, current);
16661 + hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME,
16662 + HRTIMER_MODE_ABS, current);
16663 hrtimer_set_expires(&to->timer, *time);
16666 @@ -2801,7 +2812,7 @@
16670 - rt_mutex_init_waiter(&rt_waiter);
16671 + rt_mutex_init_waiter(&rt_waiter, false);
16674 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
16675 @@ -2816,9 +2827,18 @@
16676 * lock handoff sequence.
16678 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
16680 + * the migrate_disable() here disables migration in the in_atomic() fast
16681 + * path which is enabled again in the following spin_unlock(). We have
16682 + * one migrate_disable() pending in the slow-path which is reversed
16683 + * after the raw_spin_unlock_irq() where we leave the atomic context.
16685 + migrate_disable();
16687 spin_unlock(q.lock_ptr);
16688 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
16689 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
16690 + migrate_enable();
16694 @@ -2965,11 +2985,21 @@
16697 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
16699 + * Magic trickery for now to make the RT migrate disable
16700 + * logic happy. The following spin_unlock() happens with
16701 + * interrupts disabled so the internal migrate_enable()
16702 + * won't undo the migrate_disable() which was issued when
16703 + * locking hb->lock.
16705 + migrate_disable();
16706 spin_unlock(&hb->lock);
16708 /* drops pi_state->pi_mutex.wait_lock */
16709 ret = wake_futex_pi(uaddr, uval, pi_state);
16711 + migrate_enable();
16713 put_pi_state(pi_state);
16716 @@ -3127,7 +3157,7 @@
16717 struct hrtimer_sleeper timeout, *to = NULL;
16718 struct futex_pi_state *pi_state = NULL;
16719 struct rt_mutex_waiter rt_waiter;
16720 - struct futex_hash_bucket *hb;
16721 + struct futex_hash_bucket *hb, *hb2;
16722 union futex_key key2 = FUTEX_KEY_INIT;
16723 struct futex_q q = futex_q_init;
16725 @@ -3143,10 +3173,9 @@
16729 - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
16730 - CLOCK_REALTIME : CLOCK_MONOTONIC,
16731 - HRTIMER_MODE_ABS);
16732 - hrtimer_init_sleeper(to, current);
16733 + hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
16734 + CLOCK_REALTIME : CLOCK_MONOTONIC,
16735 + HRTIMER_MODE_ABS, current);
16736 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
16737 current->timer_slack_ns);
16739 @@ -3155,7 +3184,7 @@
16740 * The waiter is allocated on our stack, manipulated by the requeue
16741 * code while we sleep on uaddr.
16743 - rt_mutex_init_waiter(&rt_waiter);
16744 + rt_mutex_init_waiter(&rt_waiter, false);
16746 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
16747 if (unlikely(ret != 0))
16748 @@ -3186,20 +3215,55 @@
16749 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
16750 futex_wait_queue_me(hb, &q, to);
16752 - spin_lock(&hb->lock);
16753 - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
16754 - spin_unlock(&hb->lock);
16756 - goto out_put_keys;
16758 + * On RT we must avoid races with requeue and trying to block
16759 + * on two mutexes (hb->lock and uaddr2's rtmutex) by
16760 + * serializing access to pi_blocked_on with pi_lock.
16762 + raw_spin_lock_irq(&current->pi_lock);
16763 + if (current->pi_blocked_on) {
16765 + * We have been requeued or are in the process of
16766 + * being requeued.
16768 + raw_spin_unlock_irq(&current->pi_lock);
16771 + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
16772 + * prevents a concurrent requeue from moving us to the
16773 + * uaddr2 rtmutex. After that we can safely acquire
16774 + * (and possibly block on) hb->lock.
16776 + current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
16777 + raw_spin_unlock_irq(&current->pi_lock);
16779 + spin_lock(&hb->lock);
16782 + * Clean up pi_blocked_on. We might leak it otherwise
16783 + * when we succeeded with the hb->lock in the fast path.
16786 + raw_spin_lock_irq(&current->pi_lock);
16787 + current->pi_blocked_on = NULL;
16788 + raw_spin_unlock_irq(&current->pi_lock);
16790 + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
16791 + spin_unlock(&hb->lock);
16793 + goto out_put_keys;
16797 - * In order for us to be here, we know our q.key == key2, and since
16798 - * we took the hb->lock above, we also know that futex_requeue() has
16799 - * completed and we no longer have to concern ourselves with a wakeup
16800 - * race with the atomic proxy lock acquisition by the requeue code. The
16801 - * futex_requeue dropped our key1 reference and incremented our key2
16802 - * reference count.
16803 + * In order to be here, we have either been requeued, are in
16804 + * the process of being requeued, or requeue successfully
16805 + * acquired uaddr2 on our behalf. If pi_blocked_on was
16806 + * non-null above, we may be racing with a requeue. Do not
16807 + * rely on q->lock_ptr to be hb2->lock until after blocking on
16808 + * hb->lock or hb2->lock. The futex_requeue dropped our key1
16809 + * reference and incremented our key2 reference count.
16811 + hb2 = hash_futex(&key2);
16813 /* Check if the requeue code acquired the second futex for us. */
16814 if (!q.rt_waiter) {
16815 @@ -3208,7 +3272,8 @@
16816 * did a lock-steal - fix up the PI-state in that case.
16818 if (q.pi_state && (q.pi_state->owner != current)) {
16819 - spin_lock(q.lock_ptr);
16820 + spin_lock(&hb2->lock);
16821 + BUG_ON(&hb2->lock != q.lock_ptr);
16822 ret = fixup_pi_state_owner(uaddr2, &q, current);
16823 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
16824 pi_state = q.pi_state;
16825 @@ -3219,7 +3284,7 @@
16826 * the requeue_pi() code acquired for us.
16828 put_pi_state(q.pi_state);
16829 - spin_unlock(q.lock_ptr);
16830 + spin_unlock(&hb2->lock);
16833 struct rt_mutex *pi_mutex;
16834 @@ -3233,7 +3298,8 @@
16835 pi_mutex = &q.pi_state->pi_mutex;
16836 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
16838 - spin_lock(q.lock_ptr);
16839 + spin_lock(&hb2->lock);
16840 + BUG_ON(&hb2->lock != q.lock_ptr);
16841 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
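The futex hunks above thread a second wake queue (wake_sleeper_q) through the unlock paths so that waiters blocked on RT sleeping locks are woken only after the hash-bucket and wait locks have been dropped. Below is a minimal sketch of the underlying deferred-wakeup pattern using only the mainline wake_q API; the *_sleeper variants are added elsewhere in this patch series, and struct foo / foo_waiter are illustrative assumptions, not kernel types:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

struct foo_waiter {			/* hypothetical waiter record */
	struct list_head node;
	struct task_struct *task;
};

struct foo {				/* hypothetical object with waiters */
	spinlock_t lock;
	struct list_head waiters;
};

/* Record wakeups under the lock, perform them only after dropping it. */
static void foo_wake_all(struct foo *f)
{
	DEFINE_WAKE_Q(wake_q);
	struct foo_waiter *w;

	spin_lock(&f->lock);
	list_for_each_entry(w, &f->waiters, node)
		wake_q_add(&wake_q, w->task);	/* only queues the task */
	spin_unlock(&f->lock);

	wake_up_q(&wake_q);			/* actual wakeups, lock free */
}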
16844 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/handle.c linux-4.14/kernel/irq/handle.c
16845 --- linux-4.14.orig/kernel/irq/handle.c 2017-11-12 19:46:13.000000000 +0100
16846 +++ linux-4.14/kernel/irq/handle.c 2018-09-05 11:05:07.000000000 +0200
16847 @@ -183,10 +183,16 @@
16849 irqreturn_t retval;
16850 unsigned int flags = 0;
16851 + struct pt_regs *regs = get_irq_regs();
16852 + u64 ip = regs ? instruction_pointer(regs) : 0;
16854 retval = __handle_irq_event_percpu(desc, &flags);
16856 - add_interrupt_randomness(desc->irq_data.irq, flags);
16857 +#ifdef CONFIG_PREEMPT_RT_FULL
16858 + desc->random_ip = ip;
16860 + add_interrupt_randomness(desc->irq_data.irq, flags, ip);
16864 note_interrupt(desc, retval);
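The hunk above captures the interrupted instruction pointer while still in hard-irq context and, on PREEMPT_RT_FULL, merely stashes it in desc->random_ip; the lock-taking add_interrupt_randomness() call is deferred to the threaded handler (see the kernel/irq/manage.c hunk below). A rough sketch of the same stash-now/process-later idea as a driver might use it; struct demo_irq_state and the handler names are assumptions, not kernel code:

#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/workqueue.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>

struct demo_irq_state {
	u64 pending_ip;			/* written from hard-irq context */
	struct work_struct work;	/* heavier, lock-taking processing */
};

static irqreturn_t demo_hardirq(int irq, void *data)
{
	struct demo_irq_state *s = data;
	struct pt_regs *regs = get_irq_regs();

	/* cheap capture now, expensive work deferred out of hard irq */
	s->pending_ip = regs ? instruction_pointer(regs) : 0;
	schedule_work(&s->work);
	return IRQ_HANDLED;
}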
16865 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/manage.c linux-4.14/kernel/irq/manage.c
16866 --- linux-4.14.orig/kernel/irq/manage.c 2018-09-05 11:03:22.000000000 +0200
16867 +++ linux-4.14/kernel/irq/manage.c 2018-09-05 11:05:07.000000000 +0200
16869 #include "internals.h"
16871 #ifdef CONFIG_IRQ_FORCED_THREADING
16872 +# ifndef CONFIG_PREEMPT_RT_BASE
16873 __read_mostly bool force_irqthreads;
16875 static int __init setup_forced_irqthreads(char *arg)
16879 early_param("threadirqs", setup_forced_irqthreads);
16883 static void __synchronize_hardirq(struct irq_desc *desc)
16884 @@ -224,7 +226,12 @@
16886 if (desc->affinity_notify) {
16887 kref_get(&desc->affinity_notify->kref);
16889 +#ifdef CONFIG_PREEMPT_RT_BASE
16890 + swork_queue(&desc->affinity_notify->swork);
16892 schedule_work(&desc->affinity_notify->work);
16895 irqd_set(data, IRQD_AFFINITY_SET);
16897 @@ -262,10 +269,8 @@
16899 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
16901 -static void irq_affinity_notify(struct work_struct *work)
16902 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
16904 - struct irq_affinity_notify *notify =
16905 - container_of(work, struct irq_affinity_notify, work);
16906 struct irq_desc *desc = irq_to_desc(notify->irq);
16907 cpumask_var_t cpumask;
16908 unsigned long flags;
16909 @@ -287,6 +292,35 @@
16910 kref_put(&notify->kref, notify->release);
16913 +#ifdef CONFIG_PREEMPT_RT_BASE
16914 +static void init_helper_thread(void)
16916 + static int init_sworker_once;
16918 + if (init_sworker_once)
16920 + if (WARN_ON(swork_get()))
16922 + init_sworker_once = 1;
16925 +static void irq_affinity_notify(struct swork_event *swork)
16927 + struct irq_affinity_notify *notify =
16928 + container_of(swork, struct irq_affinity_notify, swork);
16929 + _irq_affinity_notify(notify);
16934 +static void irq_affinity_notify(struct work_struct *work)
16936 + struct irq_affinity_notify *notify =
16937 + container_of(work, struct irq_affinity_notify, work);
16938 + _irq_affinity_notify(notify);
16943 * irq_set_affinity_notifier - control notification of IRQ affinity changes
16944 * @irq: Interrupt for which to enable/disable notification
16945 @@ -315,7 +349,12 @@
16948 kref_init(&notify->kref);
16949 +#ifdef CONFIG_PREEMPT_RT_BASE
16950 + INIT_SWORK(&notify->swork, irq_affinity_notify);
16951 + init_helper_thread();
16953 INIT_WORK(&notify->work, irq_affinity_notify);
16957 raw_spin_lock_irqsave(&desc->lock, flags);
16958 @@ -883,7 +922,15 @@
16959 local_bh_disable();
16960 ret = action->thread_fn(action->irq, action->dev_id);
16961 irq_finalize_oneshot(desc, action);
16962 - local_bh_enable();
16964 + * Interrupts which have real time requirements can be set up
16965 + * to avoid softirq processing in the thread handler. This is
16966 + * safe as these interrupts do not raise soft interrupts.
16968 + if (irq_settings_no_softirq_call(desc))
16969 + _local_bh_enable();
16971 + local_bh_enable();
16975 @@ -980,6 +1027,12 @@
16976 if (action_ret == IRQ_WAKE_THREAD)
16977 irq_wake_secondary(desc, action);
16979 +#ifdef CONFIG_PREEMPT_RT_FULL
16980 + migrate_disable();
16981 + add_interrupt_randomness(action->irq, 0,
16982 + desc->random_ip ^ (unsigned long) action);
16983 + migrate_enable();
16985 wake_threads_waitq(desc);
16988 @@ -1378,6 +1431,9 @@
16989 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
16992 + if (new->flags & IRQF_NO_SOFTIRQ_CALL)
16993 + irq_settings_set_no_softirq_call(desc);
16995 if (irq_settings_can_autoenable(desc)) {
16996 irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
16998 @@ -2159,7 +2215,7 @@
16999 * This call sets the internal irqchip state of an interrupt,
17000 * depending on the value of @which.
17002 - * This function should be called with preemption disabled if the
17003 + * This function should be called with migration disabled if the
17004 * interrupt controller has per-cpu registers.
17006 int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
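The IRQF_NO_SOFTIRQ_CALL flag consumed above is introduced by this patch series; a driver with hard real-time constraints would request its threaded handler with it so that pending softirqs are not processed when the handler re-enables bottom halves. A hedged sketch, with the irq number, names and dev_id purely illustrative:

#include <linux/interrupt.h>

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/*
	 * Runs in the irq thread; with IRQF_NO_SOFTIRQ_CALL the handler
	 * invocation ends with _local_bh_enable(), so pending softirqs
	 * are not executed in this thread's context.
	 */
	return IRQ_HANDLED;
}

static int demo_setup(int irq, void *dev_id)
{
	return request_threaded_irq(irq, NULL, demo_thread_fn,
				    IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
				    "demo", dev_id);
}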
17007 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/settings.h linux-4.14/kernel/irq/settings.h
17008 --- linux-4.14.orig/kernel/irq/settings.h 2017-11-12 19:46:13.000000000 +0100
17009 +++ linux-4.14/kernel/irq/settings.h 2018-09-05 11:05:07.000000000 +0200
17011 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17012 _IRQ_IS_POLLED = IRQ_IS_POLLED,
17013 _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
17014 + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
17015 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17019 #define IRQ_PER_CPU_DEVID GOT_YOU_MORON
17020 #define IRQ_IS_POLLED GOT_YOU_MORON
17021 #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
17022 +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
17023 #undef IRQF_MODIFY_MASK
17024 #define IRQF_MODIFY_MASK GOT_YOU_MORON
17027 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
17030 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
17032 + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
17035 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
17037 + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
17040 static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
17042 return desc->status_use_accessors & _IRQ_PER_CPU;
17043 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/spurious.c linux-4.14/kernel/irq/spurious.c
17044 --- linux-4.14.orig/kernel/irq/spurious.c 2017-11-12 19:46:13.000000000 +0100
17045 +++ linux-4.14/kernel/irq/spurious.c 2018-09-05 11:05:07.000000000 +0200
17046 @@ -445,6 +445,10 @@
17048 static int __init irqfixup_setup(char *str)
17050 +#ifdef CONFIG_PREEMPT_RT_BASE
17051 + pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17055 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
17056 printk(KERN_WARNING "This may impact system performance.\n");
17057 @@ -457,6 +461,10 @@
17059 static int __init irqpoll_setup(char *str)
17061 +#ifdef CONFIG_PREEMPT_RT_BASE
17062 + pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17066 printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
17068 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq_work.c linux-4.14/kernel/irq_work.c
17069 --- linux-4.14.orig/kernel/irq_work.c 2017-11-12 19:46:13.000000000 +0100
17070 +++ linux-4.14/kernel/irq_work.c 2018-09-05 11:05:07.000000000 +0200
17072 #include <linux/cpu.h>
17073 #include <linux/notifier.h>
17074 #include <linux/smp.h>
17075 +#include <linux/interrupt.h>
17076 #include <asm/processor.h>
17081 bool irq_work_queue_on(struct irq_work *work, int cpu)
17083 + struct llist_head *list;
17085 /* All work should have been flushed before going offline */
17086 WARN_ON_ONCE(cpu_is_offline(cpu));
17089 if (!irq_work_claim(work))
17092 - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
17093 + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
17094 + list = &per_cpu(lazy_list, cpu);
17096 + list = &per_cpu(raised_list, cpu);
17098 + if (llist_add(&work->llnode, list))
17099 arch_send_call_function_single_ipi(cpu);
17103 /* Enqueue the irq work @work on the current CPU */
17104 bool irq_work_queue(struct irq_work *work)
17106 + struct llist_head *list;
17107 + bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17109 /* Only queue if not already pending */
17110 if (!irq_work_claim(work))
17112 @@ -93,13 +104,15 @@
17113 /* Queue the entry and raise the IPI if needed. */
17116 - /* If the work is "lazy", handle it from next tick if any */
17117 - if (work->flags & IRQ_WORK_LAZY) {
17118 - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
17119 - tick_nohz_tick_stopped())
17120 - arch_irq_work_raise();
17122 - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
17123 + lazy_work = work->flags & IRQ_WORK_LAZY;
17125 + if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
17126 + list = this_cpu_ptr(&lazy_list);
17128 + list = this_cpu_ptr(&raised_list);
17130 + if (llist_add(&work->llnode, list)) {
17131 + if (!lazy_work || tick_nohz_tick_stopped())
17132 arch_irq_work_raise();
17135 @@ -116,9 +129,8 @@
17136 raised = this_cpu_ptr(&raised_list);
17137 lazy = this_cpu_ptr(&lazy_list);
17139 - if (llist_empty(raised) || arch_irq_work_has_interrupt())
17140 - if (llist_empty(lazy))
17142 + if (llist_empty(raised) && llist_empty(lazy))
17145 /* All work should have been flushed before going offline */
17146 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
17147 @@ -132,7 +144,7 @@
17148 struct irq_work *work;
17149 struct llist_node *llnode;
17151 - BUG_ON(!irqs_disabled());
17152 + BUG_ON_NONRT(!irqs_disabled());
17154 if (llist_empty(list))
17156 @@ -169,7 +181,16 @@
17157 void irq_work_run(void)
17159 irq_work_run_list(this_cpu_ptr(&raised_list));
17160 - irq_work_run_list(this_cpu_ptr(&lazy_list));
17161 + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
17163 + * NOTE: we raise softirq via IPI for safety,
17164 + * and execute in irq_work_tick() to move the
17165 + * overhead from hard to soft irq context.
17167 + if (!llist_empty(this_cpu_ptr(&lazy_list)))
17168 + raise_softirq(TIMER_SOFTIRQ);
17170 + irq_work_run_list(this_cpu_ptr(&lazy_list));
17172 EXPORT_SYMBOL_GPL(irq_work_run);
17174 @@ -179,8 +200,17 @@
17176 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
17177 irq_work_run_list(raised);
17179 + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
17180 + irq_work_run_list(this_cpu_ptr(&lazy_list));
17183 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
17184 +void irq_work_tick_soft(void)
17186 irq_work_run_list(this_cpu_ptr(&lazy_list));
17191 * Synchronize against the irq_work @entry, ensures the entry is not
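With the changes above, ordinary irq_work items are diverted to the per-CPU lazy_list on PREEMPT_RT_FULL and executed from the timer softirq via irq_work_tick_soft(); work that genuinely must run from the hard interrupt (IPI) path is marked IRQ_WORK_HARD_IRQ, a flag added elsewhere in this patch series. An illustrative sketch; the function and variable names are assumptions:

#include <linux/irq_work.h>

static void demo_hard_fn(struct irq_work *work)
{
	/* keep this short and lock free: it runs from the IPI handler */
}

static struct irq_work demo_work = {
	.func  = demo_hard_fn,
	.flags = IRQ_WORK_HARD_IRQ,	/* stay on raised_list even on RT */
};

static void demo_poke(void)
{
	irq_work_queue(&demo_work);
}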
17192 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/Kconfig.locks linux-4.14/kernel/Kconfig.locks
17193 --- linux-4.14.orig/kernel/Kconfig.locks 2017-11-12 19:46:13.000000000 +0100
17194 +++ linux-4.14/kernel/Kconfig.locks 2018-09-05 11:05:07.000000000 +0200
17195 @@ -225,11 +225,11 @@
17197 config MUTEX_SPIN_ON_OWNER
17199 - depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
17200 + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
17202 config RWSEM_SPIN_ON_OWNER
17204 - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
17205 + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
17207 config LOCK_SPIN_ON_OWNER
17209 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/Kconfig.preempt linux-4.14/kernel/Kconfig.preempt
17210 --- linux-4.14.orig/kernel/Kconfig.preempt 2017-11-12 19:46:13.000000000 +0100
17211 +++ linux-4.14/kernel/Kconfig.preempt 2018-09-05 11:05:07.000000000 +0200
17215 + select PREEMPT_COUNT
17217 +config PREEMPT_RT_BASE
17221 +config HAVE_PREEMPT_LAZY
17224 +config PREEMPT_LAZY
17225 + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
17228 prompt "Preemption Model"
17231 Select this if you are building a kernel for a desktop system.
17234 +config PREEMPT__LL
17235 bool "Preemptible Kernel (Low-Latency Desktop)"
17236 - select PREEMPT_COUNT
17238 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
17240 This option reduces the latency of the kernel by making
17242 embedded system with latency requirements in the milliseconds
17245 +config PREEMPT_RTB
17246 + bool "Preemptible Kernel (Basic RT)"
17247 + select PREEMPT_RT_BASE
17249 + This option is basically the same as (Low-Latency Desktop) but
17250 + enables changes which are preliminary for the full preemptible RT kernel.
17253 +config PREEMPT_RT_FULL
17254 + bool "Fully Preemptible Kernel (RT)"
17255 + depends on IRQ_FORCED_THREADING
17256 + select PREEMPT_RT_BASE
17257 + select PREEMPT_RCU
17259 + All and everything
17263 config PREEMPT_COUNT
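The new Kconfig symbols above (PREEMPT_RT_BASE, PREEMPT_RT_FULL, PREEMPT_LAZY) are consumed from C either through preprocessor conditionals or IS_ENABLED(), the same idiom this patch uses throughout. A small sketch of both forms; demo_needs_rt() is an assumed helper, not part of the patch:

#include <linux/kconfig.h>

static inline bool demo_needs_rt(void)
{
	/* compile-time constant; the branch is optimized away */
	return IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
}

#ifdef CONFIG_PREEMPT_RT_BASE
/* RT-specific implementation selected at build time */
#else
/* stock implementation */
#endif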
17264 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/ksysfs.c linux-4.14/kernel/ksysfs.c
17265 --- linux-4.14.orig/kernel/ksysfs.c 2017-11-12 19:46:13.000000000 +0100
17266 +++ linux-4.14/kernel/ksysfs.c 2018-09-05 11:05:07.000000000 +0200
17267 @@ -140,6 +140,15 @@
17269 #endif /* CONFIG_CRASH_CORE */
17271 +#if defined(CONFIG_PREEMPT_RT_FULL)
17272 +static ssize_t realtime_show(struct kobject *kobj,
17273 + struct kobj_attribute *attr, char *buf)
17275 + return sprintf(buf, "%d\n", 1);
17277 +KERNEL_ATTR_RO(realtime);
17280 /* whether file capabilities are enabled */
17281 static ssize_t fscaps_show(struct kobject *kobj,
17282 struct kobj_attribute *attr, char *buf)
17283 @@ -231,6 +240,9 @@
17284 &rcu_expedited_attr.attr,
17285 &rcu_normal_attr.attr,
17287 +#ifdef CONFIG_PREEMPT_RT_FULL
17288 + &realtime_attr.attr,
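The realtime attribute added above gives user space a simple way to detect a PREEMPT_RT_FULL kernel: /sys/kernel/realtime exists only on such kernels and reads "1". A small, runnable user-space check (the file name is taken from the hunk, everything else is illustrative):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/realtime", "r");
	int rt = 0;

	if (f) {
		if (fscanf(f, "%d", &rt) != 1)
			rt = 0;		/* unreadable: treat as non-RT */
		fclose(f);
	}
	printf("PREEMPT_RT_FULL kernel: %s\n", rt == 1 ? "yes" : "no");
	return 0;
}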
17293 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/lockdep.c linux-4.14/kernel/locking/lockdep.c
17294 --- linux-4.14.orig/kernel/locking/lockdep.c 2018-09-05 11:03:29.000000000 +0200
17295 +++ linux-4.14/kernel/locking/lockdep.c 2018-09-05 11:05:07.000000000 +0200
17296 @@ -3916,6 +3916,7 @@
17300 +#ifndef CONFIG_PREEMPT_RT_FULL
17302 * We dont accurately track softirq state in e.g.
17303 * hardirq contexts (such as on 4KSTACKS), so only
17304 @@ -3930,6 +3931,7 @@
17305 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
17311 print_irqtrace_events(current);
17312 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/locktorture.c linux-4.14/kernel/locking/locktorture.c
17313 --- linux-4.14.orig/kernel/locking/locktorture.c 2018-09-05 11:03:22.000000000 +0200
17314 +++ linux-4.14/kernel/locking/locktorture.c 2018-09-05 11:05:07.000000000 +0200
17316 #include <linux/kthread.h>
17317 #include <linux/sched/rt.h>
17318 #include <linux/spinlock.h>
17319 -#include <linux/rwlock.h>
17320 #include <linux/mutex.h>
17321 #include <linux/rwsem.h>
17322 #include <linux/smp.h>
17323 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/Makefile linux-4.14/kernel/locking/Makefile
17324 --- linux-4.14.orig/kernel/locking/Makefile 2017-11-12 19:46:13.000000000 +0100
17325 +++ linux-4.14/kernel/locking/Makefile 2018-09-05 11:05:07.000000000 +0200
17327 # and is generally not a function of system call inputs.
17328 KCOV_INSTRUMENT := n
17330 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
17331 +obj-y += semaphore.o percpu-rwsem.o
17333 ifdef CONFIG_FUNCTION_TRACER
17334 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
17336 CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
17339 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17341 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
17344 obj-$(CONFIG_LOCKDEP) += lockdep.o
17345 ifeq ($(CONFIG_PROC_FS),y)
17346 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
17348 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
17349 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
17350 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
17351 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17352 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
17353 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
17355 +obj-$(CONFIG_PREEMPT_RT_FULL) += mutex-rt.o rwsem-rt.o rwlock-rt.o
17356 obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
17357 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
17358 obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
17359 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/mutex-rt.c linux-4.14/kernel/locking/mutex-rt.c
17360 --- linux-4.14.orig/kernel/locking/mutex-rt.c 1970-01-01 01:00:00.000000000 +0100
17361 +++ linux-4.14/kernel/locking/mutex-rt.c 2018-09-05 11:05:07.000000000 +0200
17366 + * Real-Time Preemption Support
17368 + * started by Ingo Molnar:
17370 + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17371 + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17373 + * historic credit for proving that Linux spinlocks can be implemented via
17374 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
17375 + * and others) who prototyped it on 2.4 and did lots of comparative
17376 + * research and analysis; TimeSys, for proving that you can implement a
17377 + * fully preemptible kernel via the use of IRQ threading and mutexes;
17378 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
17379 + * right one; and to MontaVista, who ported pmutexes to 2.6.
17381 + * This code is a from-scratch implementation and is not based on pmutexes,
17382 + * but the idea of converting spinlocks to mutexes is used here too.
17384 + * lock debugging, locking tree, deadlock detection:
17386 + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
17387 + * Released under the General Public License (GPL).
17389 + * Includes portions of the generic R/W semaphore implementation from:
17391 + * Copyright (c) 2001 David Howells (dhowells@redhat.com).
17392 + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
17393 + * - Derived also from comments by Linus
17395 + * Pending ownership of locks and ownership stealing:
17397 + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
17399 + * (also by Steven Rostedt)
17400 + * - Converted single pi_lock to individual task locks.
17402 + * By Esben Nielsen:
17403 + * Doing priority inheritance with help of the scheduler.
17405 + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17406 + * - major rework based on Esben Nielsens initial patch
17407 + * - replaced thread_info references by task_struct refs
17408 + * - removed task->pending_owner dependency
17409 + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
17410 + * in the scheduler return path as discussed with Steven Rostedt
17412 + * Copyright (C) 2006, Kihon Technologies Inc.
17413 + * Steven Rostedt <rostedt@goodmis.org>
17414 + * - debugged and patched Thomas Gleixner's rework.
17415 + * - added back the cmpxchg to the rework.
17416 + * - turned atomic require back on for SMP.
17419 +#include <linux/spinlock.h>
17420 +#include <linux/rtmutex.h>
17421 +#include <linux/sched.h>
17422 +#include <linux/delay.h>
17423 +#include <linux/module.h>
17424 +#include <linux/kallsyms.h>
17425 +#include <linux/syscalls.h>
17426 +#include <linux/interrupt.h>
17427 +#include <linux/plist.h>
17428 +#include <linux/fs.h>
17429 +#include <linux/futex.h>
17430 +#include <linux/hrtimer.h>
17432 +#include "rtmutex_common.h"
17435 + * struct mutex functions
17437 +void __mutex_do_init(struct mutex *mutex, const char *name,
17438 + struct lock_class_key *key)
17440 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
17442 + * Make sure we are not reinitializing a held lock:
17444 + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
17445 + lockdep_init_map(&mutex->dep_map, name, key, 0);
17447 + mutex->lock.save_state = 0;
17449 +EXPORT_SYMBOL(__mutex_do_init);
17451 +void __lockfunc _mutex_lock(struct mutex *lock)
17453 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17454 + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17456 +EXPORT_SYMBOL(_mutex_lock);
17458 +void __lockfunc _mutex_lock_io(struct mutex *lock)
17462 + token = io_schedule_prepare();
17463 + _mutex_lock(lock);
17464 + io_schedule_finish(token);
17466 +EXPORT_SYMBOL_GPL(_mutex_lock_io);
17468 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
17472 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17473 + ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
17475 + mutex_release(&lock->dep_map, 1, _RET_IP_);
17478 +EXPORT_SYMBOL(_mutex_lock_interruptible);
17480 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
17484 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17485 + ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
17487 + mutex_release(&lock->dep_map, 1, _RET_IP_);
17490 +EXPORT_SYMBOL(_mutex_lock_killable);
17492 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
17493 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
17495 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
17496 + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17498 +EXPORT_SYMBOL(_mutex_lock_nested);
17500 +void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
17504 + token = io_schedule_prepare();
17506 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
17507 + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17509 + io_schedule_finish(token);
17511 +EXPORT_SYMBOL_GPL(_mutex_lock_io_nested);
17513 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
17515 + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
17516 + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17518 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
17520 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
17524 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
17525 + ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
17527 + mutex_release(&lock->dep_map, 1, _RET_IP_);
17530 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
17532 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
17536 + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
17537 + ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
17539 + mutex_release(&lock->dep_map, 1, _RET_IP_);
17542 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
17545 +int __lockfunc _mutex_trylock(struct mutex *lock)
17547 + int ret = __rt_mutex_trylock(&lock->lock);
17550 + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
17554 +EXPORT_SYMBOL(_mutex_trylock);
17556 +void __lockfunc _mutex_unlock(struct mutex *lock)
17558 + mutex_release(&lock->dep_map, 1, _RET_IP_);
17559 + __rt_mutex_unlock(&lock->lock);
17561 +EXPORT_SYMBOL(_mutex_unlock);
17564 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
17565 + * @cnt: the atomic which we are to dec
17566 + * @lock: the mutex to return holding if we dec to 0
17568 + * return true and hold lock if we dec to 0, return false otherwise
17570 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
17572 + /* dec if we can't possibly hit 0 */
17573 + if (atomic_add_unless(cnt, -1, 1))
17575 + /* we might hit 0, so take the lock */
17576 + mutex_lock(lock);
17577 + if (!atomic_dec_and_test(cnt)) {
17578 + /* when we actually did the dec, we didn't hit 0 */
17579 + mutex_unlock(lock);
17582 + /* we hit 0, and we hold the lock */
17585 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
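atomic_dec_and_mutex_lock(), re-implemented above for the RT mutex substitution, keeps its usual kernel semantics: it returns non-zero only when the count dropped to zero, and in that case the caller holds the mutex. A typical teardown pattern, sketched with an assumed demo_obj type and list:

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct demo_obj {			/* hypothetical refcounted object */
	atomic_t refcnt;
	struct list_head node;
};

static DEFINE_MUTEX(demo_list_lock);

static void demo_obj_put(struct demo_obj *obj)
{
	if (!atomic_dec_and_mutex_lock(&obj->refcnt, &demo_list_lock))
		return;			/* other references remain */

	list_del(&obj->node);		/* last reference: unlink under lock */
	mutex_unlock(&demo_list_lock);
	kfree(obj);
}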
17586 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rtmutex.c linux-4.14/kernel/locking/rtmutex.c
17587 --- linux-4.14.orig/kernel/locking/rtmutex.c 2018-09-05 11:03:22.000000000 +0200
17588 +++ linux-4.14/kernel/locking/rtmutex.c 2018-09-05 11:05:07.000000000 +0200
17590 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17591 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
17592 * Copyright (C) 2006 Esben Nielsen
17593 + * Adaptive Spinlocks:
17594 + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
17595 + * and Peter Morreale,
17596 + * Adaptive Spinlocks simplification:
17597 + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
17599 * See Documentation/locking/rt-mutex-design.txt for details.
17602 #include <linux/sched/wake_q.h>
17603 #include <linux/sched/debug.h>
17604 #include <linux/timer.h>
17605 +#include <linux/ww_mutex.h>
17606 +#include <linux/blkdev.h>
17608 #include "rtmutex_common.h"
17610 @@ -135,6 +142,12 @@
17611 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
17614 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
17616 + return waiter && waiter != PI_WAKEUP_INPROGRESS &&
17617 + waiter != PI_REQUEUE_INPROGRESS;
17621 * We can speed up the acquire/release, if there's no debugging state to be
17623 @@ -228,7 +241,7 @@
17624 * Only use with rt_mutex_waiter_{less,equal}()
17626 #define task_to_waiter(p) \
17627 - &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
17628 + &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) }
17631 rt_mutex_waiter_less(struct rt_mutex_waiter *left,
17632 @@ -268,6 +281,27 @@
17636 +#define STEAL_NORMAL 0
17637 +#define STEAL_LATERAL 1
17640 +rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode)
17642 + struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
17644 + if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter))
17648 + * Note that RT tasks are excluded from lateral-steals
17649 + * to prevent the introduction of an unbounded latency.
17651 + if (mode == STEAL_NORMAL || rt_task(waiter->task))
17654 + return rt_mutex_waiter_equal(waiter, top_waiter);
17658 rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
17660 @@ -372,6 +406,14 @@
17661 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
17664 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
17666 + if (waiter->savestate)
17667 + wake_up_lock_sleeper(waiter->task);
17669 + wake_up_process(waiter->task);
17673 * Max number of times we'll walk the boosting chain:
17675 @@ -379,7 +421,8 @@
17677 static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
17679 - return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
17680 + return rt_mutex_real_waiter(p->pi_blocked_on) ?
17681 + p->pi_blocked_on->lock : NULL;
17685 @@ -515,7 +558,7 @@
17686 * reached or the state of the chain has changed while we
17687 * dropped the locks.
17690 + if (!rt_mutex_real_waiter(waiter))
17691 goto out_unlock_pi;
17694 @@ -696,13 +739,16 @@
17695 * follow here. This is the end of the chain we are walking.
17697 if (!rt_mutex_owner(lock)) {
17698 + struct rt_mutex_waiter *lock_top_waiter;
17701 * If the requeue [7] above changed the top waiter,
17702 * then we need to wake the new top waiter up to try
17705 - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
17706 - wake_up_process(rt_mutex_top_waiter(lock)->task);
17707 + lock_top_waiter = rt_mutex_top_waiter(lock);
17708 + if (prerequeue_top_waiter != lock_top_waiter)
17709 + rt_mutex_wake_waiter(lock_top_waiter);
17710 raw_spin_unlock_irq(&lock->wait_lock);
17713 @@ -804,9 +850,11 @@
17714 * @task: The task which wants to acquire the lock
17715 * @waiter: The waiter that is queued to the lock's wait tree if the
17716 * callsite called task_blocked_on_lock(), otherwise NULL
17717 + * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL)
17719 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
17720 - struct rt_mutex_waiter *waiter)
17721 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
17722 + struct task_struct *task,
17723 + struct rt_mutex_waiter *waiter, int mode)
17725 lockdep_assert_held(&lock->wait_lock);
17727 @@ -842,12 +890,11 @@
17731 - * If waiter is not the highest priority waiter of
17732 - * @lock, give up.
17733 + * If waiter is not the highest priority waiter of @lock,
17734 + * or its peer when lateral steal is allowed, give up.
17736 - if (waiter != rt_mutex_top_waiter(lock))
17737 + if (!rt_mutex_steal(lock, waiter, mode))
17741 * We can acquire the lock. Remove the waiter from the
17742 * lock waiters tree.
17743 @@ -865,14 +912,12 @@
17745 if (rt_mutex_has_waiters(lock)) {
17747 - * If @task->prio is greater than or equal to
17748 - * the top waiter priority (kernel view),
17750 + * If @task->prio is greater than the top waiter
17751 + * priority (kernel view), or equal to it when a
17752 + * lateral steal is forbidden, @task lost.
17754 - if (!rt_mutex_waiter_less(task_to_waiter(task),
17755 - rt_mutex_top_waiter(lock)))
17756 + if (!rt_mutex_steal(lock, task_to_waiter(task), mode))
17760 * The current top waiter stays enqueued. We
17761 * don't have to change anything in the lock
17762 @@ -919,6 +964,351 @@
17766 +#ifdef CONFIG_PREEMPT_RT_FULL
17768 + * preemptible spin_lock functions:
17770 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
17771 + void (*slowfn)(struct rt_mutex *lock))
17773 + might_sleep_no_state_check();
17775 + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
17781 +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
17782 + void (*slowfn)(struct rt_mutex *lock))
17784 + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
17791 + * Note that owner is a speculative pointer and dereferencing relies
17792 + * on rcu_read_lock() and the check against the lock owner.
17794 +static int adaptive_wait(struct rt_mutex *lock,
17795 + struct task_struct *owner)
17801 + if (owner != rt_mutex_owner(lock))
17804 + * Ensure that owner->on_cpu is dereferenced _after_
17805 + * checking the above to be valid.
17808 + if (!owner->on_cpu) {
17814 + rcu_read_unlock();
17818 +static int adaptive_wait(struct rt_mutex *lock,
17819 + struct task_struct *orig_owner)
17825 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
17826 + struct rt_mutex_waiter *waiter,
17827 + struct task_struct *task,
17828 + enum rtmutex_chainwalk chwalk);
17830 + * Slow path lock function spin_lock style: this variant is very
17831 + * careful not to miss any non-lock wakeups.
17833 + * We store the current state under p->pi_lock in p->saved_state and
17834 + * the try_to_wake_up() code handles this accordingly.
17836 +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
17837 + struct rt_mutex_waiter *waiter,
17838 + unsigned long flags)
17840 + struct task_struct *lock_owner, *self = current;
17841 + struct rt_mutex_waiter *top_waiter;
17844 + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL))
17847 + BUG_ON(rt_mutex_owner(lock) == self);
17850 + * We save whatever state the task is in and we'll restore it
17851 + * after acquiring the lock taking real wakeups into account
17852 + * as well. We are serialized via pi_lock against wakeups. See
17853 + * try_to_wake_up().
17855 + raw_spin_lock(&self->pi_lock);
17856 + self->saved_state = self->state;
17857 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
17858 + raw_spin_unlock(&self->pi_lock);
17860 + ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK);
17864 + /* Try to acquire the lock again. */
17865 + if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL))
17868 + top_waiter = rt_mutex_top_waiter(lock);
17869 + lock_owner = rt_mutex_owner(lock);
17871 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
17873 + debug_rt_mutex_print_deadlock(waiter);
17875 + if (top_waiter != waiter || adaptive_wait(lock, lock_owner))
17878 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
17880 + raw_spin_lock(&self->pi_lock);
17881 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
17882 + raw_spin_unlock(&self->pi_lock);
17886 + * Restore the task state to current->saved_state. We set it
17887 + * to the original state above and the try_to_wake_up() code
17888 + * has possibly updated it when a real (non-rtmutex) wakeup
17889 + * happened while we were blocked. Clear saved_state so
17890 + * try_to_wake_up() does not get confused.
17892 + raw_spin_lock(&self->pi_lock);
17893 + __set_current_state_no_track(self->saved_state);
17894 + self->saved_state = TASK_RUNNING;
17895 + raw_spin_unlock(&self->pi_lock);
17898 + * try_to_take_rt_mutex() sets the waiter bit
17899 + * unconditionally. We might have to fix that up:
17901 + fixup_rt_mutex_waiters(lock);
17903 + BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock));
17904 + BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry));
17907 +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
17909 + struct rt_mutex_waiter waiter;
17910 + unsigned long flags;
17912 + rt_mutex_init_waiter(&waiter, true);
17914 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
17915 + rt_spin_lock_slowlock_locked(lock, &waiter, flags);
17916 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
17917 + debug_rt_mutex_free_waiter(&waiter);
17920 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
17921 + struct wake_q_head *wake_q,
17922 + struct wake_q_head *wq_sleeper);
17924 + * Slow path to release a rt_mutex spin_lock style
17926 +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
17928 + unsigned long flags;
17929 + DEFINE_WAKE_Q(wake_q);
17930 + DEFINE_WAKE_Q(wake_sleeper_q);
17933 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
17934 + postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
17935 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
17938 + rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
17941 +void __lockfunc rt_spin_lock(spinlock_t *lock)
17943 + sleeping_lock_inc();
17944 + migrate_disable();
17945 + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17946 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
17948 +EXPORT_SYMBOL(rt_spin_lock);
17950 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
17952 + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
17955 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
17956 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
17958 + sleeping_lock_inc();
17959 + migrate_disable();
17960 + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
17961 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
17963 +EXPORT_SYMBOL(rt_spin_lock_nested);
17966 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
17968 + /* NOTE: we always pass in '1' for nested, for simplicity */
17969 + spin_release(&lock->dep_map, 1, _RET_IP_);
17970 + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
17971 + migrate_enable();
17972 + sleeping_lock_dec();
17974 +EXPORT_SYMBOL(rt_spin_unlock);
17976 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
17978 + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
17980 +EXPORT_SYMBOL(__rt_spin_unlock);
17983 + * Wait for the lock to get unlocked: instead of polling for an unlock
17984 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
17985 + * schedule if there's contention:
17987 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
17990 + spin_unlock(lock);
17992 +EXPORT_SYMBOL(rt_spin_unlock_wait);
17994 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
17998 + sleeping_lock_inc();
17999 + migrate_disable();
18000 + ret = __rt_mutex_trylock(&lock->lock);
18002 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18004 + migrate_enable();
18005 + sleeping_lock_dec();
18009 +EXPORT_SYMBOL(rt_spin_trylock);
18011 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
18015 + local_bh_disable();
18016 + ret = __rt_mutex_trylock(&lock->lock);
18018 + sleeping_lock_inc();
18019 + migrate_disable();
18020 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18022 + local_bh_enable();
18025 +EXPORT_SYMBOL(rt_spin_trylock_bh);
18027 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
18032 + ret = __rt_mutex_trylock(&lock->lock);
18034 + sleeping_lock_inc();
18035 + migrate_disable();
18036 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18040 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
18042 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
18044 + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
18045 + if (atomic_add_unless(atomic, -1, 1))
18047 + rt_spin_lock(lock);
18048 + if (atomic_dec_and_test(atomic))
18050 + rt_spin_unlock(lock);
18053 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
18056 +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key)
18058 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18060 + * Make sure we are not reinitializing a held lock:
18062 + debug_check_no_locks_freed((void *)lock, sizeof(*lock));
18063 + lockdep_init_map(&lock->dep_map, name, key, 0);
18066 +EXPORT_SYMBOL(__rt_spin_lock_init);
18068 +#endif /* PREEMPT_RT_FULL */
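/*
 * Illustrative sketch (not part of the hunk above): with rt_spin_lock()
 * as implemented here, a spinlock_t critical section may sleep on
 * PREEMPT_RT_FULL, while raw_spinlock_t keeps the traditional
 * busy-waiting, truly atomic behaviour.  The lock and function names
 * below are assumptions.
 */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);		/* rt_mutex-backed on RT */
static DEFINE_RAW_SPINLOCK(demo_raw_lock);	/* always a real spinlock */

static void demo_locking(void)
{
	unsigned long flags;

	spin_lock(&demo_lock);		/* may sleep on RT, migration disabled */
	/* preemptible critical section */
	spin_unlock(&demo_lock);

	raw_spin_lock_irqsave(&demo_raw_lock, flags);
	/* short, truly atomic critical section */
	raw_spin_unlock_irqrestore(&demo_raw_lock, flags);
}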
18070 +#ifdef CONFIG_PREEMPT_RT_FULL
18071 + static inline int __sched
18072 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18074 + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18075 + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
18080 + if (unlikely(ctx == hold_ctx))
18081 + return -EALREADY;
18083 + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
18084 + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
18085 +#ifdef CONFIG_DEBUG_MUTEXES
18086 + DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
18087 + ctx->contending_lock = ww;
18095 + static inline int __sched
18096 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18105 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18106 + struct rt_mutex_waiter *waiter)
18108 + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
18112 * Task blocks on lock.
18114 @@ -951,6 +1341,22 @@
18117 raw_spin_lock(&task->pi_lock);
18119 + * In the case of futex requeue PI, this will be a proxy
18120 + * lock. The task will wake unaware that it is enqueued on
18121 + * this lock. Avoid blocking on two locks and corrupting
18122 + * pi_blocked_on via the PI_WAKEUP_INPROGRESS
18123 + * flag. futex_wait_requeue_pi() sets this when it wakes up
18124 + * before requeue (due to a signal or timeout). Do not enqueue
18125 + * the task if PI_WAKEUP_INPROGRESS is set.
18127 + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
18128 + raw_spin_unlock(&task->pi_lock);
18132 + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
18134 waiter->task = task;
18135 waiter->lock = lock;
18136 waiter->prio = task->prio;
18137 @@ -974,7 +1380,7 @@
18138 rt_mutex_enqueue_pi(owner, waiter);
18140 rt_mutex_adjust_prio(owner);
18141 - if (owner->pi_blocked_on)
18142 + if (rt_mutex_real_waiter(owner->pi_blocked_on))
18144 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
18146 @@ -1016,6 +1422,7 @@
18147 * Called with lock->wait_lock held and interrupts disabled.
18149 static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
18150 + struct wake_q_head *wake_sleeper_q,
18151 struct rt_mutex *lock)
18153 struct rt_mutex_waiter *waiter;
18154 @@ -1055,7 +1462,10 @@
18155 * Pairs with preempt_enable() in rt_mutex_postunlock();
18158 - wake_q_add(wake_q, waiter->task);
18159 + if (waiter->savestate)
18160 + wake_q_add_sleeper(wake_sleeper_q, waiter->task);
18162 + wake_q_add(wake_q, waiter->task);
18163 raw_spin_unlock(&current->pi_lock);
18166 @@ -1070,7 +1480,7 @@
18168 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
18169 struct task_struct *owner = rt_mutex_owner(lock);
18170 - struct rt_mutex *next_lock;
18171 + struct rt_mutex *next_lock = NULL;
18173 lockdep_assert_held(&lock->wait_lock);
18175 @@ -1096,7 +1506,8 @@
18176 rt_mutex_adjust_prio(owner);
18178 /* Store the lock on which owner is blocked or NULL */
18179 - next_lock = task_blocked_on_lock(owner);
18180 + if (rt_mutex_real_waiter(owner->pi_blocked_on))
18181 + next_lock = task_blocked_on_lock(owner);
18183 raw_spin_unlock(&owner->pi_lock);
18185 @@ -1132,26 +1543,28 @@
18186 raw_spin_lock_irqsave(&task->pi_lock, flags);
18188 waiter = task->pi_blocked_on;
18189 - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18190 + if (!rt_mutex_real_waiter(waiter) ||
18191 + rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18192 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18195 next_lock = waiter->lock;
18196 - raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18198 /* gets dropped in rt_mutex_adjust_prio_chain()! */
18199 get_task_struct(task);
18201 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18202 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
18203 next_lock, NULL, task);
18206 -void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
18207 +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
18209 debug_rt_mutex_init_waiter(waiter);
18210 RB_CLEAR_NODE(&waiter->pi_tree_entry);
18211 RB_CLEAR_NODE(&waiter->tree_entry);
18212 waiter->task = NULL;
18213 + waiter->savestate = savestate;
18217 @@ -1167,7 +1580,8 @@
18219 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
18220 struct hrtimer_sleeper *timeout,
18221 - struct rt_mutex_waiter *waiter)
18222 + struct rt_mutex_waiter *waiter,
18223 + struct ww_acquire_ctx *ww_ctx)
18227 @@ -1176,16 +1590,17 @@
18228 if (try_to_take_rt_mutex(lock, current, waiter))
18232 - * TASK_INTERRUPTIBLE checks for signals and
18233 - * timeout. Ignored otherwise.
18235 - if (likely(state == TASK_INTERRUPTIBLE)) {
18236 - /* Signal pending? */
18237 - if (signal_pending(current))
18239 - if (timeout && !timeout->task)
18240 - ret = -ETIMEDOUT;
18241 + if (timeout && !timeout->task) {
18242 + ret = -ETIMEDOUT;
18245 + if (signal_pending_state(state, current)) {
18250 + if (ww_ctx && ww_ctx->acquired > 0) {
18251 + ret = __mutex_lock_check_stamp(lock, ww_ctx);
18255 @@ -1224,33 +1639,104 @@
18260 - * Slow path lock function:
18262 -static int __sched
18263 -rt_mutex_slowlock(struct rt_mutex *lock, int state,
18264 - struct hrtimer_sleeper *timeout,
18265 - enum rtmutex_chainwalk chwalk)
18266 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
18267 + struct ww_acquire_ctx *ww_ctx)
18269 - struct rt_mutex_waiter waiter;
18270 - unsigned long flags;
18272 +#ifdef CONFIG_DEBUG_MUTEXES
18274 + * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
18275 + * but released with a normal mutex_unlock in this call.
18277 + * This should never happen, always use ww_mutex_unlock.
18279 + DEBUG_LOCKS_WARN_ON(ww->ctx);
18281 - rt_mutex_init_waiter(&waiter);
18283 + * Not quite done after calling ww_acquire_done() ?
18285 + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
18287 + if (ww_ctx->contending_lock) {
18289 + * After -EDEADLK you tried to
18290 + * acquire a different ww_mutex? Bad!
18292 + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
18295 + * You called ww_mutex_lock after receiving -EDEADLK,
18296 + * but 'forgot' to unlock everything else first?
18298 + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
18299 + ww_ctx->contending_lock = NULL;
18303 - * Technically we could use raw_spin_[un]lock_irq() here, but this can
18304 - * be called in early boot if the cmpxchg() fast path is disabled
18305 - * (debug, no architecture support). In this case we will acquire the
18306 - * rtmutex with lock->wait_lock held. But we cannot unconditionally
18307 - * enable interrupts in that early boot case. So we need to use the
18308 - * irqsave/restore variants.
18309 + * Naughty, using a different class will lead to undefined behavior!
18311 - raw_spin_lock_irqsave(&lock->wait_lock, flags);
18312 + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
18314 + ww_ctx->acquired++;
18317 +#ifdef CONFIG_PREEMPT_RT_FULL
18318 +static void ww_mutex_account_lock(struct rt_mutex *lock,
18319 + struct ww_acquire_ctx *ww_ctx)
18321 + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18322 + struct rt_mutex_waiter *waiter, *n;
18325 + * This branch gets optimized out for the common case,
18326 + * and is only important for ww_mutex_lock.
18328 + ww_mutex_lock_acquired(ww, ww_ctx);
18329 + ww->ctx = ww_ctx;
18332 + * Give any possible sleeping processes the chance to wake up,
18333 + * so they can recheck if they have to back off.
18335 + rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root,
18337 + /* XXX debug rt mutex waiter wakeup */
18339 + BUG_ON(waiter->lock != lock);
18340 + rt_mutex_wake_waiter(waiter);
18346 +static void ww_mutex_account_lock(struct rt_mutex *lock,
18347 + struct ww_acquire_ctx *ww_ctx)
18353 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
18354 + struct hrtimer_sleeper *timeout,
18355 + enum rtmutex_chainwalk chwalk,
18356 + struct ww_acquire_ctx *ww_ctx,
18357 + struct rt_mutex_waiter *waiter)
18361 +#ifdef CONFIG_PREEMPT_RT_FULL
18363 + struct ww_mutex *ww;
18365 + ww = container_of(lock, struct ww_mutex, base.lock);
18366 + if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
18367 + return -EALREADY;
18371 /* Try to acquire the lock again: */
18372 if (try_to_take_rt_mutex(lock, current, NULL)) {
18373 - raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18375 + ww_mutex_account_lock(lock, ww_ctx);
18379 @@ -1260,17 +1746,27 @@
18380 if (unlikely(timeout))
18381 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
18383 - ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
18384 + ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
18386 - if (likely(!ret))
18387 + if (likely(!ret)) {
18388 /* sleep on the mutex */
18389 - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
18390 + ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
18392 + } else if (ww_ctx) {
18393 + /* ww_mutex received EDEADLK, let it become EALREADY */
18394 + ret = __mutex_lock_check_stamp(lock, ww_ctx);
18398 if (unlikely(ret)) {
18399 __set_current_state(TASK_RUNNING);
18400 if (rt_mutex_has_waiters(lock))
18401 - remove_waiter(lock, &waiter);
18402 - rt_mutex_handle_deadlock(ret, chwalk, &waiter);
18403 + remove_waiter(lock, waiter);
18404 + /* ww_mutex wants to report EDEADLK/EALREADY, let it */
18406 + rt_mutex_handle_deadlock(ret, chwalk, waiter);
18407 + } else if (ww_ctx) {
18408 + ww_mutex_account_lock(lock, ww_ctx);
18412 @@ -1278,6 +1774,36 @@
18413 * unconditionally. We might have to fix that up.
18415 fixup_rt_mutex_waiters(lock);
18420 + * Slow path lock function:
18422 +static int __sched
18423 +rt_mutex_slowlock(struct rt_mutex *lock, int state,
18424 + struct hrtimer_sleeper *timeout,
18425 + enum rtmutex_chainwalk chwalk,
18426 + struct ww_acquire_ctx *ww_ctx)
18428 + struct rt_mutex_waiter waiter;
18429 + unsigned long flags;
18432 + rt_mutex_init_waiter(&waiter, false);
18435 + * Technically we could use raw_spin_[un]lock_irq() here, but this can
18436 + * be called in early boot if the cmpxchg() fast path is disabled
18437 + * (debug, no architecture support). In this case we will acquire the
18438 + * rtmutex with lock->wait_lock held. But we cannot unconditionally
18439 + * enable interrupts in that early boot case. So we need to use the
18440 + * irqsave/restore variants.
18442 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
18444 + ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
18447 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18449 @@ -1338,7 +1864,8 @@
18450 * Return whether the current task needs to call rt_mutex_postunlock().
18452 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
18453 - struct wake_q_head *wake_q)
18454 + struct wake_q_head *wake_q,
18455 + struct wake_q_head *wake_sleeper_q)
18457 unsigned long flags;
18459 @@ -1392,7 +1919,7 @@
18461 * Queue the next waiter for wakeup once we release the wait_lock.
18463 - mark_wakeup_next_waiter(wake_q, lock);
18464 + mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
18465 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18467 return true; /* call rt_mutex_postunlock() */
18468 @@ -1406,29 +1933,45 @@
18471 rt_mutex_fastlock(struct rt_mutex *lock, int state,
18472 + struct ww_acquire_ctx *ww_ctx,
18473 int (*slowfn)(struct rt_mutex *lock, int state,
18474 struct hrtimer_sleeper *timeout,
18475 - enum rtmutex_chainwalk chwalk))
18476 + enum rtmutex_chainwalk chwalk,
18477 + struct ww_acquire_ctx *ww_ctx))
18479 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18482 - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
18484 + * If rt_mutex blocks, the function sched_submit_work will not call
18485 + * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
18486 + * We must call blk_schedule_flush_plug here; otherwise a
18487 + * deadlock in device mapper may happen.
18489 + if (unlikely(blk_needs_flush_plug(current)))
18490 + blk_schedule_flush_plug(current);
18492 + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
18496 rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
18497 struct hrtimer_sleeper *timeout,
18498 enum rtmutex_chainwalk chwalk,
18499 + struct ww_acquire_ctx *ww_ctx,
18500 int (*slowfn)(struct rt_mutex *lock, int state,
18501 struct hrtimer_sleeper *timeout,
18502 - enum rtmutex_chainwalk chwalk))
18503 + enum rtmutex_chainwalk chwalk,
18504 + struct ww_acquire_ctx *ww_ctx))
18506 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
18507 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18510 - return slowfn(lock, state, timeout, chwalk);
18511 + if (unlikely(blk_needs_flush_plug(current)))
18512 + blk_schedule_flush_plug(current);
18514 + return slowfn(lock, state, timeout, chwalk, ww_ctx);
18518 @@ -1444,9 +1987,11 @@
18520 * Performs the wakeup of the top-waiter and re-enables preemption.
18522 -void rt_mutex_postunlock(struct wake_q_head *wake_q)
18523 +void rt_mutex_postunlock(struct wake_q_head *wake_q,
18524 + struct wake_q_head *wake_sleeper_q)
18527 + wake_up_q_sleeper(wake_sleeper_q);
18529 /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
18531 @@ -1455,15 +2000,40 @@
18533 rt_mutex_fastunlock(struct rt_mutex *lock,
18534 bool (*slowfn)(struct rt_mutex *lock,
18535 - struct wake_q_head *wqh))
18536 + struct wake_q_head *wqh,
18537 + struct wake_q_head *wq_sleeper))
18539 DEFINE_WAKE_Q(wake_q);
18540 + DEFINE_WAKE_Q(wake_sleeper_q);
18542 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
18545 - if (slowfn(lock, &wake_q))
18546 - rt_mutex_postunlock(&wake_q);
18547 + if (slowfn(lock, &wake_q, &wake_sleeper_q))
18548 + rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
18551 +int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state)
18554 + return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
18558 + * rt_mutex_lock_state - lock a rt_mutex with a given state
18560 + * @lock: The rt_mutex to be locked
18561 + * @state: The state to set when blocking on the rt_mutex
18563 +static int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state)
18567 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18568 + ret = __rt_mutex_lock_state(lock, state);
18570 + mutex_release(&lock->dep_map, 1, _RET_IP_);
18575 @@ -1473,10 +2043,7 @@
18577 void __sched rt_mutex_lock(struct rt_mutex *lock)
18581 - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18582 - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
18583 + rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE);
18585 EXPORT_SYMBOL_GPL(rt_mutex_lock);
18587 @@ -1491,16 +2058,7 @@
18589 int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
18595 - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18596 - ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
18598 - mutex_release(&lock->dep_map, 1, _RET_IP_);
18601 + return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE);
18603 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
18605 @@ -1518,6 +2076,22 @@
18609 + * rt_mutex_lock_killable - lock a rt_mutex killable
18611 + * @lock: the rt_mutex to be locked
18612 + * Returns: 0 on success, or
18616 + * -EINTR when interrupted by a signal
18618 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
18620 + return rt_mutex_lock_state(lock, TASK_KILLABLE);
18622 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
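/*
 * Usage sketch for the helper added above (assumed caller, not part of
 * the patch): rt_mutex_lock_killable() returns 0 on success and -EINTR
 * if a fatal signal arrives while sleeping on the lock.
 */
#include <linux/rtmutex.h>

static int demo_with_killable_lock(struct rt_mutex *lock)
{
	int ret = rt_mutex_lock_killable(lock);

	if (ret)
		return ret;	/* -EINTR: fatal signal, lock not taken */

	/* critical section */

	rt_mutex_unlock(lock);
	return 0;
}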
18625 * rt_mutex_timed_lock - lock a rt_mutex interruptible
18626 * the timeout structure is provided
18628 @@ -1540,6 +2114,7 @@
18629 mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18630 ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
18631 RT_MUTEX_MIN_CHAINWALK,
18633 rt_mutex_slowlock);
18635 mutex_release(&lock->dep_map, 1, _RET_IP_);
18636 @@ -1548,6 +2123,18 @@
18638 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
18640 +int __sched __rt_mutex_trylock(struct rt_mutex *lock)
18642 +#ifdef CONFIG_PREEMPT_RT_FULL
18643 + if (WARN_ON_ONCE(in_irq() || in_nmi()))
18645 + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
18649 + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
18653 * rt_mutex_trylock - try to lock a rt_mutex
18655 @@ -1563,10 +2150,7 @@
18659 - if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
18662 - ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
18663 + ret = __rt_mutex_trylock(lock);
18665 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18667 @@ -1574,6 +2158,11 @@
18669 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
18671 +void __sched __rt_mutex_unlock(struct rt_mutex *lock)
18673 + rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
18677 * rt_mutex_unlock - unlock a rt_mutex
18679 @@ -1582,16 +2171,13 @@
18680 void __sched rt_mutex_unlock(struct rt_mutex *lock)
18682 mutex_release(&lock->dep_map, 1, _RET_IP_);
18683 - rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
18684 + __rt_mutex_unlock(lock);
18686 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
18689 - * Futex variant, that since futex variants do not use the fast-path, can be
18690 - * simple and will not need to retry.
18692 -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
18693 - struct wake_q_head *wake_q)
18694 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
18695 + struct wake_q_head *wake_q,
18696 + struct wake_q_head *wq_sleeper)
18698 lockdep_assert_held(&lock->wait_lock);
18700 @@ -1608,22 +2194,35 @@
18701 * avoid inversion prior to the wakeup. preempt_disable()
18702 * therein pairs with rt_mutex_postunlock().
18704 - mark_wakeup_next_waiter(wake_q, lock);
18705 + mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
18707 return true; /* call postunlock() */
18711 + * Futex variant, that since futex variants do not use the fast-path, can be
18712 + * simple and will not need to retry.
18714 +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
18715 + struct wake_q_head *wake_q,
18716 + struct wake_q_head *wq_sleeper)
18718 + return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
18721 void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
18723 DEFINE_WAKE_Q(wake_q);
18724 + DEFINE_WAKE_Q(wake_sleeper_q);
18725 + unsigned long flags;
18728 - raw_spin_lock_irq(&lock->wait_lock);
18729 - postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
18730 - raw_spin_unlock_irq(&lock->wait_lock);
18731 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
18732 + postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
18733 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18736 - rt_mutex_postunlock(&wake_q);
18737 + rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
18741 @@ -1662,7 +2261,7 @@
18743 debug_rt_mutex_init(lock, name, key);
18745 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
18746 +EXPORT_SYMBOL(__rt_mutex_init);
18749 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
18750 @@ -1682,6 +2281,14 @@
18751 struct task_struct *proxy_owner)
18753 __rt_mutex_init(lock, NULL, NULL);
18754 +#ifdef CONFIG_DEBUG_SPINLOCK
18756 + * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI are
18757 + * holding the ->wait_lock of the proxy_lock while unlocking a sleeping
18760 + raw_spin_lock_init(&lock->wait_lock);
18762 debug_rt_mutex_proxy_lock(lock, proxy_owner);
18763 rt_mutex_set_owner(lock, proxy_owner);
18765 @@ -1714,6 +2321,34 @@
18766 if (try_to_take_rt_mutex(lock, task, NULL))
18769 +#ifdef CONFIG_PREEMPT_RT_FULL
18771 + * In PREEMPT_RT there's an added race.
18772 + * If the task that we are about to requeue times out,
18773 + * it can set PI_WAKEUP_INPROGRESS. This tells the requeue
18774 + * code to skip this task. But right after the task sets
18775 + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
18776 + * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
18777 + * This will replace the PI_WAKEUP_INPROGRESS with the actual
18778 + * lock that it blocks on. We *must not* place this task
18779 + * on this proxy lock in that case.
18781 + * To prevent this race, we first take the task's pi_lock
18782 + * and check if it has updated its pi_blocked_on. If it has,
18783 + * we assume that it woke up and we return -EAGAIN.
18784 + * Otherwise, we set the task's pi_blocked_on to
18785 + * PI_REQUEUE_INPROGRESS, so that if the task is waking up
18786 + * it will know that we are in the process of requeuing it.
18788 + raw_spin_lock(&task->pi_lock);
18789 + if (task->pi_blocked_on) {
18790 + raw_spin_unlock(&task->pi_lock);
18793 + task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
18794 + raw_spin_unlock(&task->pi_lock);
18797 /* We enforce deadlock detection for futexes */
18798 ret = task_blocks_on_rt_mutex(lock, waiter, task,
18799 RT_MUTEX_FULL_CHAINWALK);
18800 @@ -1728,7 +2363,7 @@
18804 - if (unlikely(ret))
18805 + if (ret && rt_mutex_has_waiters(lock))
18806 remove_waiter(lock, waiter);
18808 debug_rt_mutex_print_deadlock(waiter);
18809 @@ -1803,17 +2438,36 @@
18810 struct hrtimer_sleeper *to,
18811 struct rt_mutex_waiter *waiter)
18813 + struct task_struct *tsk = current;
18816 raw_spin_lock_irq(&lock->wait_lock);
18817 /* sleep on the mutex */
18818 set_current_state(TASK_INTERRUPTIBLE);
18819 - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
18820 + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
18822 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
18823 * have to fix that up.
18825 fixup_rt_mutex_waiters(lock);
18827 + * RT has a problem here when the wait got interrupted by a timeout
18828 + * or a signal. task->pi_blocked_on is still set. The task must
18829 + * acquire the hash bucket lock when returning from this function.
18831 + * If the hash bucket lock is contended then the
18832 + * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
18833 + * task_blocks_on_rt_mutex() will trigger. This can be avoided by
18834 + * clearing task->pi_blocked_on which removes the task from the
18835 + * boosting chain of the rtmutex. That's correct because the task
18836 + * is no longer blocked on it.
18839 + raw_spin_lock(&tsk->pi_lock);
18840 + tsk->pi_blocked_on = NULL;
18841 + raw_spin_unlock(&tsk->pi_lock);
18844 raw_spin_unlock_irq(&lock->wait_lock);
18847 @@ -1874,3 +2528,99 @@
18853 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
18855 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
18858 + if (ctx->deadlock_inject_countdown-- == 0) {
18859 + tmp = ctx->deadlock_inject_interval;
18860 + if (tmp > UINT_MAX/4)
18863 + tmp = tmp*2 + tmp + tmp/2;
18865 + ctx->deadlock_inject_interval = tmp;
18866 + ctx->deadlock_inject_countdown = tmp;
18867 + ctx->contending_lock = lock;
18869 + ww_mutex_unlock(lock);
18878 +#ifdef CONFIG_PREEMPT_RT_FULL
18880 +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
18886 + mutex_acquire_nest(&lock->base.dep_map, 0, 0,
18887 + ctx ? &ctx->dep_map : NULL, _RET_IP_);
18888 + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0,
18891 + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
18892 + else if (!ret && ctx && ctx->acquired > 1)
18893 + return ww_mutex_deadlock_injection(lock, ctx);
18897 +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
18900 +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
18906 + mutex_acquire_nest(&lock->base.dep_map, 0, 0,
18907 + ctx ? &ctx->dep_map : NULL, _RET_IP_);
18908 + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0,
18911 + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
18912 + else if (!ret && ctx && ctx->acquired > 1)
18913 + return ww_mutex_deadlock_injection(lock, ctx);
18917 +EXPORT_SYMBOL_GPL(ww_mutex_lock);
18919 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
18921 + int nest = !!lock->ctx;
18924 + * The unlocking fastpath is the 0->1 transition from 'locked'
18925 + * into 'unlocked' state:
18928 +#ifdef CONFIG_DEBUG_MUTEXES
18929 + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
18931 + if (lock->ctx->acquired > 0)
18932 + lock->ctx->acquired--;
18933 + lock->ctx = NULL;
18936 + mutex_release(&lock->base.dep_map, nest, _RET_IP_);
18937 + __rt_mutex_unlock(&lock->base.lock);
18939 +EXPORT_SYMBOL(ww_mutex_unlock);
18941 +int __rt_mutex_owner_current(struct rt_mutex *lock)
18943 + return rt_mutex_owner(lock) == current;
18945 +EXPORT_SYMBOL(__rt_mutex_owner_current);
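
The hunks above funnel the rt_mutex entry points through a common rt_mutex_lock_state() helper, add a killable variant, and reimplement the ww_mutex API on top of rt_mutex_slowlock() for PREEMPT_RT_FULL. A minimal caller-side sketch of the new killable variant (illustrative only, not part of the patch; return values as documented in the kernel-doc above):

	static int example_fetch(struct rt_mutex *m)
	{
		int ret = rt_mutex_lock_killable(m);	/* -EINTR if killed while blocked */

		if (ret)
			return ret;
		/* ... critical section ... */
		rt_mutex_unlock(m);
		return 0;
	}
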
18947 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rtmutex_common.h linux-4.14/kernel/locking/rtmutex_common.h
18948 --- linux-4.14.orig/kernel/locking/rtmutex_common.h 2018-09-05 11:03:22.000000000 +0200
18949 +++ linux-4.14/kernel/locking/rtmutex_common.h 2018-09-05 11:05:07.000000000 +0200
18952 #include <linux/rtmutex.h>
18953 #include <linux/sched/wake_q.h>
18954 +#include <linux/sched/debug.h>
18957 * This is the control structure for tasks blocked on a rt_mutex,
18959 struct rb_node pi_tree_entry;
18960 struct task_struct *task;
18961 struct rt_mutex *lock;
18963 #ifdef CONFIG_DEBUG_RT_MUTEXES
18965 struct pid *deadlock_task_pid;
18966 @@ -129,12 +131,15 @@
18968 * PI-futex support (proxy locking functions, etc.):
18970 +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
18971 +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
18973 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
18974 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
18975 struct task_struct *proxy_owner);
18976 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
18977 struct task_struct *proxy_owner);
18978 -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
18979 +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
18980 extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
18981 struct rt_mutex_waiter *waiter,
18982 struct task_struct *task);
18983 @@ -152,9 +157,27 @@
18985 extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
18986 extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
18987 - struct wake_q_head *wqh);
18988 + struct wake_q_head *wqh,
18989 + struct wake_q_head *wq_sleeper);
18991 -extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
18992 +extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
18993 + struct wake_q_head *wake_sleeper_q);
18995 +/* RW semaphore special interface */
18996 +struct ww_acquire_ctx;
18998 +extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state);
18999 +extern int __rt_mutex_trylock(struct rt_mutex *lock);
19000 +extern void __rt_mutex_unlock(struct rt_mutex *lock);
19001 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
19002 + struct hrtimer_sleeper *timeout,
19003 + enum rtmutex_chainwalk chwalk,
19004 + struct ww_acquire_ctx *ww_ctx,
19005 + struct rt_mutex_waiter *waiter);
19006 +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
19007 + struct rt_mutex_waiter *waiter,
19008 + unsigned long flags);
19009 +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock);
19011 #ifdef CONFIG_DEBUG_RT_MUTEXES
19012 # include "rtmutex-debug.h"
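
PI_WAKEUP_INPROGRESS and PI_REQUEUE_INPROGRESS are sentinel values stored in task->pi_blocked_on so the PI code can distinguish a task that is genuinely blocked on a waiter from one whose wakeup or futex requeue is in flight. The rt_mutex_real_waiter() check referenced in the comments earlier is not part of this excerpt; presumably it just filters out those sentinels, roughly:

	/* Sketch of the assumed helper, not quoted from the patch: */
	#define rt_mutex_real_waiter(p)					\
		((p) && (p) != PI_WAKEUP_INPROGRESS &&			\
		 (p) != PI_REQUEUE_INPROGRESS)
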
19013 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rwlock-rt.c linux-4.14/kernel/locking/rwlock-rt.c
19014 --- linux-4.14.orig/kernel/locking/rwlock-rt.c 1970-01-01 01:00:00.000000000 +0100
19015 +++ linux-4.14/kernel/locking/rwlock-rt.c 2018-09-05 11:05:07.000000000 +0200
19019 +#include <linux/sched/debug.h>
19020 +#include <linux/export.h>
19022 +#include "rtmutex_common.h"
19023 +#include <linux/rwlock_types_rt.h>
19026 + * RT-specific reader/writer locks
19029 + * 1) Lock lock->rtmutex
19030 + * 2) Remove the reader BIAS to force readers into the slow path
19031 + * 3) Wait until all readers have left the critical region
19032 + * 4) Mark it write locked
19035 + * 1) Remove the write locked marker
19036 + * 2) Set the reader BIAS so readers can use the fast path again
19037 + * 3) Unlock lock->rtmutex to release blocked readers
19040 + * 1) Try fast path acquisition (reader BIAS is set)
19041 + * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag
19042 + * 3) If !writelocked, acquire it for read
19043 + * 4) If writelocked, block on lock->rtmutex
19044 + * 5) unlock lock->rtmutex, goto 1)
19047 + * 1) Try fast path release (reader count != 1)
19048 + * 2) Wake the writer waiting in write_lock()#3
19050 + * read_lock()#3 has the consequence that rw locks on RT are not writer
19051 + * fair, but writers, which should be avoided in RT tasks (think tasklist
19052 + * lock), are subject to the rtmutex priority/DL inheritance mechanism.
19054 + * It's possible to make the rw locks writer fair by keeping a list of
19055 + * active readers. A blocked writer would force all newly incoming readers
19056 + * to block on the rtmutex, but the rtmutex would have to be proxy locked
19057 + * for one reader after the other. We can't use multi-reader inheritance
19058 + * because there is no way to support that with
19059 + * SCHED_DEADLINE. Implementing the one by one reader boosting/handover
19060 + * mechanism is a major surgery for a very dubious value.
19062 + * The risk of writer starvation is there, but the pathological use cases
19063 + * which trigger it are not necessarily the typical RT workloads.
19066 +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
19067 + struct lock_class_key *key)
19069 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19071 + * Make sure we are not reinitializing a held lock:
19073 + debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19074 + lockdep_init_map(&lock->dep_map, name, key, 0);
19076 + atomic_set(&lock->readers, READER_BIAS);
19077 + rt_mutex_init(&lock->rtmutex);
19078 + lock->rtmutex.save_state = 1;
19081 +int __read_rt_trylock(struct rt_rw_lock *lock)
19086 + * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is
19089 + for (r = atomic_read(&lock->readers); r < 0;) {
19090 + old = atomic_cmpxchg(&lock->readers, r, r + 1);
19091 + if (likely(old == r))
19098 +void __sched __read_rt_lock(struct rt_rw_lock *lock)
19100 + struct rt_mutex *m = &lock->rtmutex;
19101 + struct rt_mutex_waiter waiter;
19102 + unsigned long flags;
19104 + if (__read_rt_trylock(lock))
19107 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19109 + * Allow readers as long as the writer has not completely
19110 + * acquired the lock for write.
19112 + if (atomic_read(&lock->readers) != WRITER_BIAS) {
19113 + atomic_inc(&lock->readers);
19114 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19119 + * Call into the slow lock path with the rtmutex->wait_lock
19120 + * held, so this can't result in the following race:
19122 + * Reader1 Reader2 Writer
19125 + * rtmutex_lock(m)
19128 + * unlock(m->wait_lock)
19131 + * lock(m->wait_lock)
19132 + * lock->writelocked=true
19133 + * unlock(m->wait_lock)
19136 + * lock->writelocked=false
19137 + * rtmutex_unlock(m)
19140 + * rtmutex_lock(m)
19142 + * rtmutex_lock(m)
19144 + * That would put Reader1 behind the writer waiting on
19145 + * Reader2 to call read_unlock(), which might take arbitrarily long.
19147 + rt_mutex_init_waiter(&waiter, false);
19148 + rt_spin_lock_slowlock_locked(m, &waiter, flags);
19150 + * The slowlock() above is guaranteed to return with the rtmutex
19151 + * now held, so there can't be a writer active. Increment the reader
19152 + * count and immediately drop the rtmutex again.
19154 + atomic_inc(&lock->readers);
19155 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19156 + rt_spin_lock_slowunlock(m);
19158 + debug_rt_mutex_free_waiter(&waiter);
19161 +void __read_rt_unlock(struct rt_rw_lock *lock)
19163 + struct rt_mutex *m = &lock->rtmutex;
19164 + struct task_struct *tsk;
19167 + * lock->readers can only hit 0 when a writer is waiting for the
19168 + * active readers to leave the critical region.
19170 + if (!atomic_dec_and_test(&lock->readers))
19173 + raw_spin_lock_irq(&m->wait_lock);
19175 + * Wake the writer, i.e. the rtmutex owner. It might release the
19176 + * rtmutex concurrently in the fast path, but to clean up the rw
19177 + * lock it needs to acquire m->wait_lock. The worst case which can
19178 + * happen is a spurious wakeup.
19180 + tsk = rt_mutex_owner(m);
19182 + wake_up_process(tsk);
19184 + raw_spin_unlock_irq(&m->wait_lock);
19187 +static void __write_unlock_common(struct rt_rw_lock *lock, int bias,
19188 + unsigned long flags)
19190 + struct rt_mutex *m = &lock->rtmutex;
19192 + atomic_add(READER_BIAS - bias, &lock->readers);
19193 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19194 + rt_spin_lock_slowunlock(m);
19197 +void __sched __write_rt_lock(struct rt_rw_lock *lock)
19199 + struct rt_mutex *m = &lock->rtmutex;
19200 + struct task_struct *self = current;
19201 + unsigned long flags;
19203 + /* Take the rtmutex as a first step */
19204 + __rt_spin_lock(m);
19206 + /* Force readers into slow path */
19207 + atomic_sub(READER_BIAS, &lock->readers);
19209 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19211 + raw_spin_lock(&self->pi_lock);
19212 + self->saved_state = self->state;
19213 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19214 + raw_spin_unlock(&self->pi_lock);
19217 + /* Have all readers left the critical region? */
19218 + if (!atomic_read(&lock->readers)) {
19219 + atomic_set(&lock->readers, WRITER_BIAS);
19220 + raw_spin_lock(&self->pi_lock);
19221 + __set_current_state_no_track(self->saved_state);
19222 + self->saved_state = TASK_RUNNING;
19223 + raw_spin_unlock(&self->pi_lock);
19224 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19228 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19230 + if (atomic_read(&lock->readers) != 0)
19233 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19235 + raw_spin_lock(&self->pi_lock);
19236 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19237 + raw_spin_unlock(&self->pi_lock);
19241 +int __write_rt_trylock(struct rt_rw_lock *lock)
19243 + struct rt_mutex *m = &lock->rtmutex;
19244 + unsigned long flags;
19246 + if (!__rt_mutex_trylock(m))
19249 + atomic_sub(READER_BIAS, &lock->readers);
19251 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19252 + if (!atomic_read(&lock->readers)) {
19253 + atomic_set(&lock->readers, WRITER_BIAS);
19254 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19257 + __write_unlock_common(lock, 0, flags);
19261 +void __write_rt_unlock(struct rt_rw_lock *lock)
19263 + struct rt_mutex *m = &lock->rtmutex;
19264 + unsigned long flags;
19266 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19267 + __write_unlock_common(lock, WRITER_BIAS, flags);
19270 +/* Map the reader biased implementation */
19271 +static inline int do_read_rt_trylock(rwlock_t *rwlock)
19273 + return __read_rt_trylock(rwlock);
19276 +static inline int do_write_rt_trylock(rwlock_t *rwlock)
19278 + return __write_rt_trylock(rwlock);
19281 +static inline void do_read_rt_lock(rwlock_t *rwlock)
19283 + __read_rt_lock(rwlock);
19286 +static inline void do_write_rt_lock(rwlock_t *rwlock)
19288 + __write_rt_lock(rwlock);
19291 +static inline void do_read_rt_unlock(rwlock_t *rwlock)
19293 + __read_rt_unlock(rwlock);
19296 +static inline void do_write_rt_unlock(rwlock_t *rwlock)
19298 + __write_rt_unlock(rwlock);
19301 +static inline void do_rwlock_rt_init(rwlock_t *rwlock, const char *name,
19302 + struct lock_class_key *key)
19304 + __rwlock_biased_rt_init(rwlock, name, key);
19307 +int __lockfunc rt_read_can_lock(rwlock_t *rwlock)
19309 + return atomic_read(&rwlock->readers) < 0;
19312 +int __lockfunc rt_write_can_lock(rwlock_t *rwlock)
19314 + return atomic_read(&rwlock->readers) == READER_BIAS;
19318 + * The common functions which get wrapped into the rwlock API.
19320 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
19324 + sleeping_lock_inc();
19325 + migrate_disable();
19326 + ret = do_read_rt_trylock(rwlock);
19328 + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
19330 + migrate_enable();
19331 + sleeping_lock_dec();
19335 +EXPORT_SYMBOL(rt_read_trylock);
19337 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
19341 + sleeping_lock_inc();
19342 + migrate_disable();
19343 + ret = do_write_rt_trylock(rwlock);
19345 + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
19347 + migrate_enable();
19348 + sleeping_lock_dec();
19352 +EXPORT_SYMBOL(rt_write_trylock);
19354 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
19356 + sleeping_lock_inc();
19357 + migrate_disable();
19358 + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
19359 + do_read_rt_lock(rwlock);
19361 +EXPORT_SYMBOL(rt_read_lock);
19363 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
19365 + sleeping_lock_inc();
19366 + migrate_disable();
19367 + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19368 + do_write_rt_lock(rwlock);
19370 +EXPORT_SYMBOL(rt_write_lock);
19372 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
19374 + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19375 + do_read_rt_unlock(rwlock);
19376 + migrate_enable();
19377 + sleeping_lock_dec();
19379 +EXPORT_SYMBOL(rt_read_unlock);
19381 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
19383 + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19384 + do_write_rt_unlock(rwlock);
19385 + migrate_enable();
19386 + sleeping_lock_dec();
19388 +EXPORT_SYMBOL(rt_write_unlock);
19390 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
19392 + do_rwlock_rt_init(rwlock, name, key);
19394 +EXPORT_SYMBOL(__rt_rwlock_init);
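
rwlock-rt.c implements the reader-biased scheme from its header comment: lock->readers starts at READER_BIAS (negative as a signed int), so readers only need an atomic increment on the fast path, while a writer first takes the rtmutex, pulls the bias out to push new readers into the slow path, and waits for the active readers to drain. A condensed restatement of __write_rt_lock() above (sketch only; the saved_state and wait_lock handling is elided):

	__rt_spin_lock(&lock->rtmutex);			/* exclude other writers */
	atomic_sub(READER_BIAS, &lock->readers);	/* new readers -> slow path */
	while (atomic_read(&lock->readers))
		schedule();				/* woken by __read_rt_unlock() */
	atomic_set(&lock->readers, WRITER_BIAS);	/* mark it write locked */
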
19395 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rwsem-rt.c linux-4.14/kernel/locking/rwsem-rt.c
19396 --- linux-4.14.orig/kernel/locking/rwsem-rt.c 1970-01-01 01:00:00.000000000 +0100
19397 +++ linux-4.14/kernel/locking/rwsem-rt.c 2018-09-05 11:05:07.000000000 +0200
19401 +#include <linux/rwsem.h>
19402 +#include <linux/sched/debug.h>
19403 +#include <linux/sched/signal.h>
19404 +#include <linux/export.h>
19406 +#include "rtmutex_common.h"
19409 + * RT-specific reader/writer semaphores
19412 + * 1) Lock sem->rtmutex
19413 + * 2) Remove the reader BIAS to force readers into the slow path
19414 + * 3) Wait until all readers have left the critical region
19415 + * 4) Mark it write locked
19418 + * 1) Remove the write locked marker
19419 + * 2) Set the reader BIAS so readers can use the fast path again
19420 + * 3) Unlock sem->rtmutex to release blocked readers
19423 + * 1) Try fast path acquisition (reader BIAS is set)
19424 + * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag
19425 + * 3) If !writelocked, acquire it for read
19426 + * 4) If writelocked, block on sem->rtmutex
19427 + * 5) unlock sem->rtmutex, goto 1)
19430 + * 1) Try fast path release (reader count != 1)
19431 + * 2) Wake the writer waiting in down_write()#3
19433 + * down_read()#3 has the consequence that rw semaphores on RT are not writer
19434 + * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
19435 + * are subject to the rtmutex priority/DL inheritance mechanism.
19437 + * It's possible to make the rw semaphores writer fair by keeping a list of
19438 + * active readers. A blocked writer would force all newly incoming readers to
19439 + * block on the rtmutex, but the rtmutex would have to be proxy locked for one
19440 + * reader after the other. We can't use multi-reader inheritance because there
19441 + * is no way to support that with SCHED_DEADLINE. Implementing the one by one
19442 + * reader boosting/handover mechanism is a major surgery for a very dubious
19445 + * The risk of writer starvation is there, but the pathological use cases
19446 + * which trigger it are not necessarily the typical RT workloads.
19449 +void __rwsem_init(struct rw_semaphore *sem, const char *name,
19450 + struct lock_class_key *key)
19452 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19454 + * Make sure we are not reinitializing a held semaphore:
19456 + debug_check_no_locks_freed((void *)sem, sizeof(*sem));
19457 + lockdep_init_map(&sem->dep_map, name, key, 0);
19459 + atomic_set(&sem->readers, READER_BIAS);
19461 +EXPORT_SYMBOL(__rwsem_init);
19463 +int __down_read_trylock(struct rw_semaphore *sem)
19468 + * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
19471 + for (r = atomic_read(&sem->readers); r < 0;) {
19472 + old = atomic_cmpxchg(&sem->readers, r, r + 1);
19473 + if (likely(old == r))
19480 +void __sched __down_read(struct rw_semaphore *sem)
19482 + struct rt_mutex *m = &sem->rtmutex;
19483 + struct rt_mutex_waiter waiter;
19485 + if (__down_read_trylock(sem))
19489 + raw_spin_lock_irq(&m->wait_lock);
19491 + * Allow readers as long as the writer has not completely
19492 + * acquired the semaphore for write.
19494 + if (atomic_read(&sem->readers) != WRITER_BIAS) {
19495 + atomic_inc(&sem->readers);
19496 + raw_spin_unlock_irq(&m->wait_lock);
19501 + * Call into the slow lock path with the rtmutex->wait_lock
19502 + * held, so this can't result in the following race:
19504 + * Reader1 Reader2 Writer
19507 + * rtmutex_lock(m)
19510 + * unlock(m->wait_lock)
19513 + * lock(m->wait_lock)
19514 + * sem->writelocked=true
19515 + * unlock(m->wait_lock)
19518 + * sem->writelocked=false
19519 + * rtmutex_unlock(m)
19522 + * rtmutex_lock(m)
19524 + * rtmutex_lock(m)
19526 + * That would put Reader1 behind the writer waiting on
19527 + * Reader2 to call up_read(), which might take arbitrarily long.
19529 + rt_mutex_init_waiter(&waiter, false);
19530 + rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
19531 + RT_MUTEX_MIN_CHAINWALK, NULL,
19534 + * The slowlock() above is guaranteed to return with the rtmutex
19535 + * now held, so there can't be a writer active. Increment the reader
19536 + * count and immediately drop the rtmutex again.
19538 + atomic_inc(&sem->readers);
19539 + raw_spin_unlock_irq(&m->wait_lock);
19540 + __rt_mutex_unlock(m);
19542 + debug_rt_mutex_free_waiter(&waiter);
19545 +void __up_read(struct rw_semaphore *sem)
19547 + struct rt_mutex *m = &sem->rtmutex;
19548 + struct task_struct *tsk;
19551 + * sem->readers can only hit 0 when a writer is waiting for the
19552 + * active readers to leave the critical region.
19554 + if (!atomic_dec_and_test(&sem->readers))
19558 + raw_spin_lock_irq(&m->wait_lock);
19560 + * Wake the writer, i.e. the rtmutex owner. It might release the
19561 + * rtmutex concurrently in the fast path (due to a signal), but to
19562 + * clean up the rwsem it needs to acquire m->wait_lock. The worst
19563 + * case which can happen is a spurious wakeup.
19565 + tsk = rt_mutex_owner(m);
19567 + wake_up_process(tsk);
19569 + raw_spin_unlock_irq(&m->wait_lock);
19572 +static void __up_write_unlock(struct rw_semaphore *sem, int bias,
19573 + unsigned long flags)
19575 + struct rt_mutex *m = &sem->rtmutex;
19577 + atomic_add(READER_BIAS - bias, &sem->readers);
19578 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19579 + __rt_mutex_unlock(m);
19582 +static int __sched __down_write_common(struct rw_semaphore *sem, int state)
19584 + struct rt_mutex *m = &sem->rtmutex;
19585 + unsigned long flags;
19587 + /* Take the rtmutex as a first step */
19588 + if (__rt_mutex_lock_state(m, state))
19591 + /* Force readers into slow path */
19592 + atomic_sub(READER_BIAS, &sem->readers);
19595 + set_current_state(state);
19597 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19598 + /* Have all readers left the critical region? */
19599 + if (!atomic_read(&sem->readers)) {
19600 + atomic_set(&sem->readers, WRITER_BIAS);
19601 + __set_current_state(TASK_RUNNING);
19602 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19606 + if (signal_pending_state(state, current)) {
19607 + __set_current_state(TASK_RUNNING);
19608 + __up_write_unlock(sem, 0, flags);
19611 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19613 + if (atomic_read(&sem->readers) != 0) {
19615 + set_current_state(state);
19620 +void __sched __down_write(struct rw_semaphore *sem)
19622 + __down_write_common(sem, TASK_UNINTERRUPTIBLE);
19625 +int __sched __down_write_killable(struct rw_semaphore *sem)
19627 + return __down_write_common(sem, TASK_KILLABLE);
19630 +int __down_write_trylock(struct rw_semaphore *sem)
19632 + struct rt_mutex *m = &sem->rtmutex;
19633 + unsigned long flags;
19635 + if (!__rt_mutex_trylock(m))
19638 + atomic_sub(READER_BIAS, &sem->readers);
19640 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19641 + if (!atomic_read(&sem->readers)) {
19642 + atomic_set(&sem->readers, WRITER_BIAS);
19643 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19646 + __up_write_unlock(sem, 0, flags);
19650 +void __up_write(struct rw_semaphore *sem)
19652 + struct rt_mutex *m = &sem->rtmutex;
19653 + unsigned long flags;
19655 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19656 + __up_write_unlock(sem, WRITER_BIAS, flags);
19659 +void __downgrade_write(struct rw_semaphore *sem)
19661 + struct rt_mutex *m = &sem->rtmutex;
19662 + unsigned long flags;
19664 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19665 + /* Release it and account current as reader */
19666 + __up_write_unlock(sem, WRITER_BIAS - 1, flags);
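
The write-side release paths above all go through __up_write_unlock(sem, bias, flags), which adds READER_BIAS - bias to sem->readers. For a failed acquisition (trylock failure or a fatal signal) bias is 0, which simply re-adds the READER_BIAS the writer had removed. For the paths that start from the write-locked value WRITER_BIAS, the arithmetic works out to:

	__up_write():        bias = WRITER_BIAS      ->  readers = READER_BIAS      (fully released)
	__downgrade_write(): bias = WRITER_BIAS - 1  ->  readers = READER_BIAS + 1  (released for writers, caller counted as one reader)

so a downgrading writer continues as an ordinary reader and drops that count later via up_read().
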
19668 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/spinlock.c linux-4.14/kernel/locking/spinlock.c
19669 --- linux-4.14.orig/kernel/locking/spinlock.c 2017-11-12 19:46:13.000000000 +0100
19670 +++ linux-4.14/kernel/locking/spinlock.c 2018-09-05 11:05:07.000000000 +0200
19671 @@ -125,8 +125,11 @@
19672 * __[spin|read|write]_lock_bh()
19674 BUILD_LOCK_OPS(spin, raw_spinlock);
19676 +#ifndef CONFIG_PREEMPT_RT_FULL
19677 BUILD_LOCK_OPS(read, rwlock);
19678 BUILD_LOCK_OPS(write, rwlock);
19683 @@ -210,6 +213,8 @@
19684 EXPORT_SYMBOL(_raw_spin_unlock_bh);
19687 +#ifndef CONFIG_PREEMPT_RT_FULL
19689 #ifndef CONFIG_INLINE_READ_TRYLOCK
19690 int __lockfunc _raw_read_trylock(rwlock_t *lock)
19692 @@ -354,6 +359,8 @@
19693 EXPORT_SYMBOL(_raw_write_unlock_bh);
19696 +#endif /* !PREEMPT_RT_FULL */
19698 #ifdef CONFIG_DEBUG_LOCK_ALLOC
19700 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
19701 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/spinlock_debug.c linux-4.14/kernel/locking/spinlock_debug.c
19702 --- linux-4.14.orig/kernel/locking/spinlock_debug.c 2017-11-12 19:46:13.000000000 +0100
19703 +++ linux-4.14/kernel/locking/spinlock_debug.c 2018-09-05 11:05:07.000000000 +0200
19706 EXPORT_SYMBOL(__raw_spin_lock_init);
19708 +#ifndef CONFIG_PREEMPT_RT_FULL
19709 void __rwlock_init(rwlock_t *lock, const char *name,
19710 struct lock_class_key *key)
19715 EXPORT_SYMBOL(__rwlock_init);
19718 static void spin_dump(raw_spinlock_t *lock, const char *msg)
19720 @@ -135,6 +137,7 @@
19721 arch_spin_unlock(&lock->raw_lock);
19724 +#ifndef CONFIG_PREEMPT_RT_FULL
19725 static void rwlock_bug(rwlock_t *lock, const char *msg)
19727 if (!debug_locks_off())
19728 @@ -224,3 +227,5 @@
19729 debug_write_unlock(lock);
19730 arch_write_unlock(&lock->raw_lock);
19734 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/panic.c linux-4.14/kernel/panic.c
19735 --- linux-4.14.orig/kernel/panic.c 2017-11-12 19:46:13.000000000 +0100
19736 +++ linux-4.14/kernel/panic.c 2018-09-05 11:05:07.000000000 +0200
19737 @@ -482,9 +482,11 @@
19739 static int init_oops_id(void)
19741 +#ifndef CONFIG_PREEMPT_RT_FULL
19743 get_random_bytes(&oops_id, sizeof(oops_id));
19749 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/power/hibernate.c linux-4.14/kernel/power/hibernate.c
19750 --- linux-4.14.orig/kernel/power/hibernate.c 2017-11-12 19:46:13.000000000 +0100
19751 +++ linux-4.14/kernel/power/hibernate.c 2018-09-05 11:05:07.000000000 +0200
19752 @@ -287,6 +287,8 @@
19754 local_irq_disable();
19756 + system_state = SYSTEM_SUSPEND;
19758 error = syscore_suspend();
19760 pr_err("Some system devices failed to power down, aborting hibernation\n");
19761 @@ -317,6 +319,7 @@
19765 + system_state = SYSTEM_RUNNING;
19766 local_irq_enable();
19769 @@ -445,6 +448,7 @@
19772 local_irq_disable();
19773 + system_state = SYSTEM_SUSPEND;
19775 error = syscore_suspend();
19777 @@ -478,6 +482,7 @@
19781 + system_state = SYSTEM_RUNNING;
19782 local_irq_enable();
19785 @@ -563,6 +568,7 @@
19788 local_irq_disable();
19789 + system_state = SYSTEM_SUSPEND;
19791 if (pm_wakeup_pending()) {
19793 @@ -575,6 +581,7 @@
19797 + system_state = SYSTEM_RUNNING;
19798 local_irq_enable();
19801 @@ -672,6 +679,10 @@
19805 +#ifndef CONFIG_SUSPEND
19806 +bool pm_in_action;
19810 * hibernate - Carry out system hibernation, including saving the image.
19812 @@ -685,6 +696,8 @@
19816 + pm_in_action = true;
19818 lock_system_sleep();
19819 /* The snapshot device should not be opened while we're running */
19820 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
19821 @@ -763,6 +776,7 @@
19822 atomic_inc(&snapshot_device_available);
19824 unlock_system_sleep();
19825 + pm_in_action = false;
19826 pr_info("hibernation exit\n");
19829 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/power/suspend.c linux-4.14/kernel/power/suspend.c
19830 --- linux-4.14.orig/kernel/power/suspend.c 2018-09-05 11:03:22.000000000 +0200
19831 +++ linux-4.14/kernel/power/suspend.c 2018-09-05 11:05:07.000000000 +0200
19832 @@ -428,6 +428,8 @@
19833 arch_suspend_disable_irqs();
19834 BUG_ON(!irqs_disabled());
19836 + system_state = SYSTEM_SUSPEND;
19838 error = syscore_suspend();
19840 *wakeup = pm_wakeup_pending();
19841 @@ -443,6 +445,8 @@
19845 + system_state = SYSTEM_RUNNING;
19847 arch_suspend_enable_irqs();
19848 BUG_ON(irqs_disabled());
19850 @@ -589,6 +593,8 @@
19854 +bool pm_in_action;
19857 * pm_suspend - Externally visible function for suspending the system.
19858 * @state: System sleep state to enter.
19859 @@ -603,6 +609,7 @@
19860 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
19863 + pm_in_action = true;
19864 pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
19865 error = enter_state(state);
19867 @@ -612,6 +619,7 @@
19868 suspend_stats.success++;
19870 pr_info("suspend exit\n");
19871 + pm_in_action = false;
19874 EXPORT_SYMBOL(pm_suspend);
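
Both the hibernation and suspend paths now bracket syscore_suspend() with system_state = SYSTEM_SUSPEND / SYSTEM_RUNNING and maintain a pm_in_action flag. Assuming SYSTEM_SUSPEND is added to enum system_states elsewhere in this series, code that must not take sleeping locks while the machine is transitioning with interrupts disabled can gate on it, along these lines (hypothetical consumer, illustrative only):

	if (system_state == SYSTEM_SUSPEND)
		return;		/* skip work that could sleep during suspend/resume */
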
19875 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/printk/printk.c linux-4.14/kernel/printk/printk.c
19876 --- linux-4.14.orig/kernel/printk/printk.c 2017-11-12 19:46:13.000000000 +0100
19877 +++ linux-4.14/kernel/printk/printk.c 2018-09-05 11:05:07.000000000 +0200
19878 @@ -400,6 +400,65 @@
19879 printk_safe_exit_irqrestore(flags); \
19882 +#ifdef CONFIG_EARLY_PRINTK
19883 +struct console *early_console;
19885 +static void early_vprintk(const char *fmt, va_list ap)
19887 + if (early_console) {
19889 + int n = vscnprintf(buf, sizeof(buf), fmt, ap);
19891 + early_console->write(early_console, buf, n);
19895 +asmlinkage void early_printk(const char *fmt, ...)
19899 + va_start(ap, fmt);
19900 + early_vprintk(fmt, ap);
19905 + * This is independent of any log levels - a global
19906 + * kill switch that turns off all of printk.
19908 + * Used by the NMI watchdog if early-printk is enabled.
19910 +static bool __read_mostly printk_killswitch;
19912 +static int __init force_early_printk_setup(char *str)
19914 + printk_killswitch = true;
19917 +early_param("force_early_printk", force_early_printk_setup);
19919 +void printk_kill(void)
19921 + printk_killswitch = true;
19924 +#ifdef CONFIG_PRINTK
19925 +static int forced_early_printk(const char *fmt, va_list ap)
19927 + if (!printk_killswitch)
19929 + early_vprintk(fmt, ap);
19935 +static inline int forced_early_printk(const char *fmt, va_list ap)
19941 #ifdef CONFIG_PRINTK
19942 DECLARE_WAIT_QUEUE_HEAD(log_wait);
19943 /* the next printk record to read by syslog(READ) or /proc/kmsg */
19944 @@ -1348,6 +1407,8 @@
19948 + int attempts = 0;
19951 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
19953 @@ -1359,6 +1420,14 @@
19959 + if (attempts > 10) {
19966 * Find first record that fits, including all following records,
19967 * into the user-provided buffer for this dump.
19968 @@ -1371,6 +1440,14 @@
19969 len += msg_print_text(msg, true, NULL, 0);
19970 idx = log_next(idx);
19973 + if (num_msg > 5) {
19975 + logbuf_unlock_irq();
19976 + logbuf_lock_irq();
19977 + if (clear_seq < log_first_seq)
19982 /* move first record forward until length fits into the buffer */
19983 @@ -1382,6 +1459,14 @@
19984 len -= msg_print_text(msg, true, NULL, 0);
19985 idx = log_next(idx);
19988 + if (num_msg > 5) {
19990 + logbuf_unlock_irq();
19991 + logbuf_lock_irq();
19992 + if (clear_seq < log_first_seq)
19997 /* last message fitting into this dump */
19998 @@ -1420,6 +1505,7 @@
19999 clear_seq = log_next_seq;
20000 clear_idx = log_next_idx;
20003 logbuf_unlock_irq();
20006 @@ -1558,6 +1644,12 @@
20007 if (!console_drivers)
20010 + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20011 + if (in_irq() || in_nmi())
20015 + migrate_disable();
20016 for_each_console(con) {
20017 if (exclusive_console && con != exclusive_console)
20019 @@ -1573,6 +1665,7 @@
20021 con->write(con, text, len);
20023 + migrate_enable();
20026 int printk_delay_msec __read_mostly;
20027 @@ -1692,6 +1785,13 @@
20029 bool in_sched = false;
20032 + * Fall back to early_printk if a debugging subsystem has
20033 + * killed printk output
20035 + if (unlikely(forced_early_printk(fmt, args)))
20038 if (level == LOGLEVEL_SCHED) {
20039 level = LOGLEVEL_DEFAULT;
20041 @@ -1748,12 +1848,22 @@
20043 /* If called from the scheduler, we can not call up(). */
20045 + int may_trylock = 1;
20047 +#ifdef CONFIG_PREEMPT_RT_FULL
20049 + * we can't take a sleeping lock with IRQs or preemption disabled,
20050 + * so we can't print in these contexts
20052 + if (!(preempt_count() == 0 && !irqs_disabled()))
20056 * Try to acquire and then immediately release the console
20057 * semaphore. The release will print out buffers and wake up
20058 * /dev/kmsg and syslog() users.
20060 - if (console_trylock())
20061 + if (may_trylock && console_trylock())
20065 @@ -1863,26 +1973,6 @@
20067 #endif /* CONFIG_PRINTK */
20069 -#ifdef CONFIG_EARLY_PRINTK
20070 -struct console *early_console;
20072 -asmlinkage __visible void early_printk(const char *fmt, ...)
20078 - if (!early_console)
20081 - va_start(ap, fmt);
20082 - n = vscnprintf(buf, sizeof(buf), fmt, ap);
20085 - early_console->write(early_console, buf, n);
20089 static int __add_preferred_console(char *name, int idx, char *options,
20092 @@ -2229,10 +2319,15 @@
20094 raw_spin_unlock(&logbuf_lock);
20096 +#ifdef CONFIG_PREEMPT_RT_FULL
20097 + printk_safe_exit_irqrestore(flags);
20098 + call_console_drivers(ext_text, ext_len, text, len);
20100 stop_critical_timings(); /* don't trace print latency */
20101 call_console_drivers(ext_text, ext_len, text, len);
20102 start_critical_timings();
20103 printk_safe_exit_irqrestore(flags);
20106 if (do_cond_resched)
20108 @@ -2286,6 +2381,11 @@
20112 + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20113 + if (in_irq() || in_nmi())
20118 * console_unblank can no longer be called in interrupt context unless
20119 * oops_in_progress is set to 1..
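
The printk changes add an early-console fallback: booting with force_early_printk, or calling printk_kill(), sets a global killswitch that makes subsequent printk invocations bypass the normal, lock-taking path and write through early_printk() instead. A debugging facility such as a lockup detector would use it roughly like this (illustrative sketch; the printk_kill() declaration lives in a header hunk not shown here):

	printk_kill();		/* silence the regular printk path */
	early_printk("watchdog: CPU%d stuck, dumping via early console\n",
		     smp_processor_id());
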
20120 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/ptrace.c linux-4.14/kernel/ptrace.c
20121 --- linux-4.14.orig/kernel/ptrace.c 2017-11-12 19:46:13.000000000 +0100
20122 +++ linux-4.14/kernel/ptrace.c 2018-09-05 11:05:07.000000000 +0200
20123 @@ -175,7 +175,14 @@
20125 spin_lock_irq(&task->sighand->siglock);
20126 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
20127 - task->state = __TASK_TRACED;
20128 + unsigned long flags;
20130 + raw_spin_lock_irqsave(&task->pi_lock, flags);
20131 + if (task->state & __TASK_TRACED)
20132 + task->state = __TASK_TRACED;
20134 + task->saved_state = __TASK_TRACED;
20135 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20138 spin_unlock_irq(&task->sighand->siglock);
20139 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/Kconfig linux-4.14/kernel/rcu/Kconfig
20140 --- linux-4.14.orig/kernel/rcu/Kconfig 2017-11-12 19:46:13.000000000 +0100
20141 +++ linux-4.14/kernel/rcu/Kconfig 2018-09-05 11:05:07.000000000 +0200
20145 bool "Make expert-level adjustments to RCU configuration"
20147 + default y if PREEMPT_RT_FULL
20149 This option needs to be enabled if you wish to make
20150 expert-level adjustments to RCU configuration. By default,
20151 @@ -172,7 +172,7 @@
20153 config RCU_FAST_NO_HZ
20154 bool "Accelerate last non-dyntick-idle CPU's grace periods"
20155 - depends on NO_HZ_COMMON && SMP && RCU_EXPERT
20156 + depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
20159 This option permits CPUs to enter dynticks-idle state even if
20160 @@ -191,7 +191,7 @@
20162 bool "Enable RCU priority boosting"
20163 depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
20165 + default y if PREEMPT_RT_FULL
20167 This option boosts the priority of preempted RCU readers that
20168 block the current preemptible RCU grace period for too long.
20169 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/rcu.h linux-4.14/kernel/rcu/rcu.h
20170 --- linux-4.14.orig/kernel/rcu/rcu.h 2017-11-12 19:46:13.000000000 +0100
20171 +++ linux-4.14/kernel/rcu/rcu.h 2018-09-05 11:05:07.000000000 +0200
20172 @@ -462,18 +462,26 @@
20173 extern unsigned long rcutorture_testseq;
20174 extern unsigned long rcutorture_vernum;
20175 unsigned long rcu_batches_started(void);
20176 -unsigned long rcu_batches_started_bh(void);
20177 unsigned long rcu_batches_started_sched(void);
20178 unsigned long rcu_batches_completed(void);
20179 -unsigned long rcu_batches_completed_bh(void);
20180 unsigned long rcu_batches_completed_sched(void);
20181 unsigned long rcu_exp_batches_completed(void);
20182 unsigned long rcu_exp_batches_completed_sched(void);
20183 unsigned long srcu_batches_completed(struct srcu_struct *sp);
20184 void show_rcu_gp_kthreads(void);
20185 void rcu_force_quiescent_state(void);
20186 -void rcu_bh_force_quiescent_state(void);
20187 void rcu_sched_force_quiescent_state(void);
20189 +#ifndef CONFIG_PREEMPT_RT_FULL
20190 +void rcu_bh_force_quiescent_state(void);
20191 +unsigned long rcu_batches_started_bh(void);
20192 +unsigned long rcu_batches_completed_bh(void);
20194 +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state
20195 +# define rcu_batches_completed_bh rcu_batches_completed
20196 +# define rcu_batches_started_bh rcu_batches_completed
20199 #endif /* #else #ifdef CONFIG_TINY_RCU */
20201 #ifdef CONFIG_RCU_NOCB_CPU
20202 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/rcu_segcblist.c linux-4.14/kernel/rcu/rcu_segcblist.c
20203 --- linux-4.14.orig/kernel/rcu/rcu_segcblist.c 2017-11-12 19:46:13.000000000 +0100
20204 +++ linux-4.14/kernel/rcu/rcu_segcblist.c 2018-09-05 11:05:07.000000000 +0200
20206 #include <linux/types.h>
20207 #include <linux/kernel.h>
20208 #include <linux/interrupt.h>
20209 +#include <linux/rcupdate.h>
20211 #include "rcu_segcblist.h"
20213 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/rcutorture.c linux-4.14/kernel/rcu/rcutorture.c
20214 --- linux-4.14.orig/kernel/rcu/rcutorture.c 2017-11-12 19:46:13.000000000 +0100
20215 +++ linux-4.14/kernel/rcu/rcutorture.c 2018-09-05 11:05:07.000000000 +0200
20216 @@ -417,6 +417,7 @@
20220 +#ifndef CONFIG_PREEMPT_RT_FULL
20222 * Definitions for rcu_bh torture testing.
20224 @@ -456,6 +457,12 @@
20229 +static struct rcu_torture_ops rcu_bh_ops = {
20230 + .ttype = INVALID_RCU_FLAVOR,
20235 * Don't even think about trying any of these in real life!!!
20236 * The names includes "busted", and they really means it!
20237 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/srcutree.c linux-4.14/kernel/rcu/srcutree.c
20238 --- linux-4.14.orig/kernel/rcu/srcutree.c 2017-11-12 19:46:13.000000000 +0100
20239 +++ linux-4.14/kernel/rcu/srcutree.c 2018-09-05 11:05:07.000000000 +0200
20241 #include <linux/delay.h>
20242 #include <linux/module.h>
20243 #include <linux/srcu.h>
20244 +#include <linux/cpu.h>
20245 +#include <linux/locallock.h>
20248 #include "rcu_segcblist.h"
20250 static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
20251 static void process_srcu(struct work_struct *work);
20253 +/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
20254 +#define spin_lock_rcu_node(p) \
20256 + spin_lock(&ACCESS_PRIVATE(p, lock)); \
20257 + smp_mb__after_unlock_lock(); \
20260 +#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
20262 +#define spin_lock_irq_rcu_node(p) \
20264 + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
20265 + smp_mb__after_unlock_lock(); \
20268 +#define spin_unlock_irq_rcu_node(p) \
20269 + spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
20271 +#define spin_lock_irqsave_rcu_node(p, flags) \
20273 + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
20274 + smp_mb__after_unlock_lock(); \
20277 +#define spin_unlock_irqrestore_rcu_node(p, flags) \
20278 + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
20281 * Initialize SRCU combining tree. Note that statically allocated
20282 * srcu_struct structures might already have srcu_read_lock() and
20285 /* Each pass through this loop initializes one srcu_node structure. */
20286 rcu_for_each_node_breadth_first(sp, snp) {
20287 - raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20288 + spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20289 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
20290 ARRAY_SIZE(snp->srcu_data_have_cbs));
20291 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
20292 @@ -111,7 +140,7 @@
20293 snp_first = sp->level[level];
20294 for_each_possible_cpu(cpu) {
20295 sdp = per_cpu_ptr(sp->sda, cpu);
20296 - raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20297 + spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20298 rcu_segcblist_init(&sdp->srcu_cblist);
20299 sdp->srcu_cblist_invoking = false;
20300 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
20301 @@ -170,7 +199,7 @@
20302 /* Don't re-initialize a lock while it is held. */
20303 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
20304 lockdep_init_map(&sp->dep_map, name, key, 0);
20305 - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20306 + spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20307 return init_srcu_struct_fields(sp, false);
20309 EXPORT_SYMBOL_GPL(__init_srcu_struct);
20310 @@ -187,7 +216,7 @@
20312 int init_srcu_struct(struct srcu_struct *sp)
20314 - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20315 + spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20316 return init_srcu_struct_fields(sp, false);
20318 EXPORT_SYMBOL_GPL(init_srcu_struct);
20319 @@ -210,13 +239,13 @@
20320 /* The smp_load_acquire() pairs with the smp_store_release(). */
20321 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
20322 return; /* Already initialized. */
20323 - raw_spin_lock_irqsave_rcu_node(sp, flags);
20324 + spin_lock_irqsave_rcu_node(sp, flags);
20325 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
20326 - raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20327 + spin_unlock_irqrestore_rcu_node(sp, flags);
20330 init_srcu_struct_fields(sp, true);
20331 - raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20332 + spin_unlock_irqrestore_rcu_node(sp, flags);
20336 @@ -425,21 +454,6 @@
20340 - * Track online CPUs to guide callback workqueue placement.
20342 -DEFINE_PER_CPU(bool, srcu_online);
20344 -void srcu_online_cpu(unsigned int cpu)
20346 - WRITE_ONCE(per_cpu(srcu_online, cpu), true);
20349 -void srcu_offline_cpu(unsigned int cpu)
20351 - WRITE_ONCE(per_cpu(srcu_online, cpu), false);
20355 * Place the workqueue handler on the specified CPU if online, otherwise
20356 * just run it wherever. This is useful for placing workqueue handlers
20357 * that are to invoke the specified CPU's callbacks.
20358 @@ -450,12 +464,12 @@
20362 - preempt_disable();
20363 - if (READ_ONCE(per_cpu(srcu_online, cpu)))
20364 + cpus_read_lock();
20365 + if (cpu_online(cpu))
20366 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
20368 ret = queue_delayed_work(wq, dwork, delay);
20369 - preempt_enable();
20370 + cpus_read_unlock();
20374 @@ -513,7 +527,7 @@
20375 mutex_lock(&sp->srcu_cb_mutex);
20377 /* End the current grace period. */
20378 - raw_spin_lock_irq_rcu_node(sp);
20379 + spin_lock_irq_rcu_node(sp);
20380 idx = rcu_seq_state(sp->srcu_gp_seq);
20381 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
20382 cbdelay = srcu_get_delay(sp);
20383 @@ -522,7 +536,7 @@
20384 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
20385 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
20386 sp->srcu_gp_seq_needed_exp = gpseq;
20387 - raw_spin_unlock_irq_rcu_node(sp);
20388 + spin_unlock_irq_rcu_node(sp);
20389 mutex_unlock(&sp->srcu_gp_mutex);
20390 /* A new grace period can start at this point. But only one. */
20392 @@ -530,7 +544,7 @@
20393 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
20394 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
20395 rcu_for_each_node_breadth_first(sp, snp) {
20396 - raw_spin_lock_irq_rcu_node(snp);
20397 + spin_lock_irq_rcu_node(snp);
20399 if (snp >= sp->level[rcu_num_lvls - 1])
20400 cbs = snp->srcu_have_cbs[idx] == gpseq;
20401 @@ -540,7 +554,7 @@
20402 snp->srcu_gp_seq_needed_exp = gpseq;
20403 mask = snp->srcu_data_have_cbs[idx];
20404 snp->srcu_data_have_cbs[idx] = 0;
20405 - raw_spin_unlock_irq_rcu_node(snp);
20406 + spin_unlock_irq_rcu_node(snp);
20408 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
20410 @@ -548,11 +562,11 @@
20411 if (!(gpseq & counter_wrap_check))
20412 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
20413 sdp = per_cpu_ptr(sp->sda, cpu);
20414 - raw_spin_lock_irqsave_rcu_node(sdp, flags);
20415 + spin_lock_irqsave_rcu_node(sdp, flags);
20416 if (ULONG_CMP_GE(gpseq,
20417 sdp->srcu_gp_seq_needed + 100))
20418 sdp->srcu_gp_seq_needed = gpseq;
20419 - raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
20420 + spin_unlock_irqrestore_rcu_node(sdp, flags);
20424 @@ -560,17 +574,17 @@
20425 mutex_unlock(&sp->srcu_cb_mutex);
20427 /* Start a new grace period if needed. */
20428 - raw_spin_lock_irq_rcu_node(sp);
20429 + spin_lock_irq_rcu_node(sp);
20430 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
20431 if (!rcu_seq_state(gpseq) &&
20432 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
20434 - raw_spin_unlock_irq_rcu_node(sp);
20435 + spin_unlock_irq_rcu_node(sp);
20436 /* Throttle expedited grace periods: Should be rare! */
20437 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
20438 ? 0 : SRCU_INTERVAL);
20440 - raw_spin_unlock_irq_rcu_node(sp);
20441 + spin_unlock_irq_rcu_node(sp);
20445 @@ -590,18 +604,18 @@
20446 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
20447 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
20449 - raw_spin_lock_irqsave_rcu_node(snp, flags);
20450 + spin_lock_irqsave_rcu_node(snp, flags);
20451 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
20452 - raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20453 + spin_unlock_irqrestore_rcu_node(snp, flags);
20456 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
20457 - raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20458 + spin_unlock_irqrestore_rcu_node(snp, flags);
20460 - raw_spin_lock_irqsave_rcu_node(sp, flags);
20461 + spin_lock_irqsave_rcu_node(sp, flags);
20462 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
20463 sp->srcu_gp_seq_needed_exp = s;
20464 - raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20465 + spin_unlock_irqrestore_rcu_node(sp, flags);
20469 @@ -623,12 +637,12 @@
20470 for (; snp != NULL; snp = snp->srcu_parent) {
20471 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
20472 return; /* GP already done and CBs recorded. */
20473 - raw_spin_lock_irqsave_rcu_node(snp, flags);
20474 + spin_lock_irqsave_rcu_node(snp, flags);
20475 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
20476 snp_seq = snp->srcu_have_cbs[idx];
20477 if (snp == sdp->mynode && snp_seq == s)
20478 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
20479 - raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20480 + spin_unlock_irqrestore_rcu_node(snp, flags);
20481 if (snp == sdp->mynode && snp_seq != s) {
20482 srcu_schedule_cbs_sdp(sdp, do_norm
20484 @@ -644,11 +658,11 @@
20485 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
20486 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
20487 snp->srcu_gp_seq_needed_exp = s;
20488 - raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20489 + spin_unlock_irqrestore_rcu_node(snp, flags);
20492 /* Top of tree, must ensure the grace period will be started. */
20493 - raw_spin_lock_irqsave_rcu_node(sp, flags);
20494 + spin_lock_irqsave_rcu_node(sp, flags);
20495 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
20497 * Record need for grace period s. Pair with load
20498 @@ -667,7 +681,7 @@
20499 queue_delayed_work(system_power_efficient_wq, &sp->work,
20500 srcu_get_delay(sp));
20502 - raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20503 + spin_unlock_irqrestore_rcu_node(sp, flags);
20507 @@ -736,6 +750,8 @@
20508 * negligible when amortized over that time period, and the extra latency
20509 * of a needlessly non-expedited grace period is similarly negligible.
20511 +static DEFINE_LOCAL_IRQ_LOCK(sp_llock);
20513 static bool srcu_might_be_idle(struct srcu_struct *sp)
20515 unsigned long curseq;
20516 @@ -744,13 +760,13 @@
20519 /* If the local srcu_data structure has callbacks, not idle. */
20520 - local_irq_save(flags);
20521 + local_lock_irqsave(sp_llock, flags);
20522 sdp = this_cpu_ptr(sp->sda);
20523 if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
20524 - local_irq_restore(flags);
20525 + local_unlock_irqrestore(sp_llock, flags);
20526 return false; /* Callbacks already present, so not idle. */
20528 - local_irq_restore(flags);
20529 + local_unlock_irqrestore(sp_llock, flags);
20532 * No local callbacks, so probabilistically probe global state.
20533 @@ -828,9 +844,9 @@
20537 - local_irq_save(flags);
20538 + local_lock_irqsave(sp_llock, flags);
20539 sdp = this_cpu_ptr(sp->sda);
20540 - raw_spin_lock_rcu_node(sdp);
20541 + spin_lock_rcu_node(sdp);
20542 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
20543 rcu_segcblist_advance(&sdp->srcu_cblist,
20544 rcu_seq_current(&sp->srcu_gp_seq));
20545 @@ -844,7 +860,8 @@
20546 sdp->srcu_gp_seq_needed_exp = s;
20549 - raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
20550 + spin_unlock_rcu_node(sdp);
20551 + local_unlock_irqrestore(sp_llock, flags);
20553 srcu_funnel_gp_start(sp, sdp, s, do_norm);
20555 @@ -900,7 +917,7 @@
20558 * Make sure that later code is ordered after the SRCU grace
20559 - * period. This pairs with the raw_spin_lock_irq_rcu_node()
20560 + * period. This pairs with the spin_lock_irq_rcu_node()
20561 * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
20562 * because the current CPU might have been totally uninvolved with
20563 * (and thus unordered against) that grace period.
20564 @@ -1024,7 +1041,7 @@
20566 for_each_possible_cpu(cpu) {
20567 sdp = per_cpu_ptr(sp->sda, cpu);
20568 - raw_spin_lock_irq_rcu_node(sdp);
20569 + spin_lock_irq_rcu_node(sdp);
20570 atomic_inc(&sp->srcu_barrier_cpu_cnt);
20571 sdp->srcu_barrier_head.func = srcu_barrier_cb;
20572 debug_rcu_head_queue(&sdp->srcu_barrier_head);
20573 @@ -1033,7 +1050,7 @@
20574 debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
20575 atomic_dec(&sp->srcu_barrier_cpu_cnt);
20577 - raw_spin_unlock_irq_rcu_node(sdp);
20578 + spin_unlock_irq_rcu_node(sdp);
20581 /* Remove the initial count, at which point reaching zero can happen. */
20582 @@ -1082,17 +1099,17 @@
20584 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
20585 if (idx == SRCU_STATE_IDLE) {
20586 - raw_spin_lock_irq_rcu_node(sp);
20587 + spin_lock_irq_rcu_node(sp);
20588 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
20589 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
20590 - raw_spin_unlock_irq_rcu_node(sp);
20591 + spin_unlock_irq_rcu_node(sp);
20592 mutex_unlock(&sp->srcu_gp_mutex);
20595 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
20596 if (idx == SRCU_STATE_IDLE)
20598 - raw_spin_unlock_irq_rcu_node(sp);
20599 + spin_unlock_irq_rcu_node(sp);
20600 if (idx != SRCU_STATE_IDLE) {
20601 mutex_unlock(&sp->srcu_gp_mutex);
20602 return; /* Someone else started the grace period. */
20603 @@ -1141,19 +1158,19 @@
20604 sdp = container_of(work, struct srcu_data, work.work);
20606 rcu_cblist_init(&ready_cbs);
20607 - raw_spin_lock_irq_rcu_node(sdp);
20608 + spin_lock_irq_rcu_node(sdp);
20609 rcu_segcblist_advance(&sdp->srcu_cblist,
20610 rcu_seq_current(&sp->srcu_gp_seq));
20611 if (sdp->srcu_cblist_invoking ||
20612 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
20613 - raw_spin_unlock_irq_rcu_node(sdp);
20614 + spin_unlock_irq_rcu_node(sdp);
20615 return; /* Someone else on the job or nothing to do. */
20618 /* We are on the job! Extract and invoke ready callbacks. */
20619 sdp->srcu_cblist_invoking = true;
20620 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
20621 - raw_spin_unlock_irq_rcu_node(sdp);
20622 + spin_unlock_irq_rcu_node(sdp);
20623 rhp = rcu_cblist_dequeue(&ready_cbs);
20624 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
20625 debug_rcu_head_unqueue(rhp);
20626 @@ -1166,13 +1183,13 @@
20627 * Update counts, accelerate new callbacks, and if needed,
20628 * schedule another round of callback invocation.
20630 - raw_spin_lock_irq_rcu_node(sdp);
20631 + spin_lock_irq_rcu_node(sdp);
20632 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
20633 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
20634 rcu_seq_snap(&sp->srcu_gp_seq));
20635 sdp->srcu_cblist_invoking = false;
20636 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
20637 - raw_spin_unlock_irq_rcu_node(sdp);
20638 + spin_unlock_irq_rcu_node(sdp);
20640 srcu_schedule_cbs_sdp(sdp, 0);
20642 @@ -1185,7 +1202,7 @@
20644 bool pushgp = true;
20646 - raw_spin_lock_irq_rcu_node(sp);
20647 + spin_lock_irq_rcu_node(sp);
20648 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
20649 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
20650 /* All requests fulfilled, time to go idle. */
20651 @@ -1195,7 +1212,7 @@
20652 /* Outstanding request and no GP. Start one. */
20655 - raw_spin_unlock_irq_rcu_node(sp);
20656 + spin_unlock_irq_rcu_node(sp);
20659 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
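
Note on the srcutree.c hunks above: the raw_spin_lock_*_rcu_node() calls become the non-raw spin_lock_*_rcu_node() variants and the irq-off sections around the per-CPU srcu_data are wrapped in local_lock_irqsave(sp_llock, ...), so on PREEMPT_RT_FULL these regions stay preemptible sleeping-lock sections instead of hard-irq-off sections. Below is a minimal sketch of the same local-lock pattern, assuming the locallock.h API this series adds elsewhere; the demo_* names are illustrative and not part of the patch.

#include <linux/locallock.h>
#include <linux/percpu.h>

struct demo_pcpu {
	unsigned long hits;
};

static DEFINE_PER_CPU(struct demo_pcpu, demo_pcpu_data);
static DEFINE_LOCAL_IRQ_LOCK(demo_llock);

static void demo_count_hit(void)
{
	unsigned long flags;
	struct demo_pcpu *dp;

	/*
	 * On !RT this behaves like local_irq_save(); on RT it takes a
	 * per-CPU sleeping lock, so the section stays preemptible while
	 * still excluding other users of the same lock on this CPU.
	 */
	local_lock_irqsave(demo_llock, flags);
	dp = this_cpu_ptr(&demo_pcpu_data);
	dp->hits++;
	local_unlock_irqrestore(demo_llock, flags);
}
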
20660 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/tree.c linux-4.14/kernel/rcu/tree.c
20661 --- linux-4.14.orig/kernel/rcu/tree.c 2017-11-12 19:46:13.000000000 +0100
20662 +++ linux-4.14/kernel/rcu/tree.c 2018-09-05 11:05:07.000000000 +0200
20664 #include <linux/trace_events.h>
20665 #include <linux/suspend.h>
20666 #include <linux/ftrace.h>
20667 +#include <linux/delay.h>
20668 +#include <linux/gfp.h>
20669 +#include <linux/oom.h>
20670 +#include <linux/smpboot.h>
20671 +#include "../time/tick-internal.h"
20675 @@ -243,6 +248,19 @@
20676 this_cpu_ptr(&rcu_sched_data), true);
20679 +#ifdef CONFIG_PREEMPT_RT_FULL
20680 +static void rcu_preempt_qs(void);
20682 +void rcu_bh_qs(void)
20684 + unsigned long flags;
20686 + /* rcu_preempt_qs() must be called with irqs disabled, so disable them here. */

20687 + local_irq_save(flags);
20688 + rcu_preempt_qs();
20689 + local_irq_restore(flags);
20692 void rcu_bh_qs(void)
20694 RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
20695 @@ -253,6 +271,7 @@
20696 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
20702 * Steal a bit from the bottom of ->dynticks for idle entry/exit
20703 @@ -564,11 +583,13 @@
20705 * Return the number of RCU BH batches started thus far for debug & stats.
20707 +#ifndef CONFIG_PREEMPT_RT_FULL
20708 unsigned long rcu_batches_started_bh(void)
20710 return rcu_bh_state.gpnum;
20712 EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
20716 * Return the number of RCU batches completed thus far for debug & stats.
20717 @@ -588,6 +609,7 @@
20719 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
20721 +#ifndef CONFIG_PREEMPT_RT_FULL
20723 * Return the number of RCU BH batches completed thus far for debug & stats.
20725 @@ -596,6 +618,7 @@
20726 return rcu_bh_state.completed;
20728 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
20732 * Return the number of RCU expedited batches completed thus far for
20733 @@ -619,6 +642,7 @@
20735 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
20737 +#ifndef CONFIG_PREEMPT_RT_FULL
20739 * Force a quiescent state.
20741 @@ -637,6 +661,13 @@
20743 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
20746 +void rcu_force_quiescent_state(void)
20749 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
20753 * Force a quiescent state for RCU-sched.
20755 @@ -687,9 +718,11 @@
20759 +#ifndef CONFIG_PREEMPT_RT_FULL
20760 case RCU_BH_FLAVOR:
20761 rsp = &rcu_bh_state;
20764 case RCU_SCHED_FLAVOR:
20765 rsp = &rcu_sched_state;
20767 @@ -2918,18 +2951,17 @@
20769 * Do RCU core processing for the current CPU.
20771 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
20772 +static __latent_entropy void rcu_process_callbacks(void)
20774 struct rcu_state *rsp;
20776 if (cpu_is_offline(smp_processor_id()))
20778 - trace_rcu_utilization(TPS("Start RCU core"));
20779 for_each_rcu_flavor(rsp)
20780 __rcu_process_callbacks(rsp);
20781 - trace_rcu_utilization(TPS("End RCU core"));
20784 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
20786 * Schedule RCU callback invocation. If the specified type of RCU
20787 * does not support RCU priority boosting, just do a direct call,
20788 @@ -2941,18 +2973,105 @@
20790 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
20792 - if (likely(!rsp->boost)) {
20793 - rcu_do_batch(rsp, rdp);
20794 + rcu_do_batch(rsp, rdp);
20797 +static void rcu_wake_cond(struct task_struct *t, int status)
20800 + * If the thread is yielding, only wake it when this
20801 + * is invoked from idle
20803 + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
20804 + wake_up_process(t);
20808 + * Wake up this CPU's rcuc kthread to do RCU core processing.
20810 +static void invoke_rcu_core(void)
20812 + unsigned long flags;
20813 + struct task_struct *t;
20815 + if (!cpu_online(smp_processor_id()))
20817 + local_irq_save(flags);
20818 + __this_cpu_write(rcu_cpu_has_work, 1);
20819 + t = __this_cpu_read(rcu_cpu_kthread_task);
20820 + if (t != NULL && current != t)
20821 + rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
20822 + local_irq_restore(flags);
20825 +static void rcu_cpu_kthread_park(unsigned int cpu)
20827 + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
20830 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
20832 + return __this_cpu_read(rcu_cpu_has_work);
20836 + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
20837 + * RCU softirq used in flavors and configurations of RCU that do not
20838 + * support RCU priority boosting.
20840 +static void rcu_cpu_kthread(unsigned int cpu)
20842 + unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
20843 + char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
20846 + for (spincnt = 0; spincnt < 10; spincnt++) {
20847 + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
20848 + local_bh_disable();
20849 + *statusp = RCU_KTHREAD_RUNNING;
20850 + this_cpu_inc(rcu_cpu_kthread_loops);
20851 + local_irq_disable();
20854 + local_irq_enable();
20856 + rcu_process_callbacks();
20857 + local_bh_enable();
20858 + if (*workp == 0) {
20859 + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
20860 + *statusp = RCU_KTHREAD_WAITING;
20864 - invoke_rcu_callbacks_kthread();
20865 + *statusp = RCU_KTHREAD_YIELDING;
20866 + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
20867 + schedule_timeout_interruptible(2);
20868 + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
20869 + *statusp = RCU_KTHREAD_WAITING;
20872 -static void invoke_rcu_core(void)
20873 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
20874 + .store = &rcu_cpu_kthread_task,
20875 + .thread_should_run = rcu_cpu_kthread_should_run,
20876 + .thread_fn = rcu_cpu_kthread,
20877 + .thread_comm = "rcuc/%u",
20878 + .setup = rcu_cpu_kthread_setup,
20879 + .park = rcu_cpu_kthread_park,
20883 + * Spawn per-CPU RCU core processing kthreads.
20885 +static int __init rcu_spawn_core_kthreads(void)
20887 - if (cpu_online(smp_processor_id()))
20888 - raise_softirq(RCU_SOFTIRQ);
20891 + for_each_possible_cpu(cpu)
20892 + per_cpu(rcu_cpu_has_work, cpu) = 0;
20893 + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
20896 +early_initcall(rcu_spawn_core_kthreads);
20899 * Handle any core-RCU processing required by a call_rcu() invocation.
20900 @@ -3113,6 +3232,7 @@
20902 EXPORT_SYMBOL_GPL(call_rcu_sched);
20904 +#ifndef CONFIG_PREEMPT_RT_FULL
20906 * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
20907 * @head: structure to be used for queueing the RCU updates.
20908 @@ -3140,6 +3260,7 @@
20909 __call_rcu(head, func, &rcu_bh_state, -1, 0);
20911 EXPORT_SYMBOL_GPL(call_rcu_bh);
20915 * Queue an RCU callback for lazy invocation after a grace period.
20916 @@ -3225,6 +3346,7 @@
20918 EXPORT_SYMBOL_GPL(synchronize_sched);
20920 +#ifndef CONFIG_PREEMPT_RT_FULL
20922 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
20924 @@ -3251,6 +3373,7 @@
20925 wait_rcu_gp(call_rcu_bh);
20927 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
20931 * get_state_synchronize_rcu - Snapshot current RCU state
20932 @@ -3601,6 +3724,7 @@
20933 mutex_unlock(&rsp->barrier_mutex);
20936 +#ifndef CONFIG_PREEMPT_RT_FULL
20938 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
20940 @@ -3609,6 +3733,7 @@
20941 _rcu_barrier(&rcu_bh_state);
20943 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
20947 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
20948 @@ -3741,8 +3866,6 @@
20950 sync_sched_exp_online_cleanup(cpu);
20951 rcutree_affinity_setting(cpu, -1);
20952 - if (IS_ENABLED(CONFIG_TREE_SRCU))
20953 - srcu_online_cpu(cpu);
20957 @@ -3753,8 +3876,6 @@
20958 int rcutree_offline_cpu(unsigned int cpu)
20960 rcutree_affinity_setting(cpu, cpu);
20961 - if (IS_ENABLED(CONFIG_TREE_SRCU))
20962 - srcu_offline_cpu(cpu);
20966 @@ -4184,12 +4305,13 @@
20968 rcu_bootup_announce();
20969 rcu_init_geometry();
20970 +#ifndef CONFIG_PREEMPT_RT_FULL
20971 rcu_init_one(&rcu_bh_state);
20973 rcu_init_one(&rcu_sched_state);
20975 rcu_dump_rcu_node_tree(&rcu_sched_state);
20976 __rcu_init_preempt();
20977 - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
20980 * We don't need protection against CPU-hotplug here because
20981 @@ -4200,8 +4322,6 @@
20982 for_each_online_cpu(cpu) {
20983 rcutree_prepare_cpu(cpu);
20984 rcu_cpu_starting(cpu);
20985 - if (IS_ENABLED(CONFIG_TREE_SRCU))
20986 - srcu_online_cpu(cpu);
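
Note on the tree.c hunks above: with RCU_SOFTIRQ no longer opened, rcu_process_callbacks() is driven from per-CPU "rcuc/%u" kthreads registered through the smpboot helpers. A stripped-down sketch of that registration pattern follows, assuming only the standard <linux/smpboot.h> interface; every demo_* identifier is illustrative.

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);
static DEFINE_PER_CPU(int, demo_has_work);

static int demo_should_run(unsigned int cpu)
{
	/* Polled by the smpboot core to decide whether to call demo_fn(). */
	return __this_cpu_read(demo_has_work);
}

static void demo_fn(unsigned int cpu)
{
	/* Runs in the per-CPU kthread with preemption enabled. */
	__this_cpu_write(demo_has_work, 0);
	/* ... process the pending work here ... */
}

static struct smp_hotplug_thread demo_thread_spec = {
	.store			= &demo_task,
	.thread_should_run	= demo_should_run,
	.thread_fn		= demo_fn,
	.thread_comm		= "demo/%u",
};

static int __init demo_spawn_threads(void)
{
	return smpboot_register_percpu_thread(&demo_thread_spec);
}
early_initcall(demo_spawn_threads);
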
20990 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/tree.h linux-4.14/kernel/rcu/tree.h
20991 --- linux-4.14.orig/kernel/rcu/tree.h 2017-11-12 19:46:13.000000000 +0100
20992 +++ linux-4.14/kernel/rcu/tree.h 2018-09-05 11:05:07.000000000 +0200
20993 @@ -427,7 +427,9 @@
20995 extern struct rcu_state rcu_sched_state;
20997 +#ifndef CONFIG_PREEMPT_RT_FULL
20998 extern struct rcu_state rcu_bh_state;
21001 #ifdef CONFIG_PREEMPT_RCU
21002 extern struct rcu_state rcu_preempt_state;
21003 @@ -436,12 +438,10 @@
21004 int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
21005 bool rcu_eqs_special_set(int cpu);
21007 -#ifdef CONFIG_RCU_BOOST
21008 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21009 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
21010 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21011 DECLARE_PER_CPU(char, rcu_cpu_has_work);
21012 -#endif /* #ifdef CONFIG_RCU_BOOST */
21014 #ifndef RCU_TREE_NONCORE
21016 @@ -461,10 +461,9 @@
21017 static void __init __rcu_init_preempt(void);
21018 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
21019 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
21020 -static void invoke_rcu_callbacks_kthread(void);
21021 static bool rcu_is_callbacks_kthread(void);
21022 +static void rcu_cpu_kthread_setup(unsigned int cpu);
21023 #ifdef CONFIG_RCU_BOOST
21024 -static void rcu_preempt_do_callbacks(void);
21025 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21026 struct rcu_node *rnp);
21027 #endif /* #ifdef CONFIG_RCU_BOOST */
21028 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/tree_plugin.h linux-4.14/kernel/rcu/tree_plugin.h
21029 --- linux-4.14.orig/kernel/rcu/tree_plugin.h 2018-09-05 11:03:22.000000000 +0200
21030 +++ linux-4.14/kernel/rcu/tree_plugin.h 2018-09-05 11:05:07.000000000 +0200
21031 @@ -24,39 +24,16 @@
21032 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21035 -#include <linux/delay.h>
21036 -#include <linux/gfp.h>
21037 -#include <linux/oom.h>
21038 -#include <linux/sched/debug.h>
21039 -#include <linux/smpboot.h>
21040 -#include <uapi/linux/sched/types.h>
21041 -#include "../time/tick-internal.h"
21043 -#ifdef CONFIG_RCU_BOOST
21045 #include "../locking/rtmutex_common.h"
21048 * Control variables for per-CPU and per-rcu_node kthreads. These
21049 * handle all flavors of RCU.
21051 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21052 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21053 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21054 DEFINE_PER_CPU(char, rcu_cpu_has_work);
21056 -#else /* #ifdef CONFIG_RCU_BOOST */
21059 - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
21060 - * all uses are in dead code. Provide a definition to keep the compiler
21061 - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
21062 - * This probably needs to be excluded from -rt builds.
21064 -#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
21066 -#endif /* #else #ifdef CONFIG_RCU_BOOST */
21068 #ifdef CONFIG_RCU_NOCB_CPU
21069 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
21070 static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
21071 @@ -324,9 +301,13 @@
21072 struct task_struct *t = current;
21073 struct rcu_data *rdp;
21074 struct rcu_node *rnp;
21075 + int sleeping_l = 0;
21077 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
21078 - WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
21079 +#if defined(CONFIG_PREEMPT_RT_FULL)
21080 + sleeping_l = t->sleeping_lock;
21082 + WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
21083 if (t->rcu_read_lock_nesting > 0 &&
21084 !t->rcu_read_unlock_special.b.blocked) {
21086 @@ -463,7 +444,7 @@
21089 /* Hardware IRQ handlers cannot block, complain if they get here. */
21090 - if (in_irq() || in_serving_softirq()) {
21091 + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
21092 lockdep_rcu_suspicious(__FILE__, __LINE__,
21093 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
21094 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
21095 @@ -530,7 +511,7 @@
21097 /* Unboost if we were boosted. */
21098 if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
21099 - rt_mutex_unlock(&rnp->boost_mtx);
21100 + rt_mutex_futex_unlock(&rnp->boost_mtx);
21103 * If this was the last task on the expedited lists,
21104 @@ -684,15 +665,6 @@
21105 t->rcu_read_unlock_special.b.need_qs = true;
21108 -#ifdef CONFIG_RCU_BOOST
21110 -static void rcu_preempt_do_callbacks(void)
21112 - rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
21115 -#endif /* #ifdef CONFIG_RCU_BOOST */
21118 * call_rcu() - Queue an RCU callback for invocation after a grace period.
21119 * @head: structure to be used for queueing the RCU updates.
21120 @@ -915,20 +887,23 @@
21122 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
21125 + * If boosting, set rcuc kthreads to realtime priority.
21127 +static void rcu_cpu_kthread_setup(unsigned int cpu)
21129 #ifdef CONFIG_RCU_BOOST
21130 + struct sched_param sp;
21132 -#include "../locking/rtmutex_common.h"
21134 -static void rcu_wake_cond(struct task_struct *t, int status)
21137 - * If the thread is yielding, only wake it when this
21138 - * is invoked from idle
21140 - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
21141 - wake_up_process(t);
21142 + sp.sched_priority = kthread_prio;
21143 + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21144 +#endif /* #ifdef CONFIG_RCU_BOOST */
21147 +#ifdef CONFIG_RCU_BOOST
21149 +#include "../locking/rtmutex_common.h"
21152 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
21153 * or ->boost_tasks, advancing the pointer to the next task in the
21154 @@ -1071,23 +1046,6 @@
21158 - * Wake up the per-CPU kthread to invoke RCU callbacks.
21160 -static void invoke_rcu_callbacks_kthread(void)
21162 - unsigned long flags;
21164 - local_irq_save(flags);
21165 - __this_cpu_write(rcu_cpu_has_work, 1);
21166 - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
21167 - current != __this_cpu_read(rcu_cpu_kthread_task)) {
21168 - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
21169 - __this_cpu_read(rcu_cpu_kthread_status));
21171 - local_irq_restore(flags);
21175 * Is the current CPU running the RCU-callbacks kthread?
21176 * Caller must have preemption disabled.
21178 @@ -1141,67 +1099,6 @@
21182 -static void rcu_kthread_do_work(void)
21184 - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
21185 - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
21186 - rcu_preempt_do_callbacks();
21189 -static void rcu_cpu_kthread_setup(unsigned int cpu)
21191 - struct sched_param sp;
21193 - sp.sched_priority = kthread_prio;
21194 - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21197 -static void rcu_cpu_kthread_park(unsigned int cpu)
21199 - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21202 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
21204 - return __this_cpu_read(rcu_cpu_has_work);
21208 - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
21209 - * RCU softirq used in flavors and configurations of RCU that do not
21210 - * support RCU priority boosting.
21212 -static void rcu_cpu_kthread(unsigned int cpu)
21214 - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21215 - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21218 - for (spincnt = 0; spincnt < 10; spincnt++) {
21219 - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21220 - local_bh_disable();
21221 - *statusp = RCU_KTHREAD_RUNNING;
21222 - this_cpu_inc(rcu_cpu_kthread_loops);
21223 - local_irq_disable();
21226 - local_irq_enable();
21228 - rcu_kthread_do_work();
21229 - local_bh_enable();
21230 - if (*workp == 0) {
21231 - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21232 - *statusp = RCU_KTHREAD_WAITING;
21236 - *statusp = RCU_KTHREAD_YIELDING;
21237 - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21238 - schedule_timeout_interruptible(2);
21239 - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21240 - *statusp = RCU_KTHREAD_WAITING;
21244 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
21245 * served by the rcu_node in question. The CPU hotplug lock is still
21246 @@ -1232,26 +1129,12 @@
21247 free_cpumask_var(cm);
21250 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21251 - .store = &rcu_cpu_kthread_task,
21252 - .thread_should_run = rcu_cpu_kthread_should_run,
21253 - .thread_fn = rcu_cpu_kthread,
21254 - .thread_comm = "rcuc/%u",
21255 - .setup = rcu_cpu_kthread_setup,
21256 - .park = rcu_cpu_kthread_park,
21260 * Spawn boost kthreads -- called as soon as the scheduler is running.
21262 static void __init rcu_spawn_boost_kthreads(void)
21264 struct rcu_node *rnp;
21267 - for_each_possible_cpu(cpu)
21268 - per_cpu(rcu_cpu_has_work, cpu) = 0;
21269 - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21270 rcu_for_each_leaf_node(rcu_state_p, rnp)
21271 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
21273 @@ -1274,11 +1157,6 @@
21274 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
21277 -static void invoke_rcu_callbacks_kthread(void)
21282 static bool rcu_is_callbacks_kthread(void)
21285 @@ -1302,7 +1180,7 @@
21287 #endif /* #else #ifdef CONFIG_RCU_BOOST */
21289 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
21290 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
21293 * Check to see if any future RCU-related work will need to be done
21294 @@ -1318,7 +1196,9 @@
21295 *nextevt = KTIME_MAX;
21296 return rcu_cpu_has_callbacks(NULL);
21298 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
21300 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
21302 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
21304 @@ -1414,6 +1294,8 @@
21308 +#ifndef CONFIG_PREEMPT_RT_FULL
21311 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
21312 * to invoke. If the CPU has callbacks, try to advance them. Tell the
21313 @@ -1456,6 +1338,7 @@
21314 *nextevt = basemono + dj * TICK_NSEC;
21317 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
21320 * Prepare a CPU for idle from an RCU perspective. The first major task
21321 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/update.c linux-4.14/kernel/rcu/update.c
21322 --- linux-4.14.orig/kernel/rcu/update.c 2018-09-05 11:03:22.000000000 +0200
21323 +++ linux-4.14/kernel/rcu/update.c 2018-09-05 11:05:07.000000000 +0200
21325 module_param(rcu_expedited, int, 0);
21326 extern int rcu_normal; /* from sysctl */
21327 module_param(rcu_normal, int, 0);
21328 -static int rcu_normal_after_boot;
21329 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
21330 module_param(rcu_normal_after_boot, int, 0);
21331 #endif /* #ifndef CONFIG_TINY_RCU */
21333 @@ -333,6 +333,7 @@
21335 EXPORT_SYMBOL_GPL(rcu_read_lock_held);
21337 +#ifndef CONFIG_PREEMPT_RT_FULL
21339 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
21341 @@ -359,6 +360,7 @@
21342 return in_softirq() || irqs_disabled();
21344 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
21347 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
21349 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/completion.c linux-4.14/kernel/sched/completion.c
21350 --- linux-4.14.orig/kernel/sched/completion.c 2017-11-12 19:46:13.000000000 +0100
21351 +++ linux-4.14/kernel/sched/completion.c 2018-09-05 11:05:07.000000000 +0200
21354 unsigned long flags;
21356 - spin_lock_irqsave(&x->wait.lock, flags);
21357 + raw_spin_lock_irqsave(&x->wait.lock, flags);
21360 * Perform commit of crossrelease here.
21363 if (x->done != UINT_MAX)
21365 - __wake_up_locked(&x->wait, TASK_NORMAL, 1);
21366 - spin_unlock_irqrestore(&x->wait.lock, flags);
21367 + swake_up_locked(&x->wait);
21368 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21370 EXPORT_SYMBOL(complete);
21372 @@ -66,10 +66,10 @@
21374 unsigned long flags;
21376 - spin_lock_irqsave(&x->wait.lock, flags);
21377 + raw_spin_lock_irqsave(&x->wait.lock, flags);
21378 x->done = UINT_MAX;
21379 - __wake_up_locked(&x->wait, TASK_NORMAL, 0);
21380 - spin_unlock_irqrestore(&x->wait.lock, flags);
21381 + swake_up_all_locked(&x->wait);
21382 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21384 EXPORT_SYMBOL(complete_all);
21386 @@ -78,20 +78,20 @@
21387 long (*action)(long), long timeout, int state)
21390 - DECLARE_WAITQUEUE(wait, current);
21391 + DECLARE_SWAITQUEUE(wait);
21393 - __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
21394 + __prepare_to_swait(&x->wait, &wait);
21396 if (signal_pending_state(state, current)) {
21397 timeout = -ERESTARTSYS;
21400 __set_current_state(state);
21401 - spin_unlock_irq(&x->wait.lock);
21402 + raw_spin_unlock_irq(&x->wait.lock);
21403 timeout = action(timeout);
21404 - spin_lock_irq(&x->wait.lock);
21405 + raw_spin_lock_irq(&x->wait.lock);
21406 } while (!x->done && timeout);
21407 - __remove_wait_queue(&x->wait, &wait);
21408 + __finish_swait(&x->wait, &wait);
21412 @@ -108,9 +108,9 @@
21414 complete_acquire(x);
21416 - spin_lock_irq(&x->wait.lock);
21417 + raw_spin_lock_irq(&x->wait.lock);
21418 timeout = do_wait_for_common(x, action, timeout, state);
21419 - spin_unlock_irq(&x->wait.lock);
21420 + raw_spin_unlock_irq(&x->wait.lock);
21422 complete_release(x);
21424 @@ -299,12 +299,12 @@
21425 if (!READ_ONCE(x->done))
21428 - spin_lock_irqsave(&x->wait.lock, flags);
21429 + raw_spin_lock_irqsave(&x->wait.lock, flags);
21432 else if (x->done != UINT_MAX)
21434 - spin_unlock_irqrestore(&x->wait.lock, flags);
21435 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21438 EXPORT_SYMBOL(try_wait_for_completion);
21439 @@ -330,8 +330,8 @@
21440 * otherwise we can end up freeing the completion before complete()
21441 * is done referencing it.
21443 - spin_lock_irqsave(&x->wait.lock, flags);
21444 - spin_unlock_irqrestore(&x->wait.lock, flags);
21445 + raw_spin_lock_irqsave(&x->wait.lock, flags);
21446 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21449 EXPORT_SYMBOL(completion_done);
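
Note on the completion.c hunks above: struct completion moves from a regular wait queue onto a simple wait queue (swait) protected by a raw spinlock, so complete() remains usable from hard-irq and other truly atomic contexts on PREEMPT_RT_FULL. The completion API itself does not change for callers; a minimal usage sketch with illustrative demo_* names:

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(demo_done);

static int demo_producer(void *arg)
{
	/* ... do the work ... */
	complete(&demo_done);		/* now a raw-spinlock + swake_up_locked() path */
	return 0;
}

static void demo_consumer(void)
{
	struct task_struct *t;

	t = kthread_run(demo_producer, NULL, "demo_producer");
	if (!IS_ERR(t))
		wait_for_completion(&demo_done);	/* sleeps on the swait queue */
}
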
21450 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/core.c linux-4.14/kernel/sched/core.c
21451 --- linux-4.14.orig/kernel/sched/core.c 2018-09-05 11:03:22.000000000 +0200
21452 +++ linux-4.14/kernel/sched/core.c 2018-09-05 11:05:07.000000000 +0200
21454 * Number of tasks to iterate in a single balance run.
21455 * Limited because this is done with IRQs disabled.
21457 +#ifndef CONFIG_PREEMPT_RT_FULL
21458 const_debug unsigned int sysctl_sched_nr_migrate = 32;
21460 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
21464 * period over which we average the RT time consumption, measured
21465 @@ -341,7 +345,7 @@
21466 rq->hrtick_csd.info = rq;
21469 - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
21470 + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
21471 rq->hrtick_timer.function = hrtick;
21473 #else /* CONFIG_SCHED_HRTICK */
21474 @@ -423,9 +427,15 @@
21478 -void wake_q_add(struct wake_q_head *head, struct task_struct *task)
21479 +void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
21482 - struct wake_q_node *node = &task->wake_q;
21483 + struct wake_q_node *node;
21486 + node = &task->wake_q_sleeper;
21488 + node = &task->wake_q;
21491 * Atomically grab the task, if ->wake_q is !nil already it means
21492 @@ -447,24 +457,32 @@
21493 head->lastp = &node->next;
21496 -void wake_up_q(struct wake_q_head *head)
21497 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
21499 struct wake_q_node *node = head->first;
21501 while (node != WAKE_Q_TAIL) {
21502 struct task_struct *task;
21504 - task = container_of(node, struct task_struct, wake_q);
21506 + task = container_of(node, struct task_struct, wake_q_sleeper);
21508 + task = container_of(node, struct task_struct, wake_q);
21510 /* Task can safely be re-inserted now: */
21512 - task->wake_q.next = NULL;
21515 + task->wake_q_sleeper.next = NULL;
21517 + task->wake_q.next = NULL;
21519 * wake_up_process() implies a wmb() to pair with the queueing
21520 * in wake_q_add() so as not to miss wakeups.
21522 - wake_up_process(task);
21524 + wake_up_lock_sleeper(task);
21526 + wake_up_process(task);
21527 put_task_struct(task);
21530 @@ -500,6 +518,48 @@
21531 trace_sched_wake_idle_without_ipi(cpu);
21534 +#ifdef CONFIG_PREEMPT_LAZY
21536 +static int tsk_is_polling(struct task_struct *p)
21538 +#ifdef TIF_POLLING_NRFLAG
21539 + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
21545 +void resched_curr_lazy(struct rq *rq)
21547 + struct task_struct *curr = rq->curr;
21550 + if (!sched_feat(PREEMPT_LAZY)) {
21551 + resched_curr(rq);
21555 + lockdep_assert_held(&rq->lock);
21557 + if (test_tsk_need_resched(curr))
21560 + if (test_tsk_need_resched_lazy(curr))
21563 + set_tsk_need_resched_lazy(curr);
21565 + cpu = cpu_of(rq);
21566 + if (cpu == smp_processor_id())
21569 + /* NEED_RESCHED_LAZY must be visible before we test polling */
21571 + if (!tsk_is_polling(curr))
21572 + smp_send_reschedule(cpu);
21576 void resched_cpu(int cpu)
21578 struct rq *rq = cpu_rq(cpu);
21579 @@ -523,11 +583,14 @@
21581 int get_nohz_timer_target(void)
21583 - int i, cpu = smp_processor_id();
21585 struct sched_domain *sd;
21587 + preempt_disable_rt();
21588 + cpu = smp_processor_id();
21590 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
21592 + goto preempt_en_rt;
21595 for_each_domain(cpu, sd) {
21596 @@ -546,6 +609,8 @@
21597 cpu = housekeeping_any_cpu();
21601 + preempt_enable_rt();
21605 @@ -912,7 +977,7 @@
21607 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
21609 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
21610 + if (!cpumask_test_cpu(cpu, p->cpus_ptr))
21613 if (is_per_cpu_kthread(p))
21614 @@ -1007,7 +1072,7 @@
21615 local_irq_disable();
21617 * We need to explicitly wake pending tasks before running
21618 - * __migrate_task() such that we will not miss enforcing cpus_allowed
21619 + * __migrate_task() such that we will not miss enforcing cpus_ptr
21620 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
21622 sched_ttwu_pending();
21623 @@ -1038,11 +1103,19 @@
21625 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
21627 - cpumask_copy(&p->cpus_allowed, new_mask);
21628 + cpumask_copy(&p->cpus_mask, new_mask);
21629 p->nr_cpus_allowed = cpumask_weight(new_mask);
21632 -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
21633 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
21634 +int __migrate_disabled(struct task_struct *p)
21636 + return p->migrate_disable;
21640 +static void __do_set_cpus_allowed_tail(struct task_struct *p,
21641 + const struct cpumask *new_mask)
21643 struct rq *rq = task_rq(p);
21644 bool queued, running;
21645 @@ -1071,6 +1144,20 @@
21646 set_curr_task(rq, p);
21649 +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
21651 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
21652 + if (__migrate_disabled(p)) {
21653 + lockdep_assert_held(&p->pi_lock);
21655 + cpumask_copy(&p->cpus_mask, new_mask);
21656 + p->migrate_disable_update = 1;
21660 + __do_set_cpus_allowed_tail(p, new_mask);
21664 * Change a given task's CPU affinity. Migrate the thread to a
21665 * proper CPU and schedule it away if the CPU it's executing on
21666 @@ -1108,7 +1195,7 @@
21670 - if (cpumask_equal(&p->cpus_allowed, new_mask))
21671 + if (cpumask_equal(p->cpus_ptr, new_mask))
21674 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
21675 @@ -1129,9 +1216,16 @@
21678 /* Can the task run on the task's current CPU? If so, we're done */
21679 - if (cpumask_test_cpu(task_cpu(p), new_mask))
21680 + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
21683 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
21684 + if (__migrate_disabled(p)) {
21685 + p->migrate_disable_update = 1;
21690 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
21691 if (task_running(rq, p) || p->state == TASK_WAKING) {
21692 struct migration_arg arg = { p, dest_cpu };
21693 @@ -1269,10 +1363,10 @@
21694 if (task_cpu(arg->src_task) != arg->src_cpu)
21697 - if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
21698 + if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
21701 - if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
21702 + if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
21705 __migrate_swap_task(arg->src_task, arg->dst_cpu);
21706 @@ -1313,10 +1407,10 @@
21707 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
21710 - if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
21711 + if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
21714 - if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
21715 + if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
21718 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
21719 @@ -1326,6 +1420,18 @@
21723 +static bool check_task_state(struct task_struct *p, long match_state)
21725 + bool match = false;
21727 + raw_spin_lock_irq(&p->pi_lock);
21728 + if (p->state == match_state || p->saved_state == match_state)
21730 + raw_spin_unlock_irq(&p->pi_lock);
21736 * wait_task_inactive - wait for a thread to unschedule.
21738 @@ -1370,7 +1476,7 @@
21739 * is actually now running somewhere else!
21741 while (task_running(rq, p)) {
21742 - if (match_state && unlikely(p->state != match_state))
21743 + if (match_state && !check_task_state(p, match_state))
21747 @@ -1385,7 +1491,8 @@
21748 running = task_running(rq, p);
21749 queued = task_on_rq_queued(p);
21751 - if (!match_state || p->state == match_state)
21752 + if (!match_state || p->state == match_state ||
21753 + p->saved_state == match_state)
21754 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
21755 task_rq_unlock(rq, p, &rf);
21757 @@ -1460,7 +1567,7 @@
21758 EXPORT_SYMBOL_GPL(kick_process);
21761 - * ->cpus_allowed is protected by both rq->lock and p->pi_lock
21762 + * ->cpus_ptr is protected by both rq->lock and p->pi_lock
21764 * A few notes on cpu_active vs cpu_online:
21766 @@ -1500,14 +1607,14 @@
21767 for_each_cpu(dest_cpu, nodemask) {
21768 if (!cpu_active(dest_cpu))
21770 - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
21771 + if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
21777 /* Any allowed, online CPU? */
21778 - for_each_cpu(dest_cpu, &p->cpus_allowed) {
21779 + for_each_cpu(dest_cpu, p->cpus_ptr) {
21780 if (!is_cpu_allowed(p, dest_cpu))
21783 @@ -1551,7 +1658,7 @@
21787 - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
21788 + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
21791 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
21792 @@ -1561,11 +1668,11 @@
21793 if (p->nr_cpus_allowed > 1)
21794 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
21796 - cpu = cpumask_any(&p->cpus_allowed);
21797 + cpu = cpumask_any(p->cpus_ptr);
21800 * In order not to call set_task_cpu() on a blocking task we need
21801 - * to rely on ttwu() to place the task on a valid ->cpus_allowed
21802 + * to rely on ttwu() to place the task on a valid ->cpus_ptr
21805 * Since this is common to all placement strategies, this lives here.
21806 @@ -1668,10 +1775,6 @@
21808 activate_task(rq, p, en_flags);
21809 p->on_rq = TASK_ON_RQ_QUEUED;
21811 - /* If a worker is waking up, notify the workqueue: */
21812 - if (p->flags & PF_WQ_WORKER)
21813 - wq_worker_waking_up(p, cpu_of(rq));
21817 @@ -1995,8 +2098,27 @@
21819 raw_spin_lock_irqsave(&p->pi_lock, flags);
21820 smp_mb__after_spinlock();
21821 - if (!(p->state & state))
21822 + if (!(p->state & state)) {
21824 + * The task might be running due to a spinlock sleeper
21825 + * wakeup. Check the saved state and set it to running
21826 + * if the wakeup condition is true.
21828 + if (!(wake_flags & WF_LOCK_SLEEPER)) {
21829 + if (p->saved_state & state) {
21830 + p->saved_state = TASK_RUNNING;
21838 + * If this is a regular wakeup, then we can unconditionally
21839 + * clear the saved state of a "lock sleeper".
21841 + if (!(wake_flags & WF_LOCK_SLEEPER))
21842 + p->saved_state = TASK_RUNNING;
21844 trace_sched_waking(p);
21846 @@ -2093,56 +2215,6 @@
21850 - * try_to_wake_up_local - try to wake up a local task with rq lock held
21851 - * @p: the thread to be awakened
21852 - * @rf: request-queue flags for pinning
21854 - * Put @p on the run-queue if it's not already there. The caller must
21855 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
21856 - * the current task.
21858 -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
21860 - struct rq *rq = task_rq(p);
21862 - if (WARN_ON_ONCE(rq != this_rq()) ||
21863 - WARN_ON_ONCE(p == current))
21866 - lockdep_assert_held(&rq->lock);
21868 - if (!raw_spin_trylock(&p->pi_lock)) {
21870 - * This is OK, because current is on_cpu, which avoids it being
21871 - * picked for load-balance and preemption/IRQs are still
21872 - * disabled avoiding further scheduler activity on it and we've
21873 - * not yet picked a replacement task.
21875 - rq_unlock(rq, rf);
21876 - raw_spin_lock(&p->pi_lock);
21877 - rq_relock(rq, rf);
21880 - if (!(p->state & TASK_NORMAL))
21883 - trace_sched_waking(p);
21885 - if (!task_on_rq_queued(p)) {
21886 - if (p->in_iowait) {
21887 - delayacct_blkio_end(p);
21888 - atomic_dec(&rq->nr_iowait);
21890 - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
21893 - ttwu_do_wakeup(rq, p, 0, rf);
21894 - ttwu_stat(p, smp_processor_id(), 0);
21896 - raw_spin_unlock(&p->pi_lock);
21900 * wake_up_process - Wake up a specific process
21901 * @p: The process to be woken up.
21903 @@ -2160,6 +2232,18 @@
21905 EXPORT_SYMBOL(wake_up_process);
21908 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
21909 + * @p: The process to be woken up.
21911 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
21912 + * the nature of the wakeup.
21914 +int wake_up_lock_sleeper(struct task_struct *p)
21916 + return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
21919 int wake_up_state(struct task_struct *p, unsigned int state)
21921 return try_to_wake_up(p, state, 0);
21922 @@ -2420,6 +2504,9 @@
21925 init_task_preempt_count(p);
21926 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
21927 + task_thread_info(p)->preempt_lazy_count = 0;
21930 plist_node_init(&p->pushable_tasks, MAX_PRIO);
21931 RB_CLEAR_NODE(&p->pushable_dl_tasks);
21932 @@ -2462,7 +2549,7 @@
21935 * Fork balancing, do it here and not earlier because:
21936 - * - cpus_allowed can change in the fork path
21937 + * - cpus_ptr can change in the fork path
21938 * - any previously selected CPU might disappear through hotplug
21940 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
21941 @@ -2675,21 +2762,16 @@
21942 finish_arch_post_lock_switch();
21944 fire_sched_in_preempt_notifiers(current);
21946 + * We use mmdrop_delayed() here so we don't have to do the
21947 + * full __mmdrop() when we are the last user.
21951 + mmdrop_delayed(mm);
21952 if (unlikely(prev_state == TASK_DEAD)) {
21953 if (prev->sched_class->task_dead)
21954 prev->sched_class->task_dead(prev);
21957 - * Remove function-return probe instances associated with this
21958 - * task and put them back on the free list.
21960 - kprobe_flush_task(prev);
21962 - /* Task is done with its stack. */
21963 - put_task_stack(prev);
21965 put_task_struct(prev);
21968 @@ -3336,25 +3418,13 @@
21969 atomic_inc(&rq->nr_iowait);
21970 delayacct_blkio_start();
21974 - * If a worker went to sleep, notify and ask workqueue
21975 - * whether it wants to wake up a task to maintain
21978 - if (prev->flags & PF_WQ_WORKER) {
21979 - struct task_struct *to_wakeup;
21981 - to_wakeup = wq_worker_sleeping(prev);
21983 - try_to_wake_up_local(to_wakeup, &rf);
21986 switch_count = &prev->nvcsw;
21989 next = pick_next_task(rq, prev, &rf);
21990 clear_tsk_need_resched(prev);
21991 + clear_tsk_need_resched_lazy(prev);
21992 clear_preempt_need_resched();
21994 if (likely(prev != next)) {
21995 @@ -3407,8 +3477,19 @@
21997 static inline void sched_submit_work(struct task_struct *tsk)
21999 - if (!tsk->state || tsk_is_pi_blocked(tsk))
22003 + * If a worker went to sleep, notify and ask workqueue whether
22004 + * it wants to wake up a task to maintain concurrency.
22006 + if (tsk->flags & PF_WQ_WORKER)
22007 + wq_worker_sleeping(tsk);
22010 + if (tsk_is_pi_blocked(tsk))
22014 * If we are going to sleep and we have plugged IO queued,
22015 * make sure to submit it to avoid deadlocks.
22016 @@ -3417,6 +3498,12 @@
22017 blk_schedule_flush_plug(tsk);
22020 +static void sched_update_worker(struct task_struct *tsk)
22022 + if (tsk->flags & PF_WQ_WORKER)
22023 + wq_worker_running(tsk);
22026 asmlinkage __visible void __sched schedule(void)
22028 struct task_struct *tsk = current;
22029 @@ -3427,6 +3514,7 @@
22031 sched_preempt_enable_no_resched();
22032 } while (need_resched());
22033 + sched_update_worker(tsk);
22035 EXPORT_SYMBOL(schedule);
22037 @@ -3515,6 +3603,30 @@
22038 } while (need_resched());
22041 +#ifdef CONFIG_PREEMPT_LAZY
22043 + * If TIF_NEED_RESCHED is set then we allow being scheduled away since this is
22044 + * set by an RT task. Otherwise we try to avoid being scheduled out as long as
22045 + * the preempt_lazy_count counter is >0.
22047 +static __always_inline int preemptible_lazy(void)
22049 + if (test_thread_flag(TIF_NEED_RESCHED))
22051 + if (current_thread_info()->preempt_lazy_count)
22058 +static inline int preemptible_lazy(void)
22065 #ifdef CONFIG_PREEMPT
22067 * this is the entry point to schedule() from in-kernel preemption
22068 @@ -3529,7 +3641,8 @@
22070 if (likely(!preemptible()))
22073 + if (!preemptible_lazy())
22075 preempt_schedule_common();
22077 NOKPROBE_SYMBOL(preempt_schedule);
22078 @@ -3556,6 +3669,9 @@
22079 if (likely(!preemptible()))
22082 + if (!preemptible_lazy())
22087 * Because the function tracer can trace preempt_count_sub()
22088 @@ -3578,7 +3694,16 @@
22089 * an infinite recursion.
22091 prev_ctx = exception_enter();
22093 + * The add/subtract must not be traced by the function
22094 + * tracer. But we still want to account for the
22095 + * preempt off latency tracer. Since the _notrace versions
22096 + * of add/subtract skip the accounting for latency tracer
22097 + * we must force it manually.
22099 + start_critical_timings();
22101 + stop_critical_timings();
22102 exception_exit(prev_ctx);
22104 preempt_latency_stop(1);
22105 @@ -4164,7 +4289,7 @@
22106 * the entire root_domain to become SCHED_DEADLINE. We
22107 * will also fail if there's no bandwidth available.
22109 - if (!cpumask_subset(span, &p->cpus_allowed) ||
22110 + if (!cpumask_subset(span, p->cpus_ptr) ||
22111 rq->rd->dl_bw.bw == 0) {
22112 task_rq_unlock(rq, p, &rf);
22114 @@ -4758,7 +4883,7 @@
22117 raw_spin_lock_irqsave(&p->pi_lock, flags);
22118 - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
22119 + cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
22120 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
22123 @@ -4877,6 +5002,7 @@
22125 EXPORT_SYMBOL(__cond_resched_lock);
22127 +#ifndef CONFIG_PREEMPT_RT_FULL
22128 int __sched __cond_resched_softirq(void)
22130 BUG_ON(!in_softirq());
22131 @@ -4890,6 +5016,7 @@
22134 EXPORT_SYMBOL(__cond_resched_softirq);
22138 * yield - yield the current processor to other threads.
22139 @@ -5284,7 +5411,9 @@
22141 /* Set the preempt count _outside_ the spinlocks! */
22142 init_idle_preempt_count(idle, cpu);
22144 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22145 + task_thread_info(idle)->preempt_lazy_count = 0;
22148 * The idle tasks have their own, simple scheduling class:
22150 @@ -5323,7 +5452,7 @@
22151 * allowed nodes is unnecessary. Thus, cpusets are not
22152 * applicable for such threads. This prevents checking for
22153 * success of set_cpus_allowed_ptr() on all attached tasks
22154 - * before cpus_allowed may be changed.
22155 + * before cpus_mask may be changed.
22157 if (p->flags & PF_NO_SETAFFINITY) {
22159 @@ -5350,7 +5479,7 @@
22160 if (curr_cpu == target_cpu)
22163 - if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
22164 + if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
22167 /* TODO: This is not properly updating schedstats */
22168 @@ -5389,6 +5518,8 @@
22169 #endif /* CONFIG_NUMA_BALANCING */
22171 #ifdef CONFIG_HOTPLUG_CPU
22172 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
22175 * Ensure that the idle task is using init_mm right before its CPU goes
22177 @@ -5403,7 +5534,12 @@
22178 switch_mm(mm, &init_mm, current);
22179 finish_arch_post_lock_switch();
22183 + * Defer the cleanup to a live CPU. On RT we can neither
22184 + * call mmdrop() nor mmdrop_delayed() from here.
22186 + per_cpu(idle_last_mm, smp_processor_id()) = mm;
22191 @@ -5487,7 +5623,7 @@
22192 put_prev_task(rq, next);
22195 - * Rules for changing task_struct::cpus_allowed are holding
22196 + * Rules for changing task_struct::cpus_mask are holding
22197 * both pi_lock and rq->lock, such that holding either
22198 * stabilizes the mask.
22200 @@ -5718,6 +5854,10 @@
22201 update_max_interval();
22202 nohz_balance_exit_idle(cpu);
22204 + if (per_cpu(idle_last_mm, cpu)) {
22205 + mmdrop_delayed(per_cpu(idle_last_mm, cpu));
22206 + per_cpu(idle_last_mm, cpu) = NULL;
22211 @@ -5964,7 +6104,7 @@
22212 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
22213 static inline int preempt_count_equals(int preempt_offset)
22215 - int nested = preempt_count() + rcu_preempt_depth();
22216 + int nested = preempt_count() + sched_rcu_preempt_depth();
22218 return (nested == preempt_offset);
22220 @@ -6756,3 +6896,197 @@
22221 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
22222 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
22225 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
22227 +static inline void
22228 +update_nr_migratory(struct task_struct *p, long delta)
22230 + if (unlikely((p->sched_class == &rt_sched_class ||
22231 + p->sched_class == &dl_sched_class) &&
22232 + p->nr_cpus_allowed > 1)) {
22233 + if (p->sched_class == &rt_sched_class)
22234 + task_rq(p)->rt.rt_nr_migratory += delta;
22236 + task_rq(p)->dl.dl_nr_migratory += delta;
22240 +static inline void
22241 +migrate_disable_update_cpus_allowed(struct task_struct *p)
22244 + struct rq_flags rf;
22246 + p->cpus_ptr = cpumask_of(smp_processor_id());
22248 + rq = task_rq_lock(p, &rf);
22249 + update_nr_migratory(p, -1);
22250 + p->nr_cpus_allowed = 1;
22251 + task_rq_unlock(rq, p, &rf);
22254 +static inline void
22255 +migrate_enable_update_cpus_allowed(struct task_struct *p)
22258 + struct rq_flags rf;
22260 + p->cpus_ptr = &p->cpus_mask;
22262 + rq = task_rq_lock(p, &rf);
22263 + p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
22264 + update_nr_migratory(p, 1);
22265 + task_rq_unlock(rq, p, &rf);
22268 +void migrate_disable(void)
22270 + struct task_struct *p = current;
22272 + if (in_atomic() || irqs_disabled()) {
22273 +#ifdef CONFIG_SCHED_DEBUG
22274 + p->migrate_disable_atomic++;
22278 +#ifdef CONFIG_SCHED_DEBUG
22279 + if (unlikely(p->migrate_disable_atomic)) {
22285 + if (p->migrate_disable) {
22286 + p->migrate_disable++;
22290 + preempt_disable();
22291 + preempt_lazy_disable();
22292 + pin_current_cpu();
22294 + migrate_disable_update_cpus_allowed(p);
22295 + p->migrate_disable = 1;
22297 + preempt_enable();
22299 +EXPORT_SYMBOL(migrate_disable);
22301 +void migrate_enable(void)
22303 + struct task_struct *p = current;
22305 + if (in_atomic() || irqs_disabled()) {
22306 +#ifdef CONFIG_SCHED_DEBUG
22307 + p->migrate_disable_atomic--;
22312 +#ifdef CONFIG_SCHED_DEBUG
22313 + if (unlikely(p->migrate_disable_atomic)) {
22319 + WARN_ON_ONCE(p->migrate_disable <= 0);
22320 + if (p->migrate_disable > 1) {
22321 + p->migrate_disable--;
22325 + preempt_disable();
22327 + p->migrate_disable = 0;
22328 + migrate_enable_update_cpus_allowed(p);
22330 + if (p->migrate_disable_update) {
22332 + struct rq_flags rf;
22334 + rq = task_rq_lock(p, &rf);
22335 + update_rq_clock(rq);
22337 + __do_set_cpus_allowed_tail(p, &p->cpus_mask);
22338 + task_rq_unlock(rq, p, &rf);
22340 + p->migrate_disable_update = 0;
22342 + WARN_ON(smp_processor_id() != task_cpu(p));
22343 + if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
22344 + const struct cpumask *cpu_valid_mask = cpu_active_mask;
22345 + struct migration_arg arg;
22346 + unsigned int dest_cpu;
22348 + if (p->flags & PF_KTHREAD) {
22350 + * Kernel threads are allowed on online && !active CPUs
22352 + cpu_valid_mask = cpu_online_mask;
22354 + dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_mask);
22356 + arg.dest_cpu = dest_cpu;
22358 + unpin_current_cpu();
22359 + preempt_lazy_enable();
22360 + preempt_enable();
22361 + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
22362 + tlb_migrate_finish(p->mm);
22367 + unpin_current_cpu();
22368 + preempt_lazy_enable();
22369 + preempt_enable();
22371 +EXPORT_SYMBOL(migrate_enable);
22373 +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22374 +void migrate_disable(void)
22376 + struct task_struct *p = current;
22378 + if (in_atomic() || irqs_disabled()) {
22379 +#ifdef CONFIG_SCHED_DEBUG
22380 + p->migrate_disable_atomic++;
22384 +#ifdef CONFIG_SCHED_DEBUG
22385 + if (unlikely(p->migrate_disable_atomic)) {
22391 + p->migrate_disable++;
22393 +EXPORT_SYMBOL(migrate_disable);
22395 +void migrate_enable(void)
22397 + struct task_struct *p = current;
22399 + if (in_atomic() || irqs_disabled()) {
22400 +#ifdef CONFIG_SCHED_DEBUG
22401 + p->migrate_disable_atomic--;
22406 +#ifdef CONFIG_SCHED_DEBUG
22407 + if (unlikely(p->migrate_disable_atomic)) {
22413 + WARN_ON_ONCE(p->migrate_disable <= 0);
22414 + p->migrate_disable--;
22416 +EXPORT_SYMBOL(migrate_enable);
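
Note on the core.c additions above: migrate_disable() pins the task by pointing p->cpus_ptr at the current CPU's mask and dropping nr_cpus_allowed to 1, and migrate_enable() restores p->cpus_mask, with any set_cpus_allowed_ptr() request made in between deferred via migrate_disable_update. A short usage sketch, assuming the migrate_disable()/migrate_enable() declarations this series adds to <linux/preempt.h>; demo_* names are illustrative.

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/smp.h>

static void demo_stay_on_cpu(void)
{
	int cpu;

	/*
	 * Unlike preempt_disable(), the task remains preemptible and may
	 * even block on sleeping locks; it is only guaranteed not to be
	 * migrated, so the CPU number read below stays valid for the
	 * whole section.
	 */
	migrate_disable();
	cpu = smp_processor_id();
	pr_info("running pinned on CPU %d\n", cpu);
	migrate_enable();
}
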
22418 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/cpudeadline.c linux-4.14/kernel/sched/cpudeadline.c
22419 --- linux-4.14.orig/kernel/sched/cpudeadline.c 2017-11-12 19:46:13.000000000 +0100
22420 +++ linux-4.14/kernel/sched/cpudeadline.c 2018-09-05 11:05:07.000000000 +0200
22421 @@ -127,13 +127,13 @@
22422 const struct sched_dl_entity *dl_se = &p->dl;
22425 - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
22426 + cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
22429 int best_cpu = cpudl_maximum(cp);
22430 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
22432 - if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
22433 + if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
22434 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
22436 cpumask_set_cpu(best_cpu, later_mask);
22437 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/cpupri.c linux-4.14/kernel/sched/cpupri.c
22438 --- linux-4.14.orig/kernel/sched/cpupri.c 2017-11-12 19:46:13.000000000 +0100
22439 +++ linux-4.14/kernel/sched/cpupri.c 2018-09-05 11:05:07.000000000 +0200
22440 @@ -103,11 +103,11 @@
22444 - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
22445 + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
22449 - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
22450 + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
22453 * We have to ensure that we have at least one bit
22454 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/deadline.c linux-4.14/kernel/sched/deadline.c
22455 --- linux-4.14.orig/kernel/sched/deadline.c 2018-09-05 11:03:22.000000000 +0200
22456 +++ linux-4.14/kernel/sched/deadline.c 2018-09-05 11:05:07.000000000 +0200
22457 @@ -504,7 +504,7 @@
22458 * If we cannot preempt any rq, fall back to pick any
22461 - cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
22462 + cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
22463 if (cpu >= nr_cpu_ids) {
22465 * Fail to find any suitable cpu.
22466 @@ -1020,7 +1020,7 @@
22468 struct hrtimer *timer = &dl_se->dl_timer;
22470 - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22471 + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
22472 timer->function = dl_task_timer;
22475 @@ -1749,7 +1749,7 @@
22476 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
22478 if (!task_running(rq, p) &&
22479 - cpumask_test_cpu(cpu, &p->cpus_allowed))
22480 + cpumask_test_cpu(cpu, p->cpus_ptr))
22484 @@ -1899,7 +1899,7 @@
22485 /* Retry if something changed. */
22486 if (double_lock_balance(rq, later_rq)) {
22487 if (unlikely(task_rq(task) != rq ||
22488 - !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
22489 + !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
22490 task_running(rq, task) ||
22492 !task_on_rq_queued(task))) {
22493 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/debug.c linux-4.14/kernel/sched/debug.c
22494 --- linux-4.14.orig/kernel/sched/debug.c 2017-11-12 19:46:13.000000000 +0100
22495 +++ linux-4.14/kernel/sched/debug.c 2018-09-05 11:05:07.000000000 +0200
22496 @@ -1017,6 +1017,10 @@
22500 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
22501 + P(migrate_disable);
22503 + P(nr_cpus_allowed);
22504 #undef PN_SCHEDSTAT
22507 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/fair.c linux-4.14/kernel/sched/fair.c
22508 --- linux-4.14.orig/kernel/sched/fair.c 2018-09-05 11:03:22.000000000 +0200
22509 +++ linux-4.14/kernel/sched/fair.c 2018-09-05 11:05:07.000000000 +0200
22510 @@ -1596,7 +1596,7 @@
22513 /* Skip this swap candidate if cannot move to the source cpu */
22514 - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
22515 + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
22519 @@ -1706,7 +1706,7 @@
22521 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
22522 /* Skip this CPU if the source task cannot migrate */
22523 - if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
22524 + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
22527 env->dst_cpu = cpu;
22528 @@ -3840,7 +3840,7 @@
22529 ideal_runtime = sched_slice(cfs_rq, curr);
22530 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
22531 if (delta_exec > ideal_runtime) {
22532 - resched_curr(rq_of(cfs_rq));
22533 + resched_curr_lazy(rq_of(cfs_rq));
22535 * The current task ran long enough, ensure it doesn't get
22536 * re-elected due to buddy favours.
22537 @@ -3864,7 +3864,7 @@
22540 if (delta > ideal_runtime)
22541 - resched_curr(rq_of(cfs_rq));
22542 + resched_curr_lazy(rq_of(cfs_rq));
22546 @@ -4006,7 +4006,7 @@
22547 * validating it and just reschedule.
22550 - resched_curr(rq_of(cfs_rq));
22551 + resched_curr_lazy(rq_of(cfs_rq));
22555 @@ -4188,7 +4188,7 @@
22556 * hierarchy can be throttled
22558 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
22559 - resched_curr(rq_of(cfs_rq));
22560 + resched_curr_lazy(rq_of(cfs_rq));
22563 static __always_inline
22564 @@ -4837,7 +4837,7 @@
22568 - resched_curr(rq);
22569 + resched_curr_lazy(rq);
22572 hrtick_start(rq, delta);
22573 @@ -5475,7 +5475,7 @@
22575 /* Skip over this group if it has no CPUs allowed */
22576 if (!cpumask_intersects(sched_group_span(group),
22577 - &p->cpus_allowed))
22581 local_group = cpumask_test_cpu(this_cpu,
22582 @@ -5595,7 +5595,7 @@
22583 return cpumask_first(sched_group_span(group));
22585 /* Traverse only the allowed CPUs */
22586 - for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
22587 + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
22589 struct rq *rq = cpu_rq(i);
22590 struct cpuidle_state *idle = idle_get_state(rq);
22591 @@ -5698,7 +5698,7 @@
22592 if (!test_idle_cores(target, false))
22595 - cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
22596 + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
22598 for_each_cpu_wrap(core, cpus, target) {
22600 @@ -5732,7 +5732,7 @@
22603 for_each_cpu(cpu, cpu_smt_mask(target)) {
22604 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
22605 + if (!cpumask_test_cpu(cpu, p->cpus_ptr))
22609 @@ -5795,7 +5795,7 @@
22610 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
22613 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
22614 + if (!cpumask_test_cpu(cpu, p->cpus_ptr))
22618 @@ -5950,7 +5950,7 @@
22619 if (sd_flag & SD_BALANCE_WAKE) {
22621 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
22622 - && cpumask_test_cpu(cpu, &p->cpus_allowed);
22623 + && cpumask_test_cpu(cpu, p->cpus_ptr);
22627 @@ -6231,7 +6231,7 @@
22631 - resched_curr(rq);
22632 + resched_curr_lazy(rq);
22634 * Only set the backward buddy when the current task is still
22635 * on the rq. This can happen when a wakeup gets interleaved
22636 @@ -6699,14 +6699,14 @@
22638 * We do not migrate tasks that are:
22639 * 1) throttled_lb_pair, or
22640 - * 2) cannot be migrated to this CPU due to cpus_allowed, or
22641 + * 2) cannot be migrated to this CPU due to cpus_ptr, or
22642 * 3) running (obviously), or
22643 * 4) are cache-hot on their current CPU.
22645 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
22648 - if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
22649 + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
22652 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
22653 @@ -6726,7 +6726,7 @@
22655 /* Prevent to re-select dst_cpu via env's cpus */
22656 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
22657 - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
22658 + if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
22659 env->flags |= LBF_DST_PINNED;
22660 env->new_dst_cpu = cpu;
22662 @@ -7295,7 +7295,7 @@
22665 * Group imbalance indicates (and tries to solve) the problem where balancing
22666 - * groups is inadequate due to ->cpus_allowed constraints.
22667 + * groups is inadequate due to ->cpus_ptr constraints.
22669 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
22670 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
22671 @@ -7871,7 +7871,7 @@
22673 * If the busiest group is imbalanced the below checks don't
22674 * work because they assume all things are equal, which typically
22675 - * isn't true due to cpus_allowed constraints and the like.
22676 + * isn't true due to cpus_ptr constraints and the like.
22678 if (busiest->group_type == group_imbalanced)
22679 goto force_balance;
22680 @@ -8263,7 +8263,7 @@
22681 * if the curr task on busiest cpu can't be
22682 * moved to this_cpu
22684 - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
22685 + if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
22686 raw_spin_unlock_irqrestore(&busiest->lock,
22688 env.flags |= LBF_ALL_PINNED;
22689 @@ -9085,7 +9085,7 @@
22690 * 'current' within the tree based on its new key value.
22692 swap(curr->vruntime, se->vruntime);
22693 - resched_curr(rq);
22694 + resched_curr_lazy(rq);
22697 se->vruntime -= cfs_rq->min_vruntime;
22698 @@ -9109,7 +9109,7 @@
22700 if (rq->curr == p) {
22701 if (p->prio > oldprio)
22702 - resched_curr(rq);
22703 + resched_curr_lazy(rq);
22705 check_preempt_curr(rq, p, 0);
22707 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/features.h linux-4.14/kernel/sched/features.h
22708 --- linux-4.14.orig/kernel/sched/features.h 2017-11-12 19:46:13.000000000 +0100
22709 +++ linux-4.14/kernel/sched/features.h 2018-09-05 11:05:07.000000000 +0200
22710 @@ -46,11 +46,19 @@
22712 SCHED_FEAT(NONTASK_CAPACITY, true)
22714 +#ifdef CONFIG_PREEMPT_RT_FULL
22715 +SCHED_FEAT(TTWU_QUEUE, false)
22716 +# ifdef CONFIG_PREEMPT_LAZY
22717 +SCHED_FEAT(PREEMPT_LAZY, true)
22722 * Queue remote wakeups on the target CPU and process them
22723 * using the scheduler IPI. Reduces rq->lock contention/bounces.
22725 SCHED_FEAT(TTWU_QUEUE, true)
22729 * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
22730 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/Makefile linux-4.14/kernel/sched/Makefile
22731 --- linux-4.14.orig/kernel/sched/Makefile 2017-11-12 19:46:13.000000000 +0100
22732 +++ linux-4.14/kernel/sched/Makefile 2018-09-05 11:05:07.000000000 +0200
22735 obj-y += core.o loadavg.o clock.o cputime.o
22736 obj-y += idle_task.o fair.o rt.o deadline.o
22737 -obj-y += wait.o wait_bit.o swait.o completion.o idle.o
22738 +obj-y += wait.o wait_bit.o swait.o swork.o completion.o idle.o
22739 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
22740 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
22741 obj-$(CONFIG_SCHEDSTATS) += stats.o
22742 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/rt.c linux-4.14/kernel/sched/rt.c
22743 --- linux-4.14.orig/kernel/sched/rt.c 2018-09-05 11:03:22.000000000 +0200
22744 +++ linux-4.14/kernel/sched/rt.c 2018-09-05 11:05:07.000000000 +0200
22747 raw_spin_lock_init(&rt_b->rt_runtime_lock);
22749 - hrtimer_init(&rt_b->rt_period_timer,
22750 - CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22751 + hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
22752 + HRTIMER_MODE_REL_HARD);
22753 rt_b->rt_period_timer.function = sched_rt_period_timer;
22756 @@ -1594,7 +1594,7 @@
22757 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
22759 if (!task_running(rq, p) &&
22760 - cpumask_test_cpu(cpu, &p->cpus_allowed))
22761 + cpumask_test_cpu(cpu, p->cpus_ptr))
22765 @@ -1729,7 +1729,7 @@
22766 * Also make sure that it wasn't scheduled on its rq.
22768 if (unlikely(task_rq(task) != rq ||
22769 - !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
22770 + !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
22771 task_running(rq, task) ||
22773 !task_on_rq_queued(task))) {
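The rt.c hunk above switches the RT bandwidth period timer to HRTIMER_MODE_REL_HARD so that it keeps expiring from hard interrupt context even when PREEMPT_RT_FULL moves ordinary hrtimers into softirq/ktimersoftd context. A hedged usage sketch of the same mode from driver-style code, with a hypothetical watchdog_timer, looks like this:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer watchdog_timer;	/* hypothetical example timer */

static enum hrtimer_restart watchdog_fn(struct hrtimer *t)
{
	/* Still runs in hard interrupt context on PREEMPT_RT_FULL,
	 * because the timer was initialized with a *_HARD mode. */
	return HRTIMER_NORESTART;
}

static void watchdog_arm(void)
{
	/* The hard/soft decision is made at init time: _HARD keeps the
	 * expiry out of the timer softirq / ktimersoftd thread. */
	hrtimer_init(&watchdog_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	watchdog_timer.function = watchdog_fn;
	hrtimer_start(&watchdog_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}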
22774 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/sched.h linux-4.14/kernel/sched/sched.h
22775 --- linux-4.14.orig/kernel/sched/sched.h 2018-09-05 11:03:22.000000000 +0200
22776 +++ linux-4.14/kernel/sched/sched.h 2018-09-05 11:05:07.000000000 +0200
22777 @@ -1354,6 +1354,7 @@
22778 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
22779 #define WF_FORK 0x02 /* child wakeup after fork */
22780 #define WF_MIGRATED 0x4 /* internal use, task got migrated */
22781 +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
22784 * To aid in avoiding the subversion of "niceness" due to uneven distribution
22785 @@ -1545,6 +1546,15 @@
22786 extern void resched_curr(struct rq *rq);
22787 extern void resched_cpu(int cpu);
22789 +#ifdef CONFIG_PREEMPT_LAZY
22790 +extern void resched_curr_lazy(struct rq *rq);
22792 +static inline void resched_curr_lazy(struct rq *rq)
22794 + resched_curr(rq);
22798 extern struct rt_bandwidth def_rt_bandwidth;
22799 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
22801 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/swait.c linux-4.14/kernel/sched/swait.c
22802 --- linux-4.14.orig/kernel/sched/swait.c 2017-11-12 19:46:13.000000000 +0100
22803 +++ linux-4.14/kernel/sched/swait.c 2018-09-05 11:05:07.000000000 +0200
22805 // SPDX-License-Identifier: GPL-2.0
22806 #include <linux/sched/signal.h>
22807 #include <linux/swait.h>
22808 +#include <linux/suspend.h>
22810 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
22811 struct lock_class_key *key)
22814 EXPORT_SYMBOL(swake_up_locked);
22816 +void swake_up_all_locked(struct swait_queue_head *q)
22818 + struct swait_queue *curr;
22821 + while (!list_empty(&q->task_list)) {
22823 + curr = list_first_entry(&q->task_list, typeof(*curr),
22825 + wake_up_process(curr->task);
22826 + list_del_init(&curr->task_list);
22829 + if (pm_in_action)
22831 + WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
22833 +EXPORT_SYMBOL(swake_up_all_locked);
22835 void swake_up(struct swait_queue_head *q)
22837 unsigned long flags;
22839 struct swait_queue *curr;
22842 + WARN_ON(irqs_disabled());
22843 raw_spin_lock_irq(&q->lock);
22844 list_splice_init(&q->task_list, &tmp);
22845 while (!list_empty(&tmp)) {
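swake_up_all_locked() above wakes every waiter while the queue lock is already held and only tolerates large waiter counts during suspend/resume (pm_in_action). Its natural caller in this series is the swait-based completion code; the sketch below is modeled on an RT-style complete_all(), where the field layout and the done bookkeeping are assumptions rather than quotes from the patch:

#include <linux/completion.h>
#include <linux/swait.h>

/* Sketch of a caller that already owns the queue lock, in the style of the
 * RT complete_all(); the 'done' handling is an assumption. */
static void complete_all_sketch(struct completion *x)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&x->wait.lock, flags);
	x->done = UINT_MAX;			/* completed "forever" */
	swake_up_all_locked(&x->wait);		/* queue lock already held */
	raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}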
22846 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/swork.c linux-4.14/kernel/sched/swork.c
22847 --- linux-4.14.orig/kernel/sched/swork.c 1970-01-01 01:00:00.000000000 +0100
22848 +++ linux-4.14/kernel/sched/swork.c 2018-09-05 11:05:07.000000000 +0200
22851 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
22853 + * Provides a framework for enqueuing callbacks from irq context in a
22854 + * PREEMPT_RT_FULL-safe way. The callbacks are executed in kthread context.
22857 +#include <linux/swait.h>
22858 +#include <linux/swork.h>
22859 +#include <linux/kthread.h>
22860 +#include <linux/slab.h>
22861 +#include <linux/spinlock.h>
22862 +#include <linux/export.h>
22864 +#define SWORK_EVENT_PENDING (1 << 0)
22866 +static DEFINE_MUTEX(worker_mutex);
22867 +static struct sworker *glob_worker;
22870 + struct list_head events;
22871 + struct swait_queue_head wq;
22873 + raw_spinlock_t lock;
22875 + struct task_struct *task;
22879 +static bool swork_readable(struct sworker *worker)
22883 + if (kthread_should_stop())
22886 + raw_spin_lock_irq(&worker->lock);
22887 + r = !list_empty(&worker->events);
22888 + raw_spin_unlock_irq(&worker->lock);
22893 +static int swork_kthread(void *arg)
22895 + struct sworker *worker = arg;
22898 + swait_event_interruptible(worker->wq,
22899 + swork_readable(worker));
22900 + if (kthread_should_stop())
22903 + raw_spin_lock_irq(&worker->lock);
22904 + while (!list_empty(&worker->events)) {
22905 + struct swork_event *sev;
22907 + sev = list_first_entry(&worker->events,
22908 + struct swork_event, item);
22909 + list_del(&sev->item);
22910 + raw_spin_unlock_irq(&worker->lock);
22912 + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
22915 + raw_spin_lock_irq(&worker->lock);
22917 + raw_spin_unlock_irq(&worker->lock);
22922 +static struct sworker *swork_create(void)
22924 + struct sworker *worker;
22926 + worker = kzalloc(sizeof(*worker), GFP_KERNEL);
22928 + return ERR_PTR(-ENOMEM);
22930 + INIT_LIST_HEAD(&worker->events);
22931 + raw_spin_lock_init(&worker->lock);
22932 + init_swait_queue_head(&worker->wq);
22934 + worker->task = kthread_run(swork_kthread, worker, "kswork");
22935 + if (IS_ERR(worker->task)) {
22937 + return ERR_PTR(-ENOMEM);
22943 +static void swork_destroy(struct sworker *worker)
22945 + kthread_stop(worker->task);
22947 + WARN_ON(!list_empty(&worker->events));
22952 + * swork_queue - queue swork
22954 + * Returns %false if @sev was already on a queue, %true otherwise.
22956 + * The work is queued and processed on a random CPU
22958 +bool swork_queue(struct swork_event *sev)
22960 + unsigned long flags;
22962 + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
22965 + raw_spin_lock_irqsave(&glob_worker->lock, flags);
22966 + list_add_tail(&sev->item, &glob_worker->events);
22967 + raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
22969 + swake_up(&glob_worker->wq);
22972 +EXPORT_SYMBOL_GPL(swork_queue);
22975 + * swork_get - get an instance of the sworker
22977 + * Returns a negative error code if the initialization of the worker did not
22978 + * work, %0 otherwise.
22981 +int swork_get(void)
22983 + struct sworker *worker;
22985 + mutex_lock(&worker_mutex);
22986 + if (!glob_worker) {
22987 + worker = swork_create();
22988 + if (IS_ERR(worker)) {
22989 + mutex_unlock(&worker_mutex);
22993 + glob_worker = worker;
22996 + glob_worker->refs++;
22997 + mutex_unlock(&worker_mutex);
23001 +EXPORT_SYMBOL_GPL(swork_get);
23004 + * swork_put - puts an instance of the sworker
23006 + * Will destroy the sworker thread. This function must not be called until all
23007 + * queued events have been completed.
23009 +void swork_put(void)
23011 + mutex_lock(&worker_mutex);
23013 + glob_worker->refs--;
23014 + if (glob_worker->refs > 0)
23017 + swork_destroy(glob_worker);
23018 + glob_worker = NULL;
23020 + mutex_unlock(&worker_mutex);
23022 +EXPORT_SYMBOL_GPL(swork_put);
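To make the swork API above concrete, here is a minimal user, modeled on the clock_was_set_delayed() consumer this patch adds later to kernel/time/hrtimer.c; the callback and event names are hypothetical:

#include <linux/init.h>
#include <linux/swork.h>

static struct swork_event my_event;		/* hypothetical event */

static void my_event_fn(struct swork_event *ev)
{
	/* Executes in the "kswork" kthread: may sleep and may take
	 * sleeping locks even on PREEMPT_RT_FULL. */
}

static int __init my_event_setup(void)
{
	WARN_ON(swork_get());			/* create/reference the worker */
	INIT_SWORK(&my_event, my_event_fn);	/* bind callback to the event */
	return 0;
}
early_initcall(my_event_setup);

/* Safe to call from hard interrupt context: */
static void my_event_kick(void)
{
	swork_queue(&my_event);			/* false if already pending */
}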
23023 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/topology.c linux-4.14/kernel/sched/topology.c
23024 --- linux-4.14.orig/kernel/sched/topology.c 2018-09-05 11:03:22.000000000 +0200
23025 +++ linux-4.14/kernel/sched/topology.c 2018-09-05 11:05:07.000000000 +0200
23026 @@ -286,6 +286,7 @@
23028 raw_spin_lock_init(&rd->rto_lock);
23029 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
23030 + rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
23033 init_dl_bw(&rd->dl_bw);
23034 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/signal.c linux-4.14/kernel/signal.c
23035 --- linux-4.14.orig/kernel/signal.c 2018-09-05 11:03:22.000000000 +0200
23036 +++ linux-4.14/kernel/signal.c 2018-09-05 11:05:07.000000000 +0200
23038 #include <linux/sched/task.h>
23039 #include <linux/sched/task_stack.h>
23040 #include <linux/sched/cputime.h>
23041 +#include <linux/sched/rt.h>
23042 #include <linux/fs.h>
23043 #include <linux/tty.h>
23044 #include <linux/binfmts.h>
23045 @@ -360,13 +361,30 @@
23049 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
23051 + struct sigqueue *q = t->sigqueue_cache;
23053 + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
23058 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
23060 + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
23066 * allocate a new signal queue record
23067 * - this may be called without locks if and only if t == current, otherwise an
23068 * appropriate lock must be held to stop the target task from exiting
23070 static struct sigqueue *
23071 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
23072 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
23073 + int override_rlimit, int fromslab)
23075 struct sigqueue *q = NULL;
23076 struct user_struct *user;
23077 @@ -383,7 +401,10 @@
23078 if (override_rlimit ||
23079 atomic_read(&user->sigpending) <=
23080 task_rlimit(t, RLIMIT_SIGPENDING)) {
23081 - q = kmem_cache_alloc(sigqueue_cachep, flags);
23083 + q = get_task_cache(t);
23085 + q = kmem_cache_alloc(sigqueue_cachep, flags);
23087 print_dropped_signal(sig);
23089 @@ -400,6 +421,13 @@
23093 +static struct sigqueue *
23094 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
23095 + int override_rlimit)
23097 + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
23100 static void __sigqueue_free(struct sigqueue *q)
23102 if (q->flags & SIGQUEUE_PREALLOC)
23103 @@ -409,6 +437,21 @@
23104 kmem_cache_free(sigqueue_cachep, q);
23107 +static void sigqueue_free_current(struct sigqueue *q)
23109 + struct user_struct *up;
23111 + if (q->flags & SIGQUEUE_PREALLOC)
23115 + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
23116 + atomic_dec(&up->sigpending);
23119 + __sigqueue_free(q);
23122 void flush_sigqueue(struct sigpending *queue)
23124 struct sigqueue *q;
23125 @@ -422,6 +465,21 @@
23129 + * Called from __exit_signal. Flush tsk->pending and
23130 + * tsk->sigqueue_cache
23132 +void flush_task_sigqueue(struct task_struct *tsk)
23134 + struct sigqueue *q;
23136 + flush_sigqueue(&tsk->pending);
23138 + q = get_task_cache(tsk);
23140 + kmem_cache_free(sigqueue_cachep, q);
23144 * Flush all pending signals for this kthread.
23146 void flush_signals(struct task_struct *t)
23147 @@ -542,7 +600,7 @@
23148 (info->si_code == SI_TIMER) &&
23149 (info->si_sys_private);
23151 - __sigqueue_free(first);
23152 + sigqueue_free_current(first);
23155 * Ok, it wasn't in the queue. This must be
23156 @@ -578,6 +636,8 @@
23157 bool resched_timer = false;
23160 + WARN_ON_ONCE(tsk != current);
23162 /* We only dequeue private signals from ourselves, we don't let
23163 * signalfd steal them
23165 @@ -1177,8 +1237,8 @@
23166 * We don't want to have recursive SIGSEGV's etc, for example,
23167 * that is why we also clear SIGNAL_UNKILLABLE.
23170 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23172 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23174 unsigned long int flags;
23175 int ret, blocked, ignored;
23176 @@ -1207,6 +1267,39 @@
23180 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23183 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
23184 + * since it cannot enable preemption, and the signal code's spin_locks
23185 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
23186 + * send the signal on exit of the trap.
23188 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
23189 + if (in_atomic()) {
23190 + if (WARN_ON_ONCE(t != current))
23192 + if (WARN_ON_ONCE(t->forced_info.si_signo))
23195 + if (is_si_special(info)) {
23196 + WARN_ON_ONCE(info != SEND_SIG_PRIV);
23197 + t->forced_info.si_signo = sig;
23198 + t->forced_info.si_errno = 0;
23199 + t->forced_info.si_code = SI_KERNEL;
23200 + t->forced_info.si_pid = 0;
23201 + t->forced_info.si_uid = 0;
23203 + t->forced_info = *info;
23206 + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
23210 + return do_force_sig_info(sig, info, t);
23214 * Nuke all other threads in the group.
23216 @@ -1241,12 +1334,12 @@
23217 * Disable interrupts early to avoid deadlocks.
23218 * See rcu_read_unlock() comment header for details.
23220 - local_irq_save(*flags);
23221 + local_irq_save_nort(*flags);
23223 sighand = rcu_dereference(tsk->sighand);
23224 if (unlikely(sighand == NULL)) {
23226 - local_irq_restore(*flags);
23227 + local_irq_restore_nort(*flags);
23231 @@ -1267,7 +1360,7 @@
23233 spin_unlock(&sighand->siglock);
23235 - local_irq_restore(*flags);
23236 + local_irq_restore_nort(*flags);
23240 @@ -1514,7 +1607,8 @@
23242 struct sigqueue *sigqueue_alloc(void)
23244 - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
23245 + /* Preallocated sigqueue objects always come from the slab cache! */
23246 + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
23249 q->flags |= SIGQUEUE_PREALLOC;
23250 @@ -1888,15 +1982,7 @@
23251 if (gstop_done && ptrace_reparented(current))
23252 do_notify_parent_cldstop(current, false, why);
23255 - * Don't want to allow preemption here, because
23256 - * sys_ptrace() needs this task to be inactive.
23258 - * XXX: implement read_unlock_no_resched().
23260 - preempt_disable();
23261 read_unlock(&tasklist_lock);
23262 - preempt_enable_no_resched();
23263 freezable_schedule();
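The ARCH_RT_DELAYS_SIGNAL_SEND branch above parks the siginfo in t->forced_info and sets TIF_NOTIFY_RESUME instead of delivering the signal from the trap handler; delivery then happens on the way back to user space. A hedged sketch of that consuming side, as an architecture's resume path might implement it (placement and the helper name are illustrative, not taken from the patch):

/* Sketch: exit-to-user path of an architecture that defines
 * ARCH_RT_DELAYS_SIGNAL_SEND, e.g. called from its do_notify_resume(). */
static inline void resume_deliver_delayed_signal(void)
{
#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
	struct task_struct *t = current;

	if (unlikely(t->forced_info.si_signo)) {
		/* Now in preemptible context: safe to take the sighand
		 * lock (a sleeping lock on RT) and deliver for real. */
		force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
		t->forced_info.si_signo = 0;
	}
#endif
}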
23266 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/softirq.c linux-4.14/kernel/softirq.c
23267 --- linux-4.14.orig/kernel/softirq.c 2018-09-05 11:03:22.000000000 +0200
23268 +++ linux-4.14/kernel/softirq.c 2018-09-05 11:05:07.000000000 +0200
23269 @@ -21,11 +21,14 @@
23270 #include <linux/freezer.h>
23271 #include <linux/kthread.h>
23272 #include <linux/rcupdate.h>
23273 +#include <linux/delay.h>
23274 #include <linux/ftrace.h>
23275 #include <linux/smp.h>
23276 #include <linux/smpboot.h>
23277 #include <linux/tick.h>
23278 +#include <linux/locallock.h>
23279 #include <linux/irq.h>
23280 +#include <linux/sched/types.h>
23282 #define CREATE_TRACE_POINTS
23283 #include <trace/events/irq.h>
23284 @@ -56,12 +59,108 @@
23285 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
23287 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
23288 +#ifdef CONFIG_PREEMPT_RT_FULL
23289 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
23290 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
23293 const char * const softirq_to_name[NR_SOFTIRQS] = {
23294 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
23295 "TASKLET", "SCHED", "HRTIMER", "RCU"
23298 +#ifdef CONFIG_NO_HZ_COMMON
23299 +# ifdef CONFIG_PREEMPT_RT_FULL
23301 +struct softirq_runner {
23302 + struct task_struct *runner[NR_SOFTIRQS];
23305 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
23307 +static inline void softirq_set_runner(unsigned int sirq)
23309 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23311 + sr->runner[sirq] = current;
23314 +static inline void softirq_clr_runner(unsigned int sirq)
23316 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23318 + sr->runner[sirq] = NULL;
23322 + * On preempt-rt a softirq running context might be blocked on a
23323 + * lock. There might be no other runnable task on this CPU because the
23324 + * lock owner runs on some other CPU. So we have to go into idle with
23325 + * the pending bit set. Therefore we need to check this, otherwise we
23326 + * warn about false positives, which confuses users and defeats the
23327 + * whole purpose of this test.
23329 + * This code is called with interrupts disabled.
23331 +void softirq_check_pending_idle(void)
23333 + static int rate_limit;
23334 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23338 + if (rate_limit >= 10)
23341 + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
23342 + for (i = 0; i < NR_SOFTIRQS; i++) {
23343 + struct task_struct *tsk = sr->runner[i];
23346 + * The wakeup code in rtmutex.c wakes up the task
23347 + * _before_ it sets pi_blocked_on to NULL under
23348 + * tsk->pi_lock. So we need to check for both: state
23349 + * and pi_blocked_on.
23352 + raw_spin_lock(&tsk->pi_lock);
23353 + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
23354 + /* Clear all bits pending in that task */
23355 + warnpending &= ~(tsk->softirqs_raised);
23356 + warnpending &= ~(1 << i);
23358 + raw_spin_unlock(&tsk->pi_lock);
23362 + if (warnpending) {
23363 + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23370 + * On !PREEMPT_RT we just printk rate limited:
23372 +void softirq_check_pending_idle(void)
23374 + static int rate_limit;
23376 + if (rate_limit < 10 &&
23377 + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
23378 + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23379 + local_softirq_pending());
23385 +#else /* !CONFIG_NO_HZ_COMMON */
23386 +static inline void softirq_set_runner(unsigned int sirq) { }
23387 +static inline void softirq_clr_runner(unsigned int sirq) { }
23391 * we cannot loop indefinitely here to avoid userspace starvation,
23392 * but we also don't want to introduce a worst case 1/HZ latency
23393 @@ -77,6 +176,38 @@
23394 wake_up_process(tsk);
23397 +#ifdef CONFIG_PREEMPT_RT_FULL
23398 +static void wakeup_timer_softirqd(void)
23400 + /* Interrupts are disabled: no need to stop preemption */
23401 + struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
23403 + if (tsk && tsk->state != TASK_RUNNING)
23404 + wake_up_process(tsk);
23408 +static void handle_softirq(unsigned int vec_nr)
23410 + struct softirq_action *h = softirq_vec + vec_nr;
23413 + prev_count = preempt_count();
23415 + kstat_incr_softirqs_this_cpu(vec_nr);
23417 + trace_softirq_entry(vec_nr);
23419 + trace_softirq_exit(vec_nr);
23420 + if (unlikely(prev_count != preempt_count())) {
23421 + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23422 + vec_nr, softirq_to_name[vec_nr], h->action,
23423 + prev_count, preempt_count());
23424 + preempt_count_set(prev_count);
23428 +#ifndef CONFIG_PREEMPT_RT_FULL
23430 * If ksoftirqd is scheduled, we do not want to process pending softirqs
23431 * right now. Let ksoftirqd handle this at its own rate, to get fairness,
23432 @@ -92,6 +223,47 @@
23433 return tsk && (tsk->state == TASK_RUNNING);
23436 +static inline int ksoftirqd_softirq_pending(void)
23438 + return local_softirq_pending();
23441 +static void handle_pending_softirqs(u32 pending)
23443 + struct softirq_action *h = softirq_vec;
23446 + local_irq_enable();
23450 + while ((softirq_bit = ffs(pending))) {
23451 + unsigned int vec_nr;
23453 + h += softirq_bit - 1;
23454 + vec_nr = h - softirq_vec;
23455 + handle_softirq(vec_nr);
23458 + pending >>= softirq_bit;
23462 + local_irq_disable();
23465 +static void run_ksoftirqd(unsigned int cpu)
23467 + local_irq_disable();
23468 + if (ksoftirqd_softirq_pending()) {
23470 + local_irq_enable();
23471 + cond_resched_rcu_qs();
23474 + local_irq_enable();
23478 * preempt_count and SOFTIRQ_OFFSET usage:
23479 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
23480 @@ -247,10 +419,8 @@
23481 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
23482 unsigned long old_flags = current->flags;
23483 int max_restart = MAX_SOFTIRQ_RESTART;
23484 - struct softirq_action *h;
23490 * Mask out PF_MEMALLOC as current task context is borrowed for the
23491 @@ -269,36 +439,7 @@
23492 /* Reset the pending bitmask before enabling irqs */
23493 set_softirq_pending(0);
23495 - local_irq_enable();
23499 - while ((softirq_bit = ffs(pending))) {
23500 - unsigned int vec_nr;
23503 - h += softirq_bit - 1;
23505 - vec_nr = h - softirq_vec;
23506 - prev_count = preempt_count();
23508 - kstat_incr_softirqs_this_cpu(vec_nr);
23510 - trace_softirq_entry(vec_nr);
23512 - trace_softirq_exit(vec_nr);
23513 - if (unlikely(prev_count != preempt_count())) {
23514 - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23515 - vec_nr, softirq_to_name[vec_nr], h->action,
23516 - prev_count, preempt_count());
23517 - preempt_count_set(prev_count);
23520 - pending >>= softirq_bit;
23524 - local_irq_disable();
23525 + handle_pending_softirqs(pending);
23527 pending = local_softirq_pending();
23529 @@ -335,6 +476,309 @@
23533 + * This function must run with irqs disabled!
23535 +void raise_softirq_irqoff(unsigned int nr)
23537 + __raise_softirq_irqoff(nr);
23540 + * If we're in an interrupt or softirq, we're done
23541 + * (this also catches softirq-disabled code). We will
23542 + * actually run the softirq once we return from
23543 + * the irq or softirq.
23545 + * Otherwise we wake up ksoftirqd to make sure we
23546 + * schedule the softirq soon.
23548 + if (!in_interrupt())
23549 + wakeup_softirqd();
23552 +void __raise_softirq_irqoff(unsigned int nr)
23554 + trace_softirq_raise(nr);
23555 + or_softirq_pending(1UL << nr);
23558 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
23559 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
23560 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
23562 +#else /* !PREEMPT_RT_FULL */
23565 + * On RT we serialize softirq execution with a cpu local lock per softirq
23567 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
23569 +void __init softirq_early_init(void)
23573 + for (i = 0; i < NR_SOFTIRQS; i++)
23574 + local_irq_lock_init(local_softirq_locks[i]);
23577 +static void lock_softirq(int which)
23579 + local_lock(local_softirq_locks[which]);
23582 +static void unlock_softirq(int which)
23584 + local_unlock(local_softirq_locks[which]);
23587 +static void do_single_softirq(int which)
23589 + unsigned long old_flags = current->flags;
23591 + current->flags &= ~PF_MEMALLOC;
23592 + vtime_account_irq_enter(current);
23593 + current->flags |= PF_IN_SOFTIRQ;
23594 + lockdep_softirq_enter();
23595 + local_irq_enable();
23596 + handle_softirq(which);
23597 + local_irq_disable();
23598 + lockdep_softirq_exit();
23599 + current->flags &= ~PF_IN_SOFTIRQ;
23600 + vtime_account_irq_enter(current);
23601 + current_restore_flags(old_flags, PF_MEMALLOC);
23605 + * Called with interrupts disabled. Process softirqs which were raised
23606 + * in current context (or on behalf of ksoftirqd).
23608 +static void do_current_softirqs(void)
23610 + while (current->softirqs_raised) {
23611 + int i = __ffs(current->softirqs_raised);
23612 + unsigned int pending, mask = (1U << i);
23614 + current->softirqs_raised &= ~mask;
23615 + local_irq_enable();
23618 + * If the lock is contended, we boost the owner to
23619 + * process the softirq or leave the critical section
23623 + local_irq_disable();
23624 + softirq_set_runner(i);
23626 + * Check with the local_softirq_pending() bits,
23627 + * whether we need to process this still or if someone
23628 + * else took care of it.
23630 + pending = local_softirq_pending();
23631 + if (pending & mask) {
23632 + set_softirq_pending(pending & ~mask);
23633 + do_single_softirq(i);
23635 + softirq_clr_runner(i);
23636 + WARN_ON(current->softirq_nestcnt != 1);
23637 + local_irq_enable();
23638 + unlock_softirq(i);
23639 + local_irq_disable();
23643 +void __local_bh_disable(void)
23645 + if (++current->softirq_nestcnt == 1)
23646 + migrate_disable();
23648 +EXPORT_SYMBOL(__local_bh_disable);
23650 +void __local_bh_enable(void)
23652 + if (WARN_ON(current->softirq_nestcnt == 0))
23655 + local_irq_disable();
23656 + if (current->softirq_nestcnt == 1 && current->softirqs_raised)
23657 + do_current_softirqs();
23658 + local_irq_enable();
23660 + if (--current->softirq_nestcnt == 0)
23661 + migrate_enable();
23663 +EXPORT_SYMBOL(__local_bh_enable);
23665 +void _local_bh_enable(void)
23667 + if (WARN_ON(current->softirq_nestcnt == 0))
23669 + if (--current->softirq_nestcnt == 0)
23670 + migrate_enable();
23672 +EXPORT_SYMBOL(_local_bh_enable);
23674 +int in_serving_softirq(void)
23676 + return current->flags & PF_IN_SOFTIRQ;
23678 +EXPORT_SYMBOL(in_serving_softirq);
23680 +/* Called with preemption disabled */
23681 +static void run_ksoftirqd(unsigned int cpu)
23683 + local_irq_disable();
23684 + current->softirq_nestcnt++;
23686 + do_current_softirqs();
23687 + current->softirq_nestcnt--;
23688 + local_irq_enable();
23689 + cond_resched_rcu_qs();
23693 + * Called from netif_rx_ni(). Preemption enabled, but migration
23694 + * disabled. So the cpu can't go away under us.
23696 +void thread_do_softirq(void)
23698 + if (!in_serving_softirq() && current->softirqs_raised) {
23699 + current->softirq_nestcnt++;
23700 + do_current_softirqs();
23701 + current->softirq_nestcnt--;
23705 +static void do_raise_softirq_irqoff(unsigned int nr)
23707 + unsigned int mask;
23709 + mask = 1UL << nr;
23711 + trace_softirq_raise(nr);
23712 + or_softirq_pending(mask);
23715 + * If we are not in a hard interrupt and inside a bh disabled
23716 + * region, we simply raise the flag on current. local_bh_enable()
23717 + * will make sure that the softirq is executed. Otherwise we
23718 + * delegate it to ksoftirqd.
23720 + if (!in_irq() && current->softirq_nestcnt)
23721 + current->softirqs_raised |= mask;
23722 + else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
23725 + if (mask & TIMER_SOFTIRQS)
23726 + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
23728 + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
23731 +static void wakeup_proper_softirq(unsigned int nr)
23733 + if ((1UL << nr) & TIMER_SOFTIRQS)
23734 + wakeup_timer_softirqd();
23736 + wakeup_softirqd();
23739 +void __raise_softirq_irqoff(unsigned int nr)
23741 + do_raise_softirq_irqoff(nr);
23742 + if (!in_irq() && !current->softirq_nestcnt)
23743 + wakeup_proper_softirq(nr);
23747 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
23749 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
23751 + unsigned int mask;
23753 + if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
23754 + !__this_cpu_read(ktimer_softirqd)))
23756 + mask = 1UL << nr;
23758 + trace_softirq_raise(nr);
23759 + or_softirq_pending(mask);
23760 + if (mask & TIMER_SOFTIRQS)
23761 + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
23763 + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
23764 + wakeup_proper_softirq(nr);
23768 + * This function must run with irqs disabled!
23770 +void raise_softirq_irqoff(unsigned int nr)
23772 + do_raise_softirq_irqoff(nr);
23775 + * If we're in a hard interrupt we let the irq return code deal
23776 + * with the wakeup of ksoftirqd.
23781 + * If we are in thread context but outside of a bh disabled
23782 + * region, we need to wake ksoftirqd as well.
23784 + * CHECKME: Some of the places which do that could be wrapped
23785 + * into local_bh_disable/enable pairs. Though it's unclear
23786 + * whether this is worth the effort. To find those places just
23787 + * raise a WARN() if the condition is met.
23789 + if (!current->softirq_nestcnt)
23790 + wakeup_proper_softirq(nr);
23793 +static inline int ksoftirqd_softirq_pending(void)
23795 + return current->softirqs_raised;
23798 +static inline void local_bh_disable_nort(void) { }
23799 +static inline void _local_bh_enable_nort(void) { }
23801 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
23803 + /* Take over all but timer pending softirqs when starting */
23804 + local_irq_disable();
23805 + current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
23806 + local_irq_enable();
23809 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
23811 + struct sched_param param = { .sched_priority = 1 };
23813 + sched_setscheduler(current, SCHED_FIFO, &param);
23815 + /* Take over timer pending softirqs when starting */
23816 + local_irq_disable();
23817 + current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
23818 + local_irq_enable();
23821 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
23824 + struct sched_param param = { .sched_priority = 0 };
23826 + sched_setscheduler(current, SCHED_NORMAL, &param);
23829 +static int ktimer_softirqd_should_run(unsigned int cpu)
23831 + return current->softirqs_raised;
23834 +#endif /* PREEMPT_RT_FULL */
23836 * Enter an interrupt context.
23838 void irq_enter(void)
23839 @@ -345,9 +789,9 @@
23840 * Prevent raise_softirq from needlessly waking up ksoftirqd
23841 * here, as softirq will be serviced on return from interrupt.
23843 - local_bh_disable();
23844 + local_bh_disable_nort();
23846 - _local_bh_enable();
23847 + _local_bh_enable_nort();
23851 @@ -355,6 +799,7 @@
23853 static inline void invoke_softirq(void)
23855 +#ifndef CONFIG_PREEMPT_RT_FULL
23856 if (ksoftirqd_running(local_softirq_pending()))
23859 @@ -377,6 +822,18 @@
23863 +#else /* PREEMPT_RT_FULL */
23864 + unsigned long flags;
23866 + local_irq_save(flags);
23867 + if (__this_cpu_read(ksoftirqd) &&
23868 + __this_cpu_read(ksoftirqd)->softirqs_raised)
23869 + wakeup_softirqd();
23870 + if (__this_cpu_read(ktimer_softirqd) &&
23871 + __this_cpu_read(ktimer_softirqd)->softirqs_raised)
23872 + wakeup_timer_softirqd();
23873 + local_irq_restore(flags);
23877 static inline void tick_irq_exit(void)
23878 @@ -385,7 +842,13 @@
23879 int cpu = smp_processor_id();
23881 /* Make sure that timer wheel updates are propagated */
23882 - if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
23883 +#ifdef CONFIG_PREEMPT_RT_BASE
23884 + if ((idle_cpu(cpu) || tick_nohz_full_cpu(cpu)) &&
23885 + !need_resched() && !local_softirq_pending())
23887 + if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu))
23891 tick_nohz_irq_exit();
23893 @@ -413,26 +876,6 @@
23894 trace_hardirq_exit(); /* must be last! */
23898 - * This function must run with irqs disabled!
23900 -inline void raise_softirq_irqoff(unsigned int nr)
23902 - __raise_softirq_irqoff(nr);
23905 - * If we're in an interrupt or softirq, we're done
23906 - * (this also catches softirq-disabled code). We will
23907 - * actually run the softirq once we return from
23908 - * the irq or softirq.
23910 - * Otherwise we wake up ksoftirqd to make sure we
23911 - * schedule the softirq soon.
23913 - if (!in_interrupt())
23914 - wakeup_softirqd();
23917 void raise_softirq(unsigned int nr)
23919 unsigned long flags;
23920 @@ -442,12 +885,6 @@
23921 local_irq_restore(flags);
23924 -void __raise_softirq_irqoff(unsigned int nr)
23926 - trace_softirq_raise(nr);
23927 - or_softirq_pending(1UL << nr);
23930 void open_softirq(int nr, void (*action)(struct softirq_action *))
23932 softirq_vec[nr].action = action;
23933 @@ -464,15 +901,45 @@
23934 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
23935 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
23937 +static inline void
23938 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
23940 + if (tasklet_trylock(t)) {
23942 + /* We may have been preempted before tasklet_trylock
23943 + * and __tasklet_action may have already run.
23944 + * So double check the sched bit while the takslet
23945 + * is locked before adding it to the list.
23947 + if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
23950 + head->tail = &(t->next);
23951 + raise_softirq_irqoff(nr);
23952 + tasklet_unlock(t);
23954 + /* This is subtle. If we hit the corner case above,
23955 + * it is possible that we get preempted right here,
23956 + * and another task has successfully called
23957 + * tasklet_schedule(), then this function, and
23958 + * failed on the trylock. Thus we must be sure
23959 + * before releasing the tasklet lock, that the
23960 + * SCHED_BIT is clear. Otherwise the tasklet
23961 + * may get its SCHED_BIT set, but not added to the
23964 + if (!tasklet_tryunlock(t))
23970 void __tasklet_schedule(struct tasklet_struct *t)
23972 unsigned long flags;
23974 local_irq_save(flags);
23976 - *__this_cpu_read(tasklet_vec.tail) = t;
23977 - __this_cpu_write(tasklet_vec.tail, &(t->next));
23978 - raise_softirq_irqoff(TASKLET_SOFTIRQ);
23979 + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
23980 local_irq_restore(flags);
23982 EXPORT_SYMBOL(__tasklet_schedule);
23983 @@ -482,50 +949,108 @@
23984 unsigned long flags;
23986 local_irq_save(flags);
23988 - *__this_cpu_read(tasklet_hi_vec.tail) = t;
23989 - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
23990 - raise_softirq_irqoff(HI_SOFTIRQ);
23991 + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
23992 local_irq_restore(flags);
23994 EXPORT_SYMBOL(__tasklet_hi_schedule);
23996 -static __latent_entropy void tasklet_action(struct softirq_action *a)
23997 +void tasklet_enable(struct tasklet_struct *t)
23999 - struct tasklet_struct *list;
24000 + if (!atomic_dec_and_test(&t->count))
24002 + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
24003 + tasklet_schedule(t);
24005 +EXPORT_SYMBOL(tasklet_enable);
24007 - local_irq_disable();
24008 - list = __this_cpu_read(tasklet_vec.head);
24009 - __this_cpu_write(tasklet_vec.head, NULL);
24010 - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24011 - local_irq_enable();
24012 +static void __tasklet_action(struct softirq_action *a,
24013 + struct tasklet_struct *list)
24015 + int loops = 1000000;
24018 struct tasklet_struct *t = list;
24022 - if (tasklet_trylock(t)) {
24023 - if (!atomic_read(&t->count)) {
24024 - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24027 - t->func(t->data);
24028 - tasklet_unlock(t);
24031 - tasklet_unlock(t);
24033 + * Should always succeed - after a tasklet got on the
24034 + * list (after getting the SCHED bit set from 0 to 1),
24035 + * nothing but the tasklet softirq it got queued to can
24038 + if (!tasklet_trylock(t)) {
24043 - local_irq_disable();
24045 - *__this_cpu_read(tasklet_vec.tail) = t;
24046 - __this_cpu_write(tasklet_vec.tail, &(t->next));
24047 - __raise_softirq_irqoff(TASKLET_SOFTIRQ);
24048 - local_irq_enable();
24051 + * If we cannot handle the tasklet because it's disabled,
24052 + * mark it as pending. tasklet_enable() will later
24053 + * re-schedule the tasklet.
24055 + if (unlikely(atomic_read(&t->count))) {
24057 + /* implicit unlock: */
24059 + t->state = TASKLET_STATEF_PENDING;
24064 + * After this point on the tasklet might be rescheduled
24065 + * on another CPU, but it can only be added to another
24066 + * CPU's tasklet list if we unlock the tasklet (which we
24069 + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24073 + t->func(t->data);
24076 + * Try to unlock the tasklet. We must use cmpxchg, because
24077 + * another CPU might have scheduled or disabled the tasklet.
24078 + * We only allow the STATE_RUN -> 0 transition here.
24080 + while (!tasklet_tryunlock(t)) {
24082 + * If it got disabled meanwhile, bail out:
24084 + if (atomic_read(&t->count))
24085 + goto out_disabled;
24087 + * If it got scheduled meanwhile, re-execute
24088 + * the tasklet function:
24090 + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24093 + printk("hm, tasklet state: %08lx\n", t->state);
24095 + tasklet_unlock(t);
24102 +static __latent_entropy void tasklet_action(struct softirq_action *a)
24104 + struct tasklet_struct *list;
24106 + local_irq_disable();
24107 + list = __this_cpu_read(tasklet_vec.head);
24108 + __this_cpu_write(tasklet_vec.head, NULL);
24109 + __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24110 + local_irq_enable();
24112 + __tasklet_action(a, list);
24115 static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
24117 struct tasklet_struct *list;
24118 @@ -536,30 +1061,7 @@
24119 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
24120 local_irq_enable();
24123 - struct tasklet_struct *t = list;
24125 - list = list->next;
24127 - if (tasklet_trylock(t)) {
24128 - if (!atomic_read(&t->count)) {
24129 - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24132 - t->func(t->data);
24133 - tasklet_unlock(t);
24136 - tasklet_unlock(t);
24139 - local_irq_disable();
24141 - *__this_cpu_read(tasklet_hi_vec.tail) = t;
24142 - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24143 - __raise_softirq_irqoff(HI_SOFTIRQ);
24144 - local_irq_enable();
24146 + __tasklet_action(a, list);
24149 void tasklet_init(struct tasklet_struct *t,
24150 @@ -580,7 +1082,7 @@
24152 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
24156 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
24158 tasklet_unlock_wait(t);
24159 @@ -588,57 +1090,6 @@
24161 EXPORT_SYMBOL(tasklet_kill);
24164 - * tasklet_hrtimer
24168 - * The trampoline is called when the hrtimer expires. It schedules a tasklet
24169 - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
24170 - * hrtimer callback, but from softirq context.
24172 -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
24174 - struct tasklet_hrtimer *ttimer =
24175 - container_of(timer, struct tasklet_hrtimer, timer);
24177 - tasklet_hi_schedule(&ttimer->tasklet);
24178 - return HRTIMER_NORESTART;
24182 - * Helper function which calls the hrtimer callback from
24183 - * tasklet/softirq context
24185 -static void __tasklet_hrtimer_trampoline(unsigned long data)
24187 - struct tasklet_hrtimer *ttimer = (void *)data;
24188 - enum hrtimer_restart restart;
24190 - restart = ttimer->function(&ttimer->timer);
24191 - if (restart != HRTIMER_NORESTART)
24192 - hrtimer_restart(&ttimer->timer);
24196 - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
24197 - * @ttimer: tasklet_hrtimer which is initialized
24198 - * @function: hrtimer callback function which gets called from softirq context
24199 - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
24200 - * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
24202 -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
24203 - enum hrtimer_restart (*function)(struct hrtimer *),
24204 - clockid_t which_clock, enum hrtimer_mode mode)
24206 - hrtimer_init(&ttimer->timer, which_clock, mode);
24207 - ttimer->timer.function = __hrtimer_tasklet_trampoline;
24208 - tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
24209 - (unsigned long)ttimer);
24210 - ttimer->function = function;
24212 -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
24214 void __init softirq_init(void)
24217 @@ -654,25 +1105,26 @@
24218 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
24221 -static int ksoftirqd_should_run(unsigned int cpu)
24223 - return local_softirq_pending();
24226 -static void run_ksoftirqd(unsigned int cpu)
24227 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
24228 +void tasklet_unlock_wait(struct tasklet_struct *t)
24230 - local_irq_disable();
24231 - if (local_softirq_pending()) {
24232 + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
24234 - * We can safely run softirq on inline stack, as we are not deep
24235 - * in the task stack here.
24236 + * Hack for now to avoid this busy-loop:
24239 - local_irq_enable();
24240 - cond_resched_rcu_qs();
24242 +#ifdef CONFIG_PREEMPT_RT_FULL
24248 - local_irq_enable();
24250 +EXPORT_SYMBOL(tasklet_unlock_wait);
24253 +static int ksoftirqd_should_run(unsigned int cpu)
24255 + return ksoftirqd_softirq_pending();
24258 #ifdef CONFIG_HOTPLUG_CPU
24259 @@ -739,17 +1191,31 @@
24261 static struct smp_hotplug_thread softirq_threads = {
24262 .store = &ksoftirqd,
24263 + .setup = ksoftirqd_set_sched_params,
24264 .thread_should_run = ksoftirqd_should_run,
24265 .thread_fn = run_ksoftirqd,
24266 .thread_comm = "ksoftirqd/%u",
24269 +#ifdef CONFIG_PREEMPT_RT_FULL
24270 +static struct smp_hotplug_thread softirq_timer_threads = {
24271 + .store = &ktimer_softirqd,
24272 + .setup = ktimer_softirqd_set_sched_params,
24273 + .cleanup = ktimer_softirqd_clr_sched_params,
24274 + .thread_should_run = ktimer_softirqd_should_run,
24275 + .thread_fn = run_ksoftirqd,
24276 + .thread_comm = "ktimersoftd/%u",
24280 static __init int spawn_ksoftirqd(void)
24282 cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
24283 takeover_tasklets);
24284 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
24286 +#ifdef CONFIG_PREEMPT_RT_FULL
24287 + BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
24291 early_initcall(spawn_ksoftirqd);
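On PREEMPT_RT_FULL the softirq machinery above turns local_bh_disable()/local_bh_enable() into a softirq_nestcnt/migrate_disable() pair, records softirqs raised inside such a section in current->softirqs_raised, and runs them synchronously from __local_bh_enable() instead of waking ksoftirqd. A short sketch of the resulting pattern (ordinary kernel code, not part of the patch):

#include <linux/bottom_half.h>
#include <linux/interrupt.h>

static void rt_bh_section_example(void)
{
	local_bh_disable();	/* RT: softirq_nestcnt++, migrate_disable() */

	/* A softirq raised in here only sets a bit in
	 * current->softirqs_raised on PREEMPT_RT_FULL; nothing runs yet,
	 * because we are inside a bh-disabled section. */
	raise_softirq(TASKLET_SOFTIRQ);

	local_bh_enable();	/* RT: the raised softirqs run here, in this
				 * task's context, before migrate_enable() */
}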
24292 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/stop_machine.c linux-4.14/kernel/stop_machine.c
24293 --- linux-4.14.orig/kernel/stop_machine.c 2018-09-05 11:03:22.000000000 +0200
24294 +++ linux-4.14/kernel/stop_machine.c 2018-09-05 11:05:07.000000000 +0200
24295 @@ -496,6 +496,8 @@
24296 struct cpu_stop_done *done = work->done;
24301 /* cpu stop callbacks must not sleep, make in_atomic() == T */
24302 preempt_count_inc();
24304 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/alarmtimer.c linux-4.14/kernel/time/alarmtimer.c
24305 --- linux-4.14.orig/kernel/time/alarmtimer.c 2018-09-05 11:03:22.000000000 +0200
24306 +++ linux-4.14/kernel/time/alarmtimer.c 2018-09-05 11:05:07.000000000 +0200
24307 @@ -436,7 +436,7 @@
24308 int ret = alarm_try_to_cancel(alarm);
24312 + hrtimer_wait_for_timer(&alarm->timer);
24315 EXPORT_SYMBOL_GPL(alarm_cancel);
24316 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/hrtimer.c linux-4.14/kernel/time/hrtimer.c
24317 --- linux-4.14.orig/kernel/time/hrtimer.c 2018-09-05 11:03:22.000000000 +0200
24318 +++ linux-4.14/kernel/time/hrtimer.c 2018-09-05 11:05:07.000000000 +0200
24320 #include "tick-internal.h"
24323 + * Masks for selecting the soft and hard context timers from
24324 + * cpu_base->active
24326 +#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
24327 +#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
24328 +#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
24329 +#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
24334 * There are more clockids than hrtimer bases. Thus, we index
24336 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
24338 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
24339 - .seq = SEQCNT_ZERO(hrtimer_bases.seq),
24343 @@ -93,6 +101,26 @@
24344 .clockid = CLOCK_TAI,
24345 .get_time = &ktime_get_clocktai,
24348 + .index = HRTIMER_BASE_MONOTONIC_SOFT,
24349 + .clockid = CLOCK_MONOTONIC,
24350 + .get_time = &ktime_get,
24353 + .index = HRTIMER_BASE_REALTIME_SOFT,
24354 + .clockid = CLOCK_REALTIME,
24355 + .get_time = &ktime_get_real,
24358 + .index = HRTIMER_BASE_BOOTTIME_SOFT,
24359 + .clockid = CLOCK_BOOTTIME,
24360 + .get_time = &ktime_get_boottime,
24363 + .index = HRTIMER_BASE_TAI_SOFT,
24364 + .clockid = CLOCK_TAI,
24365 + .get_time = &ktime_get_clocktai,
24370 @@ -118,7 +146,6 @@
24371 * timer->base->cpu_base
24373 static struct hrtimer_cpu_base migration_cpu_base = {
24374 - .seq = SEQCNT_ZERO(migration_cpu_base),
24375 .clock_base = { { .cpu_base = &migration_cpu_base, }, },
24378 @@ -156,45 +183,33 @@
24382 - * With HIGHRES=y we do not migrate the timer when it is expiring
24383 - * before the next event on the target cpu because we cannot reprogram
24384 - * the target cpu hardware and we would cause it to fire late.
24385 + * We do not migrate the timer when it is expiring before the next
24386 + * event on the target cpu. When high resolution is enabled, we cannot
24387 + * reprogram the target cpu hardware and we would cause it to fire
24388 + * late. To keep it simple, we handle the high resolution enabled and
24389 + * disabled cases similarly.
24391 * Called with cpu_base->lock of target cpu held.
24394 hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
24396 -#ifdef CONFIG_HIGH_RES_TIMERS
24399 - if (!new_base->cpu_base->hres_active)
24402 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
24403 - return expires <= new_base->cpu_base->expires_next;
24407 + return expires < new_base->cpu_base->expires_next;
24410 -#ifdef CONFIG_NO_HZ_COMMON
24412 -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
24415 - if (pinned || !base->migration_enabled)
24417 - return &per_cpu(hrtimer_bases, get_nohz_timer_target());
24421 struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
24424 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
24425 + if (static_branch_unlikely(&timers_migration_enabled) && !pinned)
24426 + return &per_cpu(hrtimer_bases, get_nohz_timer_target());
24433 * We switch the timer base to a power-optimized selected CPU target,
24434 @@ -396,7 +411,8 @@
24435 debug_object_init(timer, &hrtimer_debug_descr);
24438 -static inline void debug_hrtimer_activate(struct hrtimer *timer)
24439 +static inline void debug_hrtimer_activate(struct hrtimer *timer,
24440 + enum hrtimer_mode mode)
24442 debug_object_activate(timer, &hrtimer_debug_descr);
24444 @@ -429,8 +445,10 @@
24445 EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
24449 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
24450 -static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
24451 +static inline void debug_hrtimer_activate(struct hrtimer *timer,
24452 + enum hrtimer_mode mode) { }
24453 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
24456 @@ -442,10 +460,11 @@
24457 trace_hrtimer_init(timer, clockid, mode);
24460 -static inline void debug_activate(struct hrtimer *timer)
24461 +static inline void debug_activate(struct hrtimer *timer,
24462 + enum hrtimer_mode mode)
24464 - debug_hrtimer_activate(timer);
24465 - trace_hrtimer_start(timer);
24466 + debug_hrtimer_activate(timer, mode);
24467 + trace_hrtimer_start(timer, mode);
24470 static inline void debug_deactivate(struct hrtimer *timer)
24471 @@ -454,35 +473,43 @@
24472 trace_hrtimer_cancel(timer);
24475 -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
24476 -static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
24477 - struct hrtimer *timer)
24478 +static struct hrtimer_clock_base *
24479 +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
24481 -#ifdef CONFIG_HIGH_RES_TIMERS
24482 - cpu_base->next_timer = timer;
24484 + unsigned int idx;
24489 + idx = __ffs(*active);
24490 + *active &= ~(1U << idx);
24492 + return &cpu_base->clock_base[idx];
24495 -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
24496 +#define for_each_active_base(base, cpu_base, active) \
24497 + while ((base = __next_base((cpu_base), &(active))))
24499 +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
24500 + unsigned int active,
24501 + ktime_t expires_next)
24503 - struct hrtimer_clock_base *base = cpu_base->clock_base;
24504 - unsigned int active = cpu_base->active_bases;
24505 - ktime_t expires, expires_next = KTIME_MAX;
24506 + struct hrtimer_clock_base *base;
24509 - hrtimer_update_next_timer(cpu_base, NULL);
24510 - for (; active; base++, active >>= 1) {
24511 + for_each_active_base(base, cpu_base, active) {
24512 struct timerqueue_node *next;
24513 struct hrtimer *timer;
24515 - if (!(active & 0x01))
24518 next = timerqueue_getnext(&base->active);
24519 timer = container_of(next, struct hrtimer, node);
24520 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
24521 if (expires < expires_next) {
24522 expires_next = expires;
24523 - hrtimer_update_next_timer(cpu_base, timer);
24524 + if (timer->is_soft)
24525 + cpu_base->softirq_next_timer = timer;
24527 + cpu_base->next_timer = timer;
24531 @@ -494,7 +521,47 @@
24533 return expires_next;
24538 + * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
24539 + * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
24541 + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
24542 + * those timers will get run whenever the softirq gets handled, at the end of
24543 + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
24545 + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
24546 + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
24547 + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
24549 + * @active_mask must be one of:
24550 + * - HRTIMER_ACTIVE_ALL,
24551 + * - HRTIMER_ACTIVE_SOFT, or
24552 + * - HRTIMER_ACTIVE_HARD.
24555 +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
24557 + unsigned int active;
24558 + struct hrtimer *next_timer = NULL;
24559 + ktime_t expires_next = KTIME_MAX;
24561 + if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
24562 + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
24563 + cpu_base->softirq_next_timer = NULL;
24564 + expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
24566 + next_timer = cpu_base->softirq_next_timer;
24569 + if (active_mask & HRTIMER_ACTIVE_HARD) {
24570 + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
24571 + cpu_base->next_timer = next_timer;
24572 + expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
24575 + return expires_next;
24578 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
24580 @@ -502,36 +569,14 @@
24581 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
24582 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
24584 - return ktime_get_update_offsets_now(&base->clock_was_set_seq,
24585 + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
24586 offs_real, offs_boot, offs_tai);
24589 -/* High resolution timer related functions */
24590 -#ifdef CONFIG_HIGH_RES_TIMERS
24593 - * High resolution timer enabled ?
24595 -static bool hrtimer_hres_enabled __read_mostly = true;
24596 -unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
24597 -EXPORT_SYMBOL_GPL(hrtimer_resolution);
24600 - * Enable / Disable high resolution mode
24602 -static int __init setup_hrtimer_hres(char *str)
24604 - return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
24607 -__setup("highres=", setup_hrtimer_hres);
24608 + base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
24609 + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
24610 + base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
24613 - * hrtimer_high_res_enabled - query, if the highres mode is enabled
24615 -static inline int hrtimer_is_hres_enabled(void)
24617 - return hrtimer_hres_enabled;
24622 @@ -539,7 +584,8 @@
24624 static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
24626 - return cpu_base->hres_active;
24627 + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
24628 + cpu_base->hres_active : 0;
24631 static inline int hrtimer_hres_active(void)
24632 @@ -557,10 +603,23 @@
24634 ktime_t expires_next;
24636 - if (!cpu_base->hres_active)
24639 + * Find the current next expiration time.
24641 + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
24643 - expires_next = __hrtimer_get_next_event(cpu_base);
24644 + if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
24646 + * When the softirq is activated, hrtimer has to be
24647 + * programmed with the first hard hrtimer because soft
24648 + * timer interrupt could occur too late.
24650 + if (cpu_base->softirq_activated)
24651 + expires_next = __hrtimer_get_next_event(cpu_base,
24652 + HRTIMER_ACTIVE_HARD);
24654 + cpu_base->softirq_expires_next = expires_next;
24657 if (skip_equal && expires_next == cpu_base->expires_next)
24659 @@ -568,6 +627,9 @@
24660 cpu_base->expires_next = expires_next;
24663 + * If hres is not active, hardware does not have to be
24664 + * reprogrammed yet.
24666 * If a hang was detected in the last timer interrupt then we
24667 * leave the hang delay active in the hardware. We want the
24668 * system to make progress. That also prevents the following
24669 @@ -581,83 +643,38 @@
24670 * set. So we'd effectively block all timers until the T2 event
24673 - if (cpu_base->hang_detected)
24674 + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
24677 tick_program_event(cpu_base->expires_next, 1);
24680 +/* High resolution timer related functions */
24681 +#ifdef CONFIG_HIGH_RES_TIMERS
24684 - * When a timer is enqueued and expires earlier than the already enqueued
24685 - * timers, we have to check, whether it expires earlier than the timer for
24686 - * which the clock event device was armed.
24688 - * Called with interrupts disabled and base->cpu_base.lock held
24689 + * High resolution timer enabled ?
24691 -static void hrtimer_reprogram(struct hrtimer *timer,
24692 - struct hrtimer_clock_base *base)
24694 - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
24695 - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
24697 - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
24700 - * If the timer is not on the current cpu, we cannot reprogram
24701 - * the other cpus clock event device.
24703 - if (base->cpu_base != cpu_base)
24707 - * If the hrtimer interrupt is running, then it will
24708 - * reevaluate the clock bases and reprogram the clock event
24709 - * device. The callbacks are always executed in hard interrupt
24710 - * context so we don't need an extra check for a running
24713 - if (cpu_base->in_hrtirq)
24717 - * CLOCK_REALTIME timer might be requested with an absolute
24718 - * expiry time which is less than base->offset. Set it to 0.
24723 - if (expires >= cpu_base->expires_next)
24726 - /* Update the pointer to the next expiring timer */
24727 - cpu_base->next_timer = timer;
24730 - * If a hang was detected in the last timer interrupt then we
24731 - * do not schedule a timer which is earlier than the expiry
24732 - * which we enforced in the hang detection. We want the system
24733 - * to make progress.
24735 - if (cpu_base->hang_detected)
24737 +static bool hrtimer_hres_enabled __read_mostly = true;
24738 +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
24739 +EXPORT_SYMBOL_GPL(hrtimer_resolution);
24742 - * Program the timer hardware. We enforce the expiry for
24743 - * events which are already in the past.
24745 - cpu_base->expires_next = expires;
24746 - tick_program_event(expires, 1);
24748 + * Enable / Disable high resolution mode
24750 +static int __init setup_hrtimer_hres(char *str)
24752 + return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
24755 +__setup("highres=", setup_hrtimer_hres);
24758 - * Initialize the high resolution related parts of cpu_base
24759 + * hrtimer_high_res_enabled - query, if the highres mode is enabled
24761 -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
24762 +static inline int hrtimer_is_hres_enabled(void)
24764 - base->expires_next = KTIME_MAX;
24765 - base->hang_detected = 0;
24766 - base->hres_active = 0;
24767 - base->next_timer = NULL;
24768 + return hrtimer_hres_enabled;
24772 @@ -669,7 +686,7 @@
24774 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
24776 - if (!base->hres_active)
24777 + if (!__hrtimer_hres_active(base))
24780 raw_spin_lock(&base->lock);
24781 @@ -698,6 +715,29 @@
24782 retrigger_next_event(NULL);
24785 +#ifdef CONFIG_PREEMPT_RT_FULL
24787 +static struct swork_event clock_set_delay_work;
24789 +static void run_clock_set_delay(struct swork_event *event)
24794 +void clock_was_set_delayed(void)
24796 + swork_queue(&clock_set_delay_work);
24799 +static __init int create_clock_set_delay_thread(void)
24801 + WARN_ON(swork_get());
24802 + INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
24805 +early_initcall(create_clock_set_delay_thread);
24806 +#else /* PREEMPT_RT_FULL */
24808 static void clock_was_set_work(struct work_struct *work)
24811 @@ -713,26 +753,106 @@
24813 schedule_work(&hrtimer_work);
24819 -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
24820 -static inline int hrtimer_hres_active(void) { return 0; }
24821 static inline int hrtimer_is_hres_enabled(void) { return 0; }
24822 static inline void hrtimer_switch_to_hres(void) { }
24823 -static inline void
24824 -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
24825 -static inline int hrtimer_reprogram(struct hrtimer *timer,
24826 - struct hrtimer_clock_base *base)
24830 -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
24831 static inline void retrigger_next_event(void *arg) { }
24833 #endif /* CONFIG_HIGH_RES_TIMERS */
24836 + * When a timer is enqueued and expires earlier than the already enqueued
24837 + * timers, we have to check, whether it expires earlier than the timer for
24838 + * which the clock event device was armed.
24840 + * Called with interrupts disabled and base->cpu_base.lock held
24842 +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
24844 + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
24845 + struct hrtimer_clock_base *base = timer->base;
24846 + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
24848 + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
24851 + * CLOCK_REALTIME timer might be requested with an absolute
24852 + * expiry time which is less than base->offset. Set it to 0.
24857 + if (timer->is_soft) {
24859 + * A soft hrtimer could be started on a remote CPU. In this
24860 + * case softirq_expires_next needs to be updated on the
24861 + * remote CPU. The soft hrtimer will not expire before the
24862 + * first hard hrtimer on the remote CPU -
24863 + * hrtimer_check_target() prevents this case.
24865 + struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
24867 + if (timer_cpu_base->softirq_activated)
24870 + if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
24873 + timer_cpu_base->softirq_next_timer = timer;
24874 + timer_cpu_base->softirq_expires_next = expires;
24876 + if (!ktime_before(expires, timer_cpu_base->expires_next) ||
24882 + * If the timer is not on the current CPU, we cannot reprogram
24883 + * the other CPU's clock event device.
24885 + if (base->cpu_base != cpu_base)
24889 + * If the hrtimer interrupt is running, then it will
24890 + * reevaluate the clock bases and reprogram the clock event
24891 + * device. The callbacks are always executed in hard interrupt
24892 + * context so we don't need an extra check for a running
24895 + if (cpu_base->in_hrtirq)
24898 + if (expires >= cpu_base->expires_next)
24901 + /* Update the pointer to the next expiring timer */
24902 + cpu_base->next_timer = timer;
24903 + cpu_base->expires_next = expires;
24906 + * If hres is not active, hardware does not have to be
24907 + * programmed yet.
24909 + * If a hang was detected in the last timer interrupt then we
24910 + * do not schedule a timer which is earlier than the expiry
24911 + * which we enforced in the hang detection. We want the system
24912 + * to make progress.
24914 + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
24918 + * Program the timer hardware. We enforce the expiry for
24919 + * events which are already in the past.
24921 + tick_program_event(expires, 1);
24925 * Clock realtime was set
24927 * Change the offset of the realtime clock vs. the monotonic
24928 @@ -830,6 +950,33 @@
24930 EXPORT_SYMBOL_GPL(hrtimer_forward);
24932 +#ifdef CONFIG_PREEMPT_RT_BASE
24933 +# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
24936 + * hrtimer_wait_for_timer - Wait for a running timer
24938 + * @timer: timer to wait for
24940 + * The function waits on the waitqueue of the timer base in case
24941 + * the timer's callback function is currently executing. The
24942 + * waitqueue is woken up after the timer callback function has
24943 + * finished execution.
24945 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
24947 + struct hrtimer_clock_base *base = timer->base;
24949 + if (base && base->cpu_base &&
24950 + base->index >= HRTIMER_BASE_MONOTONIC_SOFT)
24951 + wait_event(base->cpu_base->wait,
24952 + !(hrtimer_callback_running(timer)));
24956 +# define wake_up_timer_waiters(b) do { } while (0)
24960 * enqueue_hrtimer - internal function to (re)start a timer
24962 @@ -839,9 +986,10 @@
24963 * Returns 1 when the new timer is the leftmost timer in the tree.
24965 static int enqueue_hrtimer(struct hrtimer *timer,
24966 - struct hrtimer_clock_base *base)
24967 + struct hrtimer_clock_base *base,
24968 + enum hrtimer_mode mode)
24970 - debug_activate(timer);
24971 + debug_activate(timer, mode);
24973 base->cpu_base->active_bases |= 1 << base->index;
24975 @@ -874,7 +1022,6 @@
24976 if (!timerqueue_del(&base->active, &timer->node))
24977 cpu_base->active_bases &= ~(1 << base->index);
24979 -#ifdef CONFIG_HIGH_RES_TIMERS
24981 * Note: If reprogram is false we do not update
24982 * cpu_base->next_timer. This happens when we remove the first
24983 @@ -885,7 +1032,6 @@
24985 if (reprogram && timer == cpu_base->next_timer)
24986 hrtimer_force_reprogram(cpu_base, 1);
24991 @@ -934,22 +1080,36 @@
24996 - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
24997 - * @timer: the timer to be added
24998 - * @tim: expiry time
24999 - * @delta_ns: "slack" range for the timer
25000 - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
25001 - * relative (HRTIMER_MODE_REL)
25003 -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25004 - u64 delta_ns, const enum hrtimer_mode mode)
25006 +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
25008 - struct hrtimer_clock_base *base, *new_base;
25009 - unsigned long flags;
25013 - base = lock_hrtimer_base(timer, &flags);
25015 + * Find the next SOFT expiration.
25017 + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
25020 + * reprogramming needs to be triggered, even if the next soft
25021 + * hrtimer expires at the same time as the next hard
25022 + * hrtimer. cpu_base->softirq_expires_next needs to be updated!
25024 + if (expires == KTIME_MAX)
25028 + * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
25029 + * cpu_base->*expires_next is only set by hrtimer_reprogram()
25031 + hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
25034 +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25035 + u64 delta_ns, const enum hrtimer_mode mode,
25036 + struct hrtimer_clock_base *base)
25038 + struct hrtimer_clock_base *new_base;
25040 /* Remove an active timer from the queue: */
25041 remove_hrtimer(timer, base, true);
25042 @@ -964,21 +1124,37 @@
25043 /* Switch the timer base, if necessary: */
25044 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25046 - leftmost = enqueue_hrtimer(timer, new_base);
25049 + return enqueue_hrtimer(timer, new_base, mode);
25053 + * hrtimer_start_range_ns - (re)start an hrtimer
25054 + * @timer: the timer to be added
25055 + * @tim: expiry time
25056 + * @delta_ns: "slack" range for the timer
25057 + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
25058 + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
25059 + * softirq based mode is considered for debug purpose only!
25061 +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25062 + u64 delta_ns, const enum hrtimer_mode mode)
25064 + struct hrtimer_clock_base *base;
25065 + unsigned long flags;
25068 + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
25071 +#ifndef CONFIG_PREEMPT_RT_BASE
25072 + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
25075 + base = lock_hrtimer_base(timer, &flags);
25077 + if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
25078 + hrtimer_reprogram(timer, true);
25080 - if (!hrtimer_is_hres_active(timer)) {
25082 - * Kick to reschedule the next tick to handle the new timer
25083 - * on dynticks target.
25085 - if (new_base->cpu_base->nohz_active)
25086 - wake_up_nohz_cpu(new_base->cpu_base->cpu);
25088 - hrtimer_reprogram(timer, new_base);
25091 unlock_hrtimer_base(timer, &flags);
25093 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
25094 @@ -1035,7 +1211,7 @@
25099 + hrtimer_wait_for_timer(timer);
25102 EXPORT_SYMBOL_GPL(hrtimer_cancel);
25103 @@ -1076,7 +1252,7 @@
25104 raw_spin_lock_irqsave(&cpu_base->lock, flags);
25106 if (!__hrtimer_hres_active(cpu_base))
25107 - expires = __hrtimer_get_next_event(cpu_base);
25108 + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25110 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25112 @@ -1099,8 +1275,16 @@
25113 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25114 enum hrtimer_mode mode)
25116 - struct hrtimer_cpu_base *cpu_base;
25119 + struct hrtimer_cpu_base *cpu_base;
25121 + softtimer = !!(mode & HRTIMER_MODE_SOFT);
25122 +#ifdef CONFIG_PREEMPT_RT_FULL
25123 + if (!softtimer && !(mode & HRTIMER_MODE_HARD))
25124 + softtimer = true;
25126 + base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
25128 memset(timer, 0, sizeof(struct hrtimer));
25130 @@ -1114,7 +1298,8 @@
25131 if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
25132 clock_id = CLOCK_MONOTONIC;
25134 - base = hrtimer_clockid_to_base(clock_id);
25135 + base += hrtimer_clockid_to_base(clock_id);
25136 + timer->is_soft = softtimer;
25137 timer->base = &cpu_base->clock_base[base];
25138 timerqueue_init(&timer->node);
25140 @@ -1123,7 +1308,13 @@
25141 * hrtimer_init - initialize a timer to the given clock
25142 * @timer: the timer to be initialized
25143 * @clock_id: the clock to be used
25144 - * @mode: timer mode abs/rel
25145 + * @mode: The modes which are relevant for initialization:
25146 + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
25147 + * HRTIMER_MODE_REL_SOFT
25149 + * The PINNED variants of the above can be handed in,
25150 + * but the PINNED bit is ignored as pinning happens
25151 + * when the hrtimer is started
25153 void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25154 enum hrtimer_mode mode)
25155 @@ -1142,19 +1333,19 @@
25157 bool hrtimer_active(const struct hrtimer *timer)
25159 - struct hrtimer_cpu_base *cpu_base;
25160 + struct hrtimer_clock_base *base;
25164 - cpu_base = READ_ONCE(timer->base->cpu_base);
25165 - seq = raw_read_seqcount_begin(&cpu_base->seq);
25166 + base = READ_ONCE(timer->base);
25167 + seq = raw_read_seqcount_begin(&base->seq);
25169 if (timer->state != HRTIMER_STATE_INACTIVE ||
25170 - cpu_base->running == timer)
25171 + base->running == timer)
25174 - } while (read_seqcount_retry(&cpu_base->seq, seq) ||
25175 - cpu_base != READ_ONCE(timer->base->cpu_base));
25176 + } while (read_seqcount_retry(&base->seq, seq) ||
25177 + base != READ_ONCE(timer->base));
25181 @@ -1180,7 +1371,8 @@
25183 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25184 struct hrtimer_clock_base *base,
25185 - struct hrtimer *timer, ktime_t *now)
25186 + struct hrtimer *timer, ktime_t *now,
25187 + unsigned long flags)
25189 enum hrtimer_restart (*fn)(struct hrtimer *);
25191 @@ -1188,16 +1380,16 @@
25192 lockdep_assert_held(&cpu_base->lock);
25194 debug_deactivate(timer);
25195 - cpu_base->running = timer;
25196 + base->running = timer;
25199 * Separate the ->running assignment from the ->state assignment.
25201 * As with a regular write barrier, this ensures the read side in
25202 - * hrtimer_active() cannot observe cpu_base->running == NULL &&
25203 + * hrtimer_active() cannot observe base->running == NULL &&
25204 * timer->state == INACTIVE.
25206 - raw_write_seqcount_barrier(&cpu_base->seq);
25207 + raw_write_seqcount_barrier(&base->seq);
25209 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25210 fn = timer->function;
25211 @@ -1211,15 +1403,15 @@
25212 timer->is_rel = false;
25215 - * Because we run timers from hardirq context, there is no chance
25216 - * they get migrated to another cpu, therefore its safe to unlock
25217 - * the timer base.
25218 + * The timer is marked as running in the cpu base, so it is
25219 + * protected against migration to a different CPU even if the lock
25222 - raw_spin_unlock(&cpu_base->lock);
25223 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25224 trace_hrtimer_expire_entry(timer, now);
25225 restart = fn(timer);
25226 trace_hrtimer_expire_exit(timer);
25227 - raw_spin_lock(&cpu_base->lock);
25228 + raw_spin_lock_irq(&cpu_base->lock);
25231 * Note: We clear the running state after enqueue_hrtimer and
25232 @@ -1232,33 +1424,31 @@
25234 if (restart != HRTIMER_NORESTART &&
25235 !(timer->state & HRTIMER_STATE_ENQUEUED))
25236 - enqueue_hrtimer(timer, base);
25237 + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
25240 * Separate the ->running assignment from the ->state assignment.
25242 * As with a regular write barrier, this ensures the read side in
25243 - * hrtimer_active() cannot observe cpu_base->running == NULL &&
25244 + * hrtimer_active() cannot observe base->running.timer == NULL &&
25245 * timer->state == INACTIVE.
25247 - raw_write_seqcount_barrier(&cpu_base->seq);
25248 + raw_write_seqcount_barrier(&base->seq);
25250 - WARN_ON_ONCE(cpu_base->running != timer);
25251 - cpu_base->running = NULL;
25252 + WARN_ON_ONCE(base->running != timer);
25253 + base->running = NULL;
25256 -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25257 +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
25258 + unsigned long flags, unsigned int active_mask)
25260 - struct hrtimer_clock_base *base = cpu_base->clock_base;
25261 - unsigned int active = cpu_base->active_bases;
25262 + struct hrtimer_clock_base *base;
25263 + unsigned int active = cpu_base->active_bases & active_mask;
25265 - for (; active; base++, active >>= 1) {
25266 + for_each_active_base(base, cpu_base, active) {
25267 struct timerqueue_node *node;
25270 - if (!(active & 0x01))
25273 basenow = ktime_add(now, base->offset);
25275 while ((node = timerqueue_getnext(&base->active))) {
25276 @@ -1281,11 +1471,29 @@
25277 if (basenow < hrtimer_get_softexpires_tv64(timer))
25280 - __run_hrtimer(cpu_base, base, timer, &basenow);
25281 + __run_hrtimer(cpu_base, base, timer, &basenow, flags);
25286 +static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
25288 + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25289 + unsigned long flags;
25292 + raw_spin_lock_irqsave(&cpu_base->lock, flags);
25294 + now = hrtimer_update_base(cpu_base);
25295 + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
25297 + cpu_base->softirq_activated = 0;
25298 + hrtimer_update_softirq_timer(cpu_base, true);
25300 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25301 + wake_up_timer_waiters(cpu_base);
25304 #ifdef CONFIG_HIGH_RES_TIMERS
25307 @@ -1296,13 +1504,14 @@
25309 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25310 ktime_t expires_next, now, entry_time, delta;
25311 + unsigned long flags;
25314 BUG_ON(!cpu_base->hres_active);
25315 cpu_base->nr_events++;
25316 dev->next_event = KTIME_MAX;
25318 - raw_spin_lock(&cpu_base->lock);
25319 + raw_spin_lock_irqsave(&cpu_base->lock, flags);
25320 entry_time = now = hrtimer_update_base(cpu_base);
25322 cpu_base->in_hrtirq = 1;
25323 @@ -1315,17 +1524,23 @@
25325 cpu_base->expires_next = KTIME_MAX;
25327 - __hrtimer_run_queues(cpu_base, now);
25328 + if (!ktime_before(now, cpu_base->softirq_expires_next)) {
25329 + cpu_base->softirq_expires_next = KTIME_MAX;
25330 + cpu_base->softirq_activated = 1;
25331 + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25334 + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
25336 /* Reevaluate the clock bases for the next expiry */
25337 - expires_next = __hrtimer_get_next_event(cpu_base);
25338 + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25340 * Store the new expiry value so the migration code can verify
25343 cpu_base->expires_next = expires_next;
25344 cpu_base->in_hrtirq = 0;
25345 - raw_spin_unlock(&cpu_base->lock);
25346 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25348 /* Reprogramming necessary ? */
25349 if (!tick_program_event(expires_next, 0)) {
25350 @@ -1346,7 +1561,7 @@
25351 * Acquire base lock for updating the offsets and retrieving
25352 * the current time.
25354 - raw_spin_lock(&cpu_base->lock);
25355 + raw_spin_lock_irqsave(&cpu_base->lock, flags);
25356 now = hrtimer_update_base(cpu_base);
25357 cpu_base->nr_retries++;
25359 @@ -1359,7 +1574,8 @@
25361 cpu_base->nr_hangs++;
25362 cpu_base->hang_detected = 1;
25363 - raw_spin_unlock(&cpu_base->lock);
25364 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25366 delta = ktime_sub(now, entry_time);
25367 if ((unsigned int)delta > cpu_base->max_hang_time)
25368 cpu_base->max_hang_time = (unsigned int) delta;
25369 @@ -1401,6 +1617,7 @@
25370 void hrtimer_run_queues(void)
25372 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25373 + unsigned long flags;
25376 if (__hrtimer_hres_active(cpu_base))
25377 @@ -1418,10 +1635,17 @@
25381 - raw_spin_lock(&cpu_base->lock);
25382 + raw_spin_lock_irqsave(&cpu_base->lock, flags);
25383 now = hrtimer_update_base(cpu_base);
25384 - __hrtimer_run_queues(cpu_base, now);
25385 - raw_spin_unlock(&cpu_base->lock);
25387 + if (!ktime_before(now, cpu_base->softirq_expires_next)) {
25388 + cpu_base->softirq_expires_next = KTIME_MAX;
25389 + cpu_base->softirq_activated = 1;
25390 + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25393 + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
25394 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25398 @@ -1440,13 +1664,65 @@
25399 return HRTIMER_NORESTART;
25402 -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
25403 +#ifdef CONFIG_PREEMPT_RT_FULL
25404 +static bool task_is_realtime(struct task_struct *tsk)
25406 + int policy = tsk->policy;
25408 + if (policy == SCHED_FIFO || policy == SCHED_RR)
25410 + if (policy == SCHED_DEADLINE)
25416 +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
25417 + clockid_t clock_id,
25418 + enum hrtimer_mode mode,
25419 + struct task_struct *task)
25421 +#ifdef CONFIG_PREEMPT_RT_FULL
25422 + if (!(mode & (HRTIMER_MODE_SOFT | HRTIMER_MODE_HARD))) {
25423 + if (task_is_realtime(current) || system_state != SYSTEM_RUNNING)
25424 + mode |= HRTIMER_MODE_HARD;
25426 + mode |= HRTIMER_MODE_SOFT;
25429 + __hrtimer_init(&sl->timer, clock_id, mode);
25430 sl->timer.function = hrtimer_wakeup;
25435 + * hrtimer_init_sleeper - initialize sleeper to the given clock
25436 + * @sl: sleeper to be initialized
25437 + * @clock_id: the clock to be used
25438 + * @mode: timer mode abs/rel
25439 + * @task: the task to wake up
25441 +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
25442 + enum hrtimer_mode mode, struct task_struct *task)
25444 + debug_init(&sl->timer, clock_id, mode);
25445 + __hrtimer_init_sleeper(sl, clock_id, mode, task);
25448 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
25450 +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
25451 +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
25452 + clockid_t clock_id,
25453 + enum hrtimer_mode mode,
25454 + struct task_struct *task)
25456 + debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
25457 + __hrtimer_init_sleeper(sl, clock_id, mode, task);
25459 +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
25462 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
25464 switch(restart->nanosleep.type) {
25465 @@ -1470,8 +1746,6 @@
25467 struct restart_block *restart;
25469 - hrtimer_init_sleeper(t, current);
25472 set_current_state(TASK_INTERRUPTIBLE);
25473 hrtimer_start_expires(&t->timer, mode);
25474 @@ -1508,10 +1782,9 @@
25475 struct hrtimer_sleeper t;
25478 - hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
25479 - HRTIMER_MODE_ABS);
25480 + hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
25481 + HRTIMER_MODE_ABS, current);
25482 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
25484 ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
25485 destroy_hrtimer_on_stack(&t.timer);
25487 @@ -1529,7 +1802,7 @@
25488 if (dl_task(current) || rt_task(current))
25491 - hrtimer_init_on_stack(&t.timer, clockid, mode);
25492 + hrtimer_init_sleeper_on_stack(&t, clockid, mode, current);
25493 hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
25494 ret = do_nanosleep(&t, mode);
25495 if (ret != -ERESTART_RESTARTBLOCK)
25496 @@ -1585,6 +1858,27 @@
25500 +#ifdef CONFIG_PREEMPT_RT_FULL
25502 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
25504 +void cpu_chill(void)
25506 + ktime_t chill_time;
25507 + unsigned int freeze_flag = current->flags & PF_NOFREEZE;
25509 + chill_time = ktime_set(0, NSEC_PER_MSEC);
25510 + set_current_state(TASK_UNINTERRUPTIBLE);
25511 + current->flags |= PF_NOFREEZE;
25512 + sleeping_lock_inc();
25513 + schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
25514 + sleeping_lock_dec();
25515 + if (!freeze_flag)
25516 + current->flags &= ~PF_NOFREEZE;
25518 +EXPORT_SYMBOL(cpu_chill);
25522 * Functions related to boot-time initialization:
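As a rough illustration of how the cpu_chill() helper added in the hunk above is meant to be used on PREEMPT_RT_FULL: retry loops that would otherwise busy-wait with cpu_relax() sleep for 1 ms instead, so a preempted lock holder can actually run. This is a sketch only; it assumes cpu_chill() is declared via <linux/delay.h> as in the -rt series, and my_resource/my_try_acquire() are hypothetical names, not part of the patch.

	#include <linux/delay.h>	/* cpu_chill(); typically maps to cpu_relax() on !RT */

	/* hypothetical retry loop around some busy resource */
	static void wait_until_free(struct my_resource *res)
	{
		while (!my_try_acquire(res))	/* illustrative helper */
			cpu_chill();		/* sleep 1 ms, let the holder make progress */
	}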
25524 @@ -1598,9 +1892,17 @@
25525 timerqueue_init_head(&cpu_base->clock_base[i].active);
25528 - cpu_base->active_bases = 0;
25529 cpu_base->cpu = cpu;
25530 - hrtimer_init_hres(cpu_base);
25531 + cpu_base->active_bases = 0;
25532 + cpu_base->hres_active = 0;
25533 + cpu_base->hang_detected = 0;
25534 + cpu_base->next_timer = NULL;
25535 + cpu_base->softirq_next_timer = NULL;
25536 + cpu_base->expires_next = KTIME_MAX;
25537 + cpu_base->softirq_expires_next = KTIME_MAX;
25538 +#ifdef CONFIG_PREEMPT_RT_BASE
25539 + init_waitqueue_head(&cpu_base->wait);
25544 @@ -1632,7 +1934,7 @@
25545 * sort out already expired timers and reprogram the
25548 - enqueue_hrtimer(timer, new_base);
25549 + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
25553 @@ -1644,6 +1946,12 @@
25554 BUG_ON(cpu_online(scpu));
25555 tick_cancel_sched_timer(scpu);
25558 + * This BH disable ensures that raise_softirq_irqoff() does
25559 + * not wake up ksoftirqd (and acquire the pi-lock) while
25560 + * holding the cpu_base lock
25562 + local_bh_disable();
25563 local_irq_disable();
25564 old_base = &per_cpu(hrtimer_bases, scpu);
25565 new_base = this_cpu_ptr(&hrtimer_bases);
25566 @@ -1659,12 +1967,19 @@
25567 &new_base->clock_base[i]);
25571 + * The migration might have changed the first expiring softirq
25572 + * timer on this CPU. Update it.
25574 + hrtimer_update_softirq_timer(new_base, false);
25576 raw_spin_unlock(&old_base->lock);
25577 raw_spin_unlock(&new_base->lock);
25579 /* Check, if we got expired work to do */
25580 __hrtimer_peek_ahead_timers();
25581 local_irq_enable();
25582 + local_bh_enable();
25586 @@ -1673,18 +1988,19 @@
25587 void __init hrtimers_init(void)
25589 hrtimers_prepare_cpu(smp_processor_id());
25590 + open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
25594 * schedule_hrtimeout_range_clock - sleep until timeout
25595 * @expires: timeout value (ktime_t)
25596 * @delta: slack in expires timeout (ktime_t)
25597 - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
25598 - * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
25599 + * @mode: timer mode
25600 + * @clock_id: timer clock to be used
25603 schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
25604 - const enum hrtimer_mode mode, int clock)
25605 + const enum hrtimer_mode mode, clockid_t clock_id)
25607 struct hrtimer_sleeper t;
25609 @@ -1705,11 +2021,9 @@
25613 - hrtimer_init_on_stack(&t.timer, clock, mode);
25614 + hrtimer_init_sleeper_on_stack(&t, clock_id, mode, current);
25615 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
25617 - hrtimer_init_sleeper(&t, current);
25619 hrtimer_start_expires(&t.timer, mode);
25621 if (likely(t.task))
25622 @@ -1727,7 +2041,7 @@
25623 * schedule_hrtimeout_range - sleep until timeout
25624 * @expires: timeout value (ktime_t)
25625 * @delta: slack in expires timeout (ktime_t)
25626 - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
25627 + * @mode: timer mode
25629 * Make the current task sleep until the given expiry time has
25630 * elapsed. The routine will return immediately unless
25631 @@ -1766,7 +2080,7 @@
25633 * schedule_hrtimeout - sleep until timeout
25634 * @expires: timeout value (ktime_t)
25635 - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
25636 + * @mode: timer mode
25638 * Make the current task sleep until the given expiry time has
25639 * elapsed. The routine will return immediately unless
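To illustrate the reworked sleeper API that the nanosleep/hrtimeout hunks above switch to: the sleeper is now initialized with its clock and mode in a single call, and on RT it is transparently made a hard or soft timer depending on the caller. The function below is a condensed, single-iteration sketch in the spirit of do_nanosleep(), not code from the patch.

	#include <linux/hrtimer.h>
	#include <linux/sched.h>
	#include <linux/errno.h>

	/* sleep until an absolute CLOCK_MONOTONIC expiry (simplified sketch) */
	static int sleep_until(ktime_t abs_expiry)
	{
		struct hrtimer_sleeper t;

		hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC,
					      HRTIMER_MODE_ABS, current);
		hrtimer_set_expires(&t.timer, abs_expiry);

		set_current_state(TASK_INTERRUPTIBLE);
		hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
		if (likely(t.task))
			schedule();		/* hrtimer_wakeup() clears t.task on expiry */
		hrtimer_cancel(&t.timer);
		__set_current_state(TASK_RUNNING);
		destroy_hrtimer_on_stack(&t.timer);

		return t.task ? -EINTR : 0;	/* non-NULL task => woken before expiry */
	}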
25640 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/itimer.c linux-4.14/kernel/time/itimer.c
25641 --- linux-4.14.orig/kernel/time/itimer.c 2017-11-12 19:46:13.000000000 +0100
25642 +++ linux-4.14/kernel/time/itimer.c 2018-09-05 11:05:07.000000000 +0200
25643 @@ -214,6 +214,7 @@
25644 /* We are sharing ->siglock with it_real_fn() */
25645 if (hrtimer_try_to_cancel(timer) < 0) {
25646 spin_unlock_irq(&tsk->sighand->siglock);
25647 + hrtimer_wait_for_timer(&tsk->signal->real_timer);
25650 expires = timeval_to_ktime(value->it_value);
25651 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/jiffies.c linux-4.14/kernel/time/jiffies.c
25652 --- linux-4.14.orig/kernel/time/jiffies.c 2017-11-12 19:46:13.000000000 +0100
25653 +++ linux-4.14/kernel/time/jiffies.c 2018-09-05 11:05:07.000000000 +0200
25658 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
25659 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
25660 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
25662 #if (BITS_PER_LONG < 64)
25663 u64 get_jiffies_64(void)
25668 - seq = read_seqbegin(&jiffies_lock);
25669 + seq = read_seqcount_begin(&jiffies_seq);
25671 - } while (read_seqretry(&jiffies_lock, seq));
25672 + } while (read_seqcount_retry(&jiffies_seq, seq));
25675 EXPORT_SYMBOL(get_jiffies_64);
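A sketch of the locking split introduced above: writers take the raw_spinlock_t (which never sleeps, even on RT) and bracket the update with a bare seqcount, while readers retry lock-free. The function names below are illustrative; the real write sides live in tick-common.c, tick-sched.c and timekeeping.c further down in this patch.

	#include <linux/jiffies.h>
	#include <linux/seqlock.h>
	#include "timekeeping.h"	/* jiffies_lock, jiffies_seq */

	static void jiffies_writer_sketch(void)
	{
		raw_spin_lock(&jiffies_lock);
		write_seqcount_begin(&jiffies_seq);
		/* ... advance jiffies_64 / tick_next_period ... */
		write_seqcount_end(&jiffies_seq);
		raw_spin_unlock(&jiffies_lock);
	}

	static u64 jiffies_reader_sketch(void)
	{
		unsigned int seq;
		u64 ret;

		do {
			seq = read_seqcount_begin(&jiffies_seq);
			ret = jiffies_64;
		} while (read_seqcount_retry(&jiffies_seq, seq));
		return ret;
	}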
25676 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/posix-cpu-timers.c linux-4.14/kernel/time/posix-cpu-timers.c
25677 --- linux-4.14.orig/kernel/time/posix-cpu-timers.c 2017-11-12 19:46:13.000000000 +0100
25678 +++ linux-4.14/kernel/time/posix-cpu-timers.c 2018-09-05 11:05:07.000000000 +0200
25680 * Implement CPU time clocks for the POSIX clock interface.
25683 +#include <uapi/linux/sched/types.h>
25684 #include <linux/sched/signal.h>
25685 #include <linux/sched/cputime.h>
25686 +#include <linux/sched/rt.h>
25687 #include <linux/posix-timers.h>
25688 #include <linux/errno.h>
25689 #include <linux/math64.h>
25691 #include <linux/tick.h>
25692 #include <linux/workqueue.h>
25693 #include <linux/compat.h>
25694 +#include <linux/smpboot.h>
25696 #include "posix-timers.h"
25698 @@ -603,7 +606,7 @@
25700 * Disarm any old timer after extracting its expiry time.
25702 - WARN_ON_ONCE(!irqs_disabled());
25703 + WARN_ON_ONCE_NONRT(!irqs_disabled());
25706 old_incr = timer->it.cpu.incr;
25707 @@ -1034,7 +1037,7 @@
25709 * Now re-arm for the new expiry time.
25711 - WARN_ON_ONCE(!irqs_disabled());
25712 + WARN_ON_ONCE_NONRT(!irqs_disabled());
25715 unlock_task_sighand(p, &flags);
25716 @@ -1119,13 +1122,13 @@
25717 * already updated our counts. We need to check if any timers fire now.
25718 * Interrupts are disabled.
25720 -void run_posix_cpu_timers(struct task_struct *tsk)
25721 +static void __run_posix_cpu_timers(struct task_struct *tsk)
25724 struct k_itimer *timer, *next;
25725 unsigned long flags;
25727 - WARN_ON_ONCE(!irqs_disabled());
25728 + WARN_ON_ONCE_NONRT(!irqs_disabled());
25731 * The fast path checks that there are no expired thread or thread
25732 @@ -1179,6 +1182,152 @@
25736 +#ifdef CONFIG_PREEMPT_RT_BASE
25737 +#include <linux/kthread.h>
25738 +#include <linux/cpu.h>
25739 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
25740 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
25741 +DEFINE_PER_CPU(bool, posix_timer_th_active);
25743 +static void posix_cpu_kthread_fn(unsigned int cpu)
25745 + struct task_struct *tsk = NULL;
25746 + struct task_struct *next = NULL;
25748 + BUG_ON(per_cpu(posix_timer_task, cpu) != current);
25750 + /* grab task list */
25751 + raw_local_irq_disable();
25752 + tsk = per_cpu(posix_timer_tasklist, cpu);
25753 + per_cpu(posix_timer_tasklist, cpu) = NULL;
25754 + raw_local_irq_enable();
25756 + /* it's possible the list is empty, just return */
25760 + /* Process task list */
25763 + next = tsk->posix_timer_list;
25765 + /* run the task timers, clear its ptr and
25768 + __run_posix_cpu_timers(tsk);
25769 + tsk->posix_timer_list = NULL;
25770 + put_task_struct(tsk);
25772 + /* check if this is the last on the list */
25779 +static inline int __fastpath_timer_check(struct task_struct *tsk)
25781 + /* tsk == current, ensure it is safe to use ->signal/sighand */
25782 + if (unlikely(tsk->exit_state))
25785 + if (!task_cputime_zero(&tsk->cputime_expires))
25788 + if (!task_cputime_zero(&tsk->signal->cputime_expires))
25794 +void run_posix_cpu_timers(struct task_struct *tsk)
25796 + unsigned int cpu = smp_processor_id();
25797 + struct task_struct *tasklist;
25799 + BUG_ON(!irqs_disabled());
25801 + if (per_cpu(posix_timer_th_active, cpu) != true)
25804 + /* get per-cpu references */
25805 + tasklist = per_cpu(posix_timer_tasklist, cpu);
25807 + /* check to see if we're already queued */
25808 + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
25809 + get_task_struct(tsk);
25811 + tsk->posix_timer_list = tasklist;
25814 + * The list is terminated by a self-pointing
25817 + tsk->posix_timer_list = tsk;
25819 + per_cpu(posix_timer_tasklist, cpu) = tsk;
25821 + wake_up_process(per_cpu(posix_timer_task, cpu));
25825 +static int posix_cpu_kthread_should_run(unsigned int cpu)
25827 + return __this_cpu_read(posix_timer_tasklist) != NULL;
25830 +static void posix_cpu_kthread_park(unsigned int cpu)
25832 + this_cpu_write(posix_timer_th_active, false);
25835 +static void posix_cpu_kthread_unpark(unsigned int cpu)
25837 + this_cpu_write(posix_timer_th_active, true);
25840 +static void posix_cpu_kthread_setup(unsigned int cpu)
25842 + struct sched_param sp;
25844 + sp.sched_priority = MAX_RT_PRIO - 1;
25845 + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
25846 + posix_cpu_kthread_unpark(cpu);
25849 +static struct smp_hotplug_thread posix_cpu_thread = {
25850 + .store = &posix_timer_task,
25851 + .thread_should_run = posix_cpu_kthread_should_run,
25852 + .thread_fn = posix_cpu_kthread_fn,
25853 + .thread_comm = "posixcputmr/%u",
25854 + .setup = posix_cpu_kthread_setup,
25855 + .park = posix_cpu_kthread_park,
25856 + .unpark = posix_cpu_kthread_unpark,
25859 +static int __init posix_cpu_thread_init(void)
25861 + /* Start one for boot CPU. */
25862 + unsigned long cpu;
25865 + /* init the per-cpu posix_timer_tasklists */
25866 + for_each_possible_cpu(cpu)
25867 + per_cpu(posix_timer_tasklist, cpu) = NULL;
25869 + ret = smpboot_register_percpu_thread(&posix_cpu_thread);
25874 +early_initcall(posix_cpu_thread_init);
25875 +#else /* CONFIG_PREEMPT_RT_BASE */
25876 +void run_posix_cpu_timers(struct task_struct *tsk)
25878 + __run_posix_cpu_timers(tsk);
25880 +#endif /* CONFIG_PREEMPT_RT_BASE */
25883 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
25884 * The tsk->sighand->siglock must be held by the caller.
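A sketch of the "self-pointing tail" list that the RT offload above relies on: run_posix_cpu_timers() queues the task with tsk->posix_timer_list pointing either at the previous head or at the task itself, so the per-CPU kthread can find the end of the list without a separate sentinel. process_one() below is illustrative; this condenses the walk done in posix_cpu_kthread_fn().

	static void drain_posix_timer_list(struct task_struct *tsk)
	{
		struct task_struct *next;

		while (tsk) {
			next = tsk->posix_timer_list;	/* read before clearing */
			process_one(tsk);		/* e.g. __run_posix_cpu_timers() */
			tsk->posix_timer_list = NULL;
			put_task_struct(tsk);
			/* the last entry points at itself: stop there */
			tsk = (next == tsk) ? NULL : next;
		}
	}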
25885 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/posix-timers.c linux-4.14/kernel/time/posix-timers.c
25886 --- linux-4.14.orig/kernel/time/posix-timers.c 2018-09-05 11:03:22.000000000 +0200
25887 +++ linux-4.14/kernel/time/posix-timers.c 2018-09-05 11:05:07.000000000 +0200
25888 @@ -434,6 +434,7 @@
25889 static struct pid *good_sigevent(sigevent_t * event)
25891 struct task_struct *rtn = current->group_leader;
25892 + int sig = event->sigev_signo;
25894 switch (event->sigev_notify) {
25895 case SIGEV_SIGNAL | SIGEV_THREAD_ID:
25896 @@ -443,7 +444,8 @@
25900 - if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
25901 + if (sig <= 0 || sig > SIGRTMAX ||
25902 + sig_kernel_only(sig) || sig_kernel_coredump(sig))
25906 @@ -469,7 +471,7 @@
25908 static void k_itimer_rcu_free(struct rcu_head *head)
25910 - struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
25911 + struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
25913 kmem_cache_free(posix_timers_cache, tmr);
25915 @@ -486,7 +488,7 @@
25917 put_pid(tmr->it_pid);
25918 sigqueue_free(tmr->sigq);
25919 - call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
25920 + call_rcu(&tmr->rcu, k_itimer_rcu_free);
25923 static int common_timer_create(struct k_itimer *new_timer)
25924 @@ -825,6 +827,22 @@
25925 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
25929 + * Protected by RCU!
25931 +static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timr)
25933 +#ifdef CONFIG_PREEMPT_RT_FULL
25934 + if (kc->timer_arm == common_hrtimer_arm)
25935 + hrtimer_wait_for_timer(&timr->it.real.timer);
25936 + else if (kc == &alarm_clock)
25937 + hrtimer_wait_for_timer(&timr->it.alarm.alarmtimer.timer);
25939 + /* FIXME: Whacky hack for posix-cpu-timers */
25940 + schedule_timeout(1);
25944 static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
25946 return hrtimer_try_to_cancel(&timr->it.real.timer);
25947 @@ -889,6 +907,7 @@
25953 if (WARN_ON_ONCE(!kc || !kc->timer_set))
25955 @@ -897,9 +916,12 @@
25957 unlock_timer(timr, flag);
25958 if (error == TIMER_RETRY) {
25959 + timer_wait_for_callback(kc, timr);
25960 old_spec64 = NULL; // We already got the old time...
25961 + rcu_read_unlock();
25964 + rcu_read_unlock();
25968 @@ -981,10 +1003,15 @@
25973 if (timer_delete_hook(timer) == TIMER_RETRY) {
25974 unlock_timer(timer, flags);
25975 + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25977 + rcu_read_unlock();
25980 + rcu_read_unlock();
25982 spin_lock(¤t->sighand->siglock);
25983 list_del(&timer->list);
25984 @@ -1010,8 +1037,18 @@
25986 spin_lock_irqsave(&timer->it_lock, flags);
25988 + /* On RT we can race with a deletion */
25989 + if (!timer->it_signal) {
25990 + unlock_timer(timer, flags);
25994 if (timer_delete_hook(timer) == TIMER_RETRY) {
25996 unlock_timer(timer, flags);
25997 + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25999 + rcu_read_unlock();
26002 list_del(&timer->list);
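The hunks above converge on one cancel pattern for RT: if the callback is in flight, try-to-cancel fails, the caller drops its locks, sleeps in timer_wait_for_callback() and retries rather than spinning. The helper below is an illustrative sketch of that idiom (it assumes the k_clock ->timer_try_to_cancel hook of this kernel version); it is not part of the patch.

	static void cancel_posix_timer_sync(const struct k_clock *kc,
					    struct k_itimer *timr)
	{
		unsigned long flags;

		for (;;) {
			spin_lock_irqsave(&timr->it_lock, flags);
			if (kc->timer_try_to_cancel(timr) >= 0) {
				spin_unlock_irqrestore(&timr->it_lock, flags);
				return;
			}
			spin_unlock_irqrestore(&timr->it_lock, flags);
			/* callback running: sleep until it finishes, then retry */
			timer_wait_for_callback(kc, timr);
		}
	}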
26003 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-broadcast-hrtimer.c linux-4.14/kernel/time/tick-broadcast-hrtimer.c
26004 --- linux-4.14.orig/kernel/time/tick-broadcast-hrtimer.c 2017-11-12 19:46:13.000000000 +0100
26005 +++ linux-4.14/kernel/time/tick-broadcast-hrtimer.c 2018-09-05 11:05:07.000000000 +0200
26006 @@ -106,7 +106,7 @@
26008 void tick_setup_hrtimer_broadcast(void)
26010 - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26011 + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26012 bctimer.function = bc_handler;
26013 clockevents_register_device(&ce_broadcast_hrtimer);
26015 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-common.c linux-4.14/kernel/time/tick-common.c
26016 --- linux-4.14.orig/kernel/time/tick-common.c 2017-11-12 19:46:13.000000000 +0100
26017 +++ linux-4.14/kernel/time/tick-common.c 2018-09-05 11:05:07.000000000 +0200
26018 @@ -79,13 +79,15 @@
26019 static void tick_periodic(int cpu)
26021 if (tick_do_timer_cpu == cpu) {
26022 - write_seqlock(&jiffies_lock);
26023 + raw_spin_lock(&jiffies_lock);
26024 + write_seqcount_begin(&jiffies_seq);
26026 /* Keep track of the next tick event */
26027 tick_next_period = ktime_add(tick_next_period, tick_period);
26030 - write_sequnlock(&jiffies_lock);
26031 + write_seqcount_end(&jiffies_seq);
26032 + raw_spin_unlock(&jiffies_lock);
26033 update_wall_time();
26036 @@ -157,9 +159,9 @@
26040 - seq = read_seqbegin(&jiffies_lock);
26041 + seq = read_seqcount_begin(&jiffies_seq);
26042 next = tick_next_period;
26043 - } while (read_seqretry(&jiffies_lock, seq));
26044 + } while (read_seqcount_retry(&jiffies_seq, seq));
26046 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
26048 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-internal.h linux-4.14/kernel/time/tick-internal.h
26049 --- linux-4.14.orig/kernel/time/tick-internal.h 2017-11-12 19:46:13.000000000 +0100
26050 +++ linux-4.14/kernel/time/tick-internal.h 2018-09-05 11:05:07.000000000 +0200
26051 @@ -150,16 +150,15 @@
26053 #ifdef CONFIG_NO_HZ_COMMON
26054 extern unsigned long tick_nohz_active;
26056 +extern void timers_update_nohz(void);
26057 +# ifdef CONFIG_SMP
26058 +extern struct static_key_false timers_migration_enabled;
26060 +#else /* CONFIG_NO_HZ_COMMON */
26061 +static inline void timers_update_nohz(void) { }
26062 #define tick_nohz_active (0)
26065 -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26066 -extern void timers_update_migration(bool update_nohz);
26068 -static inline void timers_update_migration(bool update_nohz) { }
26071 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26073 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
26074 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-sched.c linux-4.14/kernel/time/tick-sched.c
26075 --- linux-4.14.orig/kernel/time/tick-sched.c 2018-09-05 11:03:22.000000000 +0200
26076 +++ linux-4.14/kernel/time/tick-sched.c 2018-09-05 11:05:07.000000000 +0200
26080 /* Reevaluate with jiffies_lock held */
26081 - write_seqlock(&jiffies_lock);
26082 + raw_spin_lock(&jiffies_lock);
26083 + write_seqcount_begin(&jiffies_seq);
26085 delta = ktime_sub(now, last_jiffies_update);
26086 if (delta >= tick_period) {
26087 @@ -89,10 +90,12 @@
26088 /* Keep the tick_next_period variable up to date */
26089 tick_next_period = ktime_add(last_jiffies_update, tick_period);
26091 - write_sequnlock(&jiffies_lock);
26092 + write_seqcount_end(&jiffies_seq);
26093 + raw_spin_unlock(&jiffies_lock);
26096 - write_sequnlock(&jiffies_lock);
26097 + write_seqcount_end(&jiffies_seq);
26098 + raw_spin_unlock(&jiffies_lock);
26099 update_wall_time();
26102 @@ -103,12 +106,14 @@
26106 - write_seqlock(&jiffies_lock);
26107 + raw_spin_lock(&jiffies_lock);
26108 + write_seqcount_begin(&jiffies_seq);
26109 /* Did we start the jiffies update yet ? */
26110 if (last_jiffies_update == 0)
26111 last_jiffies_update = tick_next_period;
26112 period = last_jiffies_update;
26113 - write_sequnlock(&jiffies_lock);
26114 + write_seqcount_end(&jiffies_seq);
26115 + raw_spin_unlock(&jiffies_lock);
26119 @@ -225,6 +230,7 @@
26121 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
26122 .func = nohz_full_kick_func,
26123 + .flags = IRQ_WORK_HARD_IRQ,
26127 @@ -689,10 +695,10 @@
26129 /* Read jiffies and the time when jiffies were updated last */
26131 - seq = read_seqbegin(&jiffies_lock);
26132 + seq = read_seqcount_begin(&jiffies_seq);
26133 basemono = last_jiffies_update;
26134 basejiff = jiffies;
26135 - } while (read_seqretry(&jiffies_lock, seq));
26136 + } while (read_seqcount_retry(&jiffies_seq, seq));
26137 ts->last_jiffies = basejiff;
26140 @@ -906,14 +912,7 @@
26143 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
26144 - static int ratelimit;
26146 - if (ratelimit < 10 &&
26147 - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
26148 - pr_warn("NOHZ: local_softirq_pending %02x\n",
26149 - (unsigned int) local_softirq_pending());
26152 + softirq_check_pending_idle();
26156 @@ -1132,7 +1131,7 @@
26157 ts->nohz_mode = mode;
26158 /* One update is enough */
26159 if (!test_and_set_bit(0, &tick_nohz_active))
26160 - timers_update_migration(true);
26161 + timers_update_nohz();
26165 @@ -1250,7 +1249,7 @@
26167 * Emulate tick processing via per-CPU hrtimers:
26169 - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26170 + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26171 ts->sched_timer.function = tick_sched_timer;
26173 /* Get the next period (per-CPU) */
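What the _HARD mode annotations above mean on RT: hrtimers whose handler must run from the hard timer interrupt (the tick emulation here, the broadcast timer earlier) are initialized with HRTIMER_MODE_ABS_HARD / HRTIMER_MODE_REL_HARD so __hrtimer_init() does not move them to the new softirq bases. A hypothetical driver-side use might look like the sketch below; my_wd and its functions are not part of the patch.

	static struct hrtimer my_wd;		/* hypothetical watchdog timer */

	static enum hrtimer_restart my_wd_fn(struct hrtimer *t)
	{
		/* runs in hard interrupt context even with PREEMPT_RT_FULL */
		return HRTIMER_NORESTART;
	}

	static void my_wd_arm(void)
	{
		hrtimer_init(&my_wd, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
		my_wd.function = my_wd_fn;
		hrtimer_start(&my_wd, ms_to_ktime(10), HRTIMER_MODE_REL_HARD);
	}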
26174 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/timekeeping.c linux-4.14/kernel/time/timekeeping.c
26175 --- linux-4.14.orig/kernel/time/timekeeping.c 2017-11-12 19:46:13.000000000 +0100
26176 +++ linux-4.14/kernel/time/timekeeping.c 2018-09-05 11:05:07.000000000 +0200
26177 @@ -2326,8 +2326,10 @@
26179 void xtime_update(unsigned long ticks)
26181 - write_seqlock(&jiffies_lock);
26182 + raw_spin_lock(&jiffies_lock);
26183 + write_seqcount_begin(&jiffies_seq);
26185 - write_sequnlock(&jiffies_lock);
26186 + write_seqcount_end(&jiffies_seq);
26187 + raw_spin_unlock(&jiffies_lock);
26188 update_wall_time();
26190 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/timekeeping.h linux-4.14/kernel/time/timekeeping.h
26191 --- linux-4.14.orig/kernel/time/timekeeping.h 2017-11-12 19:46:13.000000000 +0100
26192 +++ linux-4.14/kernel/time/timekeeping.h 2018-09-05 11:05:07.000000000 +0200
26194 extern void do_timer(unsigned long ticks);
26195 extern void update_wall_time(void);
26197 -extern seqlock_t jiffies_lock;
26198 +extern raw_spinlock_t jiffies_lock;
26199 +extern seqcount_t jiffies_seq;
26201 #define CS_NAME_LEN 32
26203 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/timer.c linux-4.14/kernel/time/timer.c
26204 --- linux-4.14.orig/kernel/time/timer.c 2018-09-05 11:03:22.000000000 +0200
26205 +++ linux-4.14/kernel/time/timer.c 2018-09-05 11:05:07.000000000 +0200
26207 #include <linux/sched/debug.h>
26208 #include <linux/slab.h>
26209 #include <linux/compat.h>
26210 +#include <linux/swait.h>
26212 #include <linux/uaccess.h>
26213 #include <asm/unistd.h>
26214 @@ -197,11 +198,12 @@
26215 struct timer_base {
26216 raw_spinlock_t lock;
26217 struct timer_list *running_timer;
26218 +#ifdef CONFIG_PREEMPT_RT_FULL
26219 + struct swait_queue_head wait_for_running_timer;
26222 unsigned long next_expiry;
26224 - bool migration_enabled;
26225 - bool nohz_active;
26227 bool must_forward_clk;
26228 DECLARE_BITMAP(pending_map, WHEEL_SIZE);
26229 @@ -210,45 +212,73 @@
26231 static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
26233 -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26234 +#ifdef CONFIG_NO_HZ_COMMON
26236 +static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
26237 +static DEFINE_MUTEX(timer_keys_mutex);
26239 +static struct swork_event timer_update_swork;
26242 unsigned int sysctl_timer_migration = 1;
26244 -void timers_update_migration(bool update_nohz)
26245 +DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
26247 +static void timers_update_migration(void)
26249 bool on = sysctl_timer_migration && tick_nohz_active;
26250 - unsigned int cpu;
26252 - /* Avoid the loop, if nothing to update */
26253 - if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
26256 + static_branch_enable(&timers_migration_enabled);
26258 + static_branch_disable(&timers_migration_enabled);
26261 +static inline void timers_update_migration(void) { }
26262 +#endif /* !CONFIG_SMP */
26264 - for_each_possible_cpu(cpu) {
26265 - per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
26266 - per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
26267 - per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
26268 - if (!update_nohz)
26270 - per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
26271 - per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
26272 - per_cpu(hrtimer_bases.nohz_active, cpu) = true;
26274 +static void timer_update_keys(struct swork_event *event)
26276 + mutex_lock(&timer_keys_mutex);
26277 + timers_update_migration();
26278 + static_branch_enable(&timers_nohz_active);
26279 + mutex_unlock(&timer_keys_mutex);
26282 +void timers_update_nohz(void)
26284 + swork_queue(&timer_update_swork);
26287 +static __init int hrtimer_init_thread(void)
26289 + WARN_ON(swork_get());
26290 + INIT_SWORK(&timer_update_swork, timer_update_keys);
26293 +early_initcall(hrtimer_init_thread);
26295 int timer_migration_handler(struct ctl_table *table, int write,
26296 void __user *buffer, size_t *lenp,
26299 - static DEFINE_MUTEX(mutex);
26302 - mutex_lock(&mutex);
26303 + mutex_lock(&timer_keys_mutex);
26304 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
26306 - timers_update_migration(false);
26307 - mutex_unlock(&mutex);
26308 + timers_update_migration();
26309 + mutex_unlock(&timer_keys_mutex);
26314 +static inline bool is_timers_nohz_active(void)
26316 + return static_branch_unlikely(&timers_nohz_active);
26319 +static inline bool is_timers_nohz_active(void) { return false; }
26320 +#endif /* NO_HZ_COMMON */
26322 static unsigned long round_jiffies_common(unsigned long j, int cpu,
26324 @@ -534,7 +564,7 @@
26326 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
26328 - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
26329 + if (!is_timers_nohz_active())
26333 @@ -840,21 +870,20 @@
26334 return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
26337 -#ifdef CONFIG_NO_HZ_COMMON
26338 static inline struct timer_base *
26339 get_target_base(struct timer_base *base, unsigned tflags)
26342 - if ((tflags & TIMER_PINNED) || !base->migration_enabled)
26343 - return get_timer_this_cpu_base(tflags);
26344 - return get_timer_cpu_base(tflags, get_nohz_timer_target());
26346 - return get_timer_this_cpu_base(tflags);
26347 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26348 + if (static_branch_unlikely(&timers_migration_enabled) &&
26349 + !(tflags & TIMER_PINNED))
26350 + return get_timer_cpu_base(tflags, get_nohz_timer_target());
26352 + return get_timer_this_cpu_base(tflags);
26355 static inline void forward_timer_base(struct timer_base *base)
26357 +#ifdef CONFIG_NO_HZ_COMMON
26358 unsigned long jnow;
26361 @@ -878,16 +907,8 @@
26364 base->clk = base->next_expiry;
26367 -static inline struct timer_base *
26368 -get_target_base(struct timer_base *base, unsigned tflags)
26370 - return get_timer_this_cpu_base(tflags);
26373 -static inline void forward_timer_base(struct timer_base *base) { }
26379 @@ -1130,6 +1151,33 @@
26381 EXPORT_SYMBOL_GPL(add_timer_on);
26383 +#ifdef CONFIG_PREEMPT_RT_FULL
26385 + * Wait for a running timer
26387 +static void wait_for_running_timer(struct timer_list *timer)
26389 + struct timer_base *base;
26390 + u32 tf = timer->flags;
26392 + if (tf & TIMER_MIGRATING)
26395 + base = get_timer_base(tf);
26396 + swait_event(base->wait_for_running_timer,
26397 + base->running_timer != timer);
26400 +# define wakeup_timer_waiters(b) swake_up_all(&(b)->wait_for_running_timer)
26402 +static inline void wait_for_running_timer(struct timer_list *timer)
26407 +# define wakeup_timer_waiters(b) do { } while (0)
26411 * del_timer - deactivate a timer.
26412 * @timer: the timer to be deactivated
26413 @@ -1185,7 +1233,7 @@
26415 EXPORT_SYMBOL(try_to_del_timer_sync);
26418 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
26420 * del_timer_sync - deactivate a timer and wait for the handler to finish.
26421 * @timer: the timer to be deactivated
26422 @@ -1245,7 +1293,7 @@
26423 int ret = try_to_del_timer_sync(timer);
26427 + wait_for_running_timer(timer);
26430 EXPORT_SYMBOL(del_timer_sync);
26431 @@ -1309,13 +1357,16 @@
26432 fn = timer->function;
26433 data = timer->data;
26435 - if (timer->flags & TIMER_IRQSAFE) {
26436 + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
26437 + timer->flags & TIMER_IRQSAFE) {
26438 raw_spin_unlock(&base->lock);
26439 call_timer_fn(timer, fn, data);
26440 + base->running_timer = NULL;
26441 raw_spin_lock(&base->lock);
26443 raw_spin_unlock_irq(&base->lock);
26444 call_timer_fn(timer, fn, data);
26445 + base->running_timer = NULL;
26446 raw_spin_lock_irq(&base->lock);
26449 @@ -1584,13 +1635,13 @@
26451 /* Note: this timer irq context must be accounted for as well. */
26452 account_process_tick(p, user_tick);
26453 + scheduler_tick();
26454 run_local_timers();
26455 rcu_check_callbacks(user_tick);
26456 -#ifdef CONFIG_IRQ_WORK
26457 +#if defined(CONFIG_IRQ_WORK)
26461 - scheduler_tick();
26462 if (IS_ENABLED(CONFIG_POSIX_TIMERS))
26463 run_posix_cpu_timers(p);
26465 @@ -1617,8 +1668,8 @@
26467 expire_timers(base, heads + levels);
26469 - base->running_timer = NULL;
26470 raw_spin_unlock_irq(&base->lock);
26471 + wakeup_timer_waiters(base);
26475 @@ -1628,6 +1679,7 @@
26477 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
26479 + irq_work_tick_soft();
26481 * must_forward_clk must be cleared before running timers so that any
26482 * timer functions that call mod_timer will not try to forward the
26483 @@ -1864,6 +1916,9 @@
26485 raw_spin_lock_init(&base->lock);
26486 base->clk = jiffies;
26487 +#ifdef CONFIG_PREEMPT_RT_FULL
26488 + init_swait_queue_head(&base->wait_for_running_timer);
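The del_timer_sync() flow that results from the timer.c changes above on RT: instead of spinning until the handler returns, the caller sleeps on the base's wait_for_running_timer swait queue and is woken by wakeup_timer_waiters() once expire_timers() has cleared running_timer. The function below is a condensed sketch of that loop, not the patch code itself.

	static int del_timer_sync_rt_sketch(struct timer_list *timer)
	{
		int ret;

		for (;;) {
			ret = try_to_del_timer_sync(timer);
			if (ret >= 0)
				return ret;	/* 1: deactivated, 0: was not pending */
			/* handler in flight: swait_event() until it finishes */
			wait_for_running_timer(timer);
		}
	}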
26493 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/Kconfig linux-4.14/kernel/trace/Kconfig
26494 --- linux-4.14.orig/kernel/trace/Kconfig 2018-09-05 11:03:22.000000000 +0200
26495 +++ linux-4.14/kernel/trace/Kconfig 2018-09-05 11:05:07.000000000 +0200
26496 @@ -585,7 +585,10 @@
26497 event activity as an initial guide for further investigation
26498 using more advanced tools.
26500 - See Documentation/trace/events.txt.
26501 + Inter-event tracing of quantities such as latencies is also
26502 + supported using hist triggers under this option.
26504 + See Documentation/trace/histogram.txt.
26505 If in doubt, say N.
26507 config MMIOTRACE_TEST
26508 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/ring_buffer.c linux-4.14/kernel/trace/ring_buffer.c
26509 --- linux-4.14.orig/kernel/trace/ring_buffer.c 2018-09-05 11:03:22.000000000 +0200
26510 +++ linux-4.14/kernel/trace/ring_buffer.c 2018-09-05 11:05:07.000000000 +0200
26512 RINGBUF_TYPE_PADDING);
26513 trace_seq_printf(s, "\ttime_extend : type == %d\n",
26514 RINGBUF_TYPE_TIME_EXTEND);
26515 + trace_seq_printf(s, "\ttime_stamp : type == %d\n",
26516 + RINGBUF_TYPE_TIME_STAMP);
26517 trace_seq_printf(s, "\tdata max type_len == %d\n",
26518 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
26520 @@ -140,12 +142,15 @@
26523 RB_LEN_TIME_EXTEND = 8,
26524 - RB_LEN_TIME_STAMP = 16,
26525 + RB_LEN_TIME_STAMP = 8,
26528 #define skip_time_extend(event) \
26529 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
26531 +#define extended_time(event) \
26532 + (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
26534 static inline int rb_null_event(struct ring_buffer_event *event)
26536 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
26537 @@ -209,7 +214,7 @@
26541 - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
26542 + if (extended_time(event)) {
26543 /* time extends include the data event after it */
26544 len = RB_LEN_TIME_EXTEND;
26545 event = skip_time_extend(event);
26546 @@ -231,7 +236,7 @@
26550 - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
26551 + if (extended_time(event))
26552 event = skip_time_extend(event);
26554 length = rb_event_length(event);
26555 @@ -248,7 +253,7 @@
26556 static __always_inline void *
26557 rb_event_data(struct ring_buffer_event *event)
26559 - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
26560 + if (extended_time(event))
26561 event = skip_time_extend(event);
26562 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
26563 /* If length is in len field, then array[0] has the data */
26564 @@ -275,6 +280,27 @@
26565 #define TS_MASK ((1ULL << TS_SHIFT) - 1)
26566 #define TS_DELTA_TEST (~TS_MASK)
26569 + * ring_buffer_event_time_stamp - return the event's extended timestamp
26570 + * @event: the event to get the timestamp of
26572 + * Returns the extended timestamp associated with a data event.
26573 + * An extended time_stamp is a 64-bit timestamp represented
26574 + * internally in a special way that makes the best use of space
26575 + * contained within a ring buffer event. This function decodes
26576 + * it and maps it to a straight u64 value.
26578 +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
26582 + ts = event->array[0];
26584 + ts += event->time_delta;
26589 /* Flag when events were overwritten */
26590 #define RB_MISSED_EVENTS (1 << 31)
26591 /* Missed count stored at end */
26592 @@ -451,6 +477,7 @@
26593 struct buffer_page *reader_page;
26594 unsigned long lost_events;
26595 unsigned long last_overrun;
26596 + unsigned long nest;
26597 local_t entries_bytes;
26600 @@ -488,6 +515,7 @@
26601 u64 (*clock)(void);
26603 struct rb_irq_work irq_work;
26604 + bool time_stamp_abs;
26607 struct ring_buffer_iter {
26608 @@ -1387,6 +1415,16 @@
26609 buffer->clock = clock;
26612 +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
26614 + buffer->time_stamp_abs = abs;
26617 +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
26619 + return buffer->time_stamp_abs;
26622 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
26624 static inline unsigned long rb_page_entries(struct buffer_page *bpage)
26625 @@ -2217,12 +2255,15 @@
26627 /* Slow path, do not inline */
26628 static noinline struct ring_buffer_event *
26629 -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
26630 +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
26632 - event->type_len = RINGBUF_TYPE_TIME_EXTEND;
26634 + event->type_len = RINGBUF_TYPE_TIME_STAMP;
26636 + event->type_len = RINGBUF_TYPE_TIME_EXTEND;
26638 - /* Not the first event on the page? */
26639 - if (rb_event_index(event)) {
26640 + /* Not the first event on the page, or not delta? */
26641 + if (abs || rb_event_index(event)) {
26642 event->time_delta = delta & TS_MASK;
26643 event->array[0] = delta >> TS_SHIFT;
26645 @@ -2265,7 +2306,9 @@
26646 * add it to the start of the reserved space.
26648 if (unlikely(info->add_timestamp)) {
26649 - event = rb_add_time_stamp(event, delta);
26650 + bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
26652 + event = rb_add_time_stamp(event, info->delta, abs);
26653 length -= RB_LEN_TIME_EXTEND;
26656 @@ -2453,7 +2496,7 @@
26658 static inline void rb_event_discard(struct ring_buffer_event *event)
26660 - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
26661 + if (extended_time(event))
26662 event = skip_time_extend(event);
26664 /* array[0] holds the actual length for the discarded event */
26665 @@ -2497,10 +2540,11 @@
26666 cpu_buffer->write_stamp =
26667 cpu_buffer->commit_page->page->time_stamp;
26668 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
26669 - delta = event->array[0];
26670 - delta <<= TS_SHIFT;
26671 - delta += event->time_delta;
26672 + delta = ring_buffer_event_time_stamp(event);
26673 cpu_buffer->write_stamp += delta;
26674 + } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
26675 + delta = ring_buffer_event_time_stamp(event);
26676 + cpu_buffer->write_stamp = delta;
26678 cpu_buffer->write_stamp += event->time_delta;
26680 @@ -2583,22 +2627,19 @@
26681 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
26683 unsigned int val = cpu_buffer->current_context;
26684 + unsigned long pc = preempt_count();
26687 - if (in_interrupt()) {
26689 - bit = RB_CTX_NMI;
26690 - else if (in_irq())
26691 - bit = RB_CTX_IRQ;
26693 - bit = RB_CTX_SOFTIRQ;
26695 + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
26696 bit = RB_CTX_NORMAL;
26698 + bit = pc & NMI_MASK ? RB_CTX_NMI :
26699 + pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
26701 - if (unlikely(val & (1 << bit)))
26702 + if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
26705 - val |= (1 << bit);
26706 + val |= (1 << (bit + cpu_buffer->nest));
26707 cpu_buffer->current_context = val;
26710 @@ -2607,7 +2648,57 @@
26711 static __always_inline void
26712 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
26714 - cpu_buffer->current_context &= cpu_buffer->current_context - 1;
26715 + cpu_buffer->current_context &=
26716 + cpu_buffer->current_context - (1 << cpu_buffer->nest);
26719 +/* The recursive locking above uses 4 bits */
26720 +#define NESTED_BITS 4
26723 + * ring_buffer_nest_start - Allow tracing while nested
26724 + * @buffer: The ring buffer to modify
26726 + * The ring buffer has a safety mechanism to prevent recursion.
26727 + * But there may be a case where a trace needs to be done while
26728 + * tracing something else. In this case, calling this function
26729 + * will allow another trace to nest within a currently active
26730 + * ring_buffer_lock_reserve().
26732 + * Call this function before calling another ring_buffer_lock_reserve() and
26733 + * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
26735 +void ring_buffer_nest_start(struct ring_buffer *buffer)
26737 + struct ring_buffer_per_cpu *cpu_buffer;
26740 + /* Enabled by ring_buffer_nest_end() */
26741 + preempt_disable_notrace();
26742 + cpu = raw_smp_processor_id();
26743 + cpu_buffer = buffer->buffers[cpu];
26744 + /* This is the shift value for the above recursive locking */
26745 + cpu_buffer->nest += NESTED_BITS;
26749 + * ring_buffer_nest_end - End the nested tracing section
26750 + * @buffer: The ring buffer to modify
26752 + * Must be called after ring_buffer_nest_start() and after the
26753 + * ring_buffer_unlock_commit().
26755 +void ring_buffer_nest_end(struct ring_buffer *buffer)
26757 + struct ring_buffer_per_cpu *cpu_buffer;
26760 + /* disabled by ring_buffer_nest_start() */
26761 + cpu = raw_smp_processor_id();
26762 + cpu_buffer = buffer->buffers[cpu];
26763 + /* This is the shift value for the above recursive locking */
26764 + cpu_buffer->nest -= NESTED_BITS;
26765 + preempt_enable_notrace();
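A usage sketch (not part of the patch) of the nesting API documented above; the inner event length and payload are placeholders. The synthetic-event probe added later in this patch, trace_event_raw_event_synth(), follows the same pattern around trace_event_buffer_reserve()/trace_event_buffer_commit().

#include <linux/ring_buffer.h>

/* Illustrative only: emit one event from inside another event's
 * reserve/commit window by shifting the recursion-check bits first. */
static void write_nested_event(struct ring_buffer *buffer, unsigned long inner_len)
{
	struct ring_buffer_event *event;

	ring_buffer_nest_start(buffer);
	event = ring_buffer_lock_reserve(buffer, inner_len);
	if (event) {
		/* fill ring_buffer_event_data(event) here */
		ring_buffer_unlock_commit(buffer, event);
	}
	ring_buffer_nest_end(buffer);	/* undoes the shift, re-enables preemption */
}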
26769 @@ -2683,7 +2774,7 @@
26770 * If this is the first commit on the page, then it has the same
26771 * timestamp as the page itself.
26774 + if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
26777 /* See if we shot pass the end of this buffer page */
26778 @@ -2760,8 +2851,11 @@
26779 /* make sure this diff is calculated here */
26782 - /* Did the write stamp get updated already? */
26783 - if (likely(info.ts >= cpu_buffer->write_stamp)) {
26784 + if (ring_buffer_time_stamp_abs(buffer)) {
26785 + info.delta = info.ts;
26786 + rb_handle_timestamp(cpu_buffer, &info);
26787 + } else /* Did the write stamp get updated already? */
26788 + if (likely(info.ts >= cpu_buffer->write_stamp)) {
26790 if (unlikely(test_time_stamp(info.delta)))
26791 rb_handle_timestamp(cpu_buffer, &info);
26792 @@ -3459,14 +3553,13 @@
26795 case RINGBUF_TYPE_TIME_EXTEND:
26796 - delta = event->array[0];
26797 - delta <<= TS_SHIFT;
26798 - delta += event->time_delta;
26799 + delta = ring_buffer_event_time_stamp(event);
26800 cpu_buffer->read_stamp += delta;
26803 case RINGBUF_TYPE_TIME_STAMP:
26804 - /* FIXME: not implemented */
26805 + delta = ring_buffer_event_time_stamp(event);
26806 + cpu_buffer->read_stamp = delta;
26809 case RINGBUF_TYPE_DATA:
26810 @@ -3490,14 +3583,13 @@
26813 case RINGBUF_TYPE_TIME_EXTEND:
26814 - delta = event->array[0];
26815 - delta <<= TS_SHIFT;
26816 - delta += event->time_delta;
26817 + delta = ring_buffer_event_time_stamp(event);
26818 iter->read_stamp += delta;
26821 case RINGBUF_TYPE_TIME_STAMP:
26822 - /* FIXME: not implemented */
26823 + delta = ring_buffer_event_time_stamp(event);
26824 + iter->read_stamp = delta;
26827 case RINGBUF_TYPE_DATA:
26828 @@ -3721,6 +3813,8 @@
26829 struct buffer_page *reader;
26836 * We repeat when a time extend is encountered.
26837 @@ -3757,12 +3851,17 @@
26840 case RINGBUF_TYPE_TIME_STAMP:
26841 - /* FIXME: not implemented */
26843 + *ts = ring_buffer_event_time_stamp(event);
26844 + ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
26845 + cpu_buffer->cpu, ts);
26847 + /* Internal data, OK to advance */
26848 rb_advance_reader(cpu_buffer);
26851 case RINGBUF_TYPE_DATA:
26853 + if (ts && !(*ts)) {
26854 *ts = cpu_buffer->read_stamp + event->time_delta;
26855 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
26856 cpu_buffer->cpu, ts);
26857 @@ -3787,6 +3886,9 @@
26858 struct ring_buffer_event *event;
26864 cpu_buffer = iter->cpu_buffer;
26865 buffer = cpu_buffer->buffer;
26867 @@ -3839,12 +3941,17 @@
26870 case RINGBUF_TYPE_TIME_STAMP:
26871 - /* FIXME: not implemented */
26873 + *ts = ring_buffer_event_time_stamp(event);
26874 + ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
26875 + cpu_buffer->cpu, ts);
26877 + /* Internal data, OK to advance */
26878 rb_advance_iter(iter);
26881 case RINGBUF_TYPE_DATA:
26883 + if (ts && !(*ts)) {
26884 *ts = iter->read_stamp + event->time_delta;
26885 ring_buffer_normalize_time_stamp(buffer,
26886 cpu_buffer->cpu, ts);
26887 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace.c linux-4.14/kernel/trace/trace.c
26888 --- linux-4.14.orig/kernel/trace/trace.c 2018-09-05 11:03:22.000000000 +0200
26889 +++ linux-4.14/kernel/trace/trace.c 2018-09-05 11:05:07.000000000 +0200
26890 @@ -1170,6 +1170,14 @@
26894 +bool trace_clock_in_ns(struct trace_array *tr)
26896 + if (trace_clocks[tr->clock_id].in_ns)
26903 * trace_parser_get_init - gets the buffer for trace parser
26905 @@ -2127,6 +2135,7 @@
26906 struct task_struct *tsk = current;
26908 entry->preempt_count = pc & 0xff;
26909 + entry->preempt_lazy_count = preempt_lazy_count();
26910 entry->pid = (tsk) ? tsk->pid : 0;
26912 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
26913 @@ -2137,8 +2146,11 @@
26914 ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
26915 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
26916 ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
26917 - (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
26918 + (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
26919 + (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
26920 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
26922 + entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
26924 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
26926 @@ -2275,7 +2287,7 @@
26928 *current_rb = trace_file->tr->trace_buffer.buffer;
26930 - if ((trace_file->flags &
26931 + if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
26932 (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
26933 (entry = this_cpu_read(trace_buffered_event))) {
26934 /* Try to use the per cpu buffer first */
26935 @@ -3342,14 +3354,17 @@
26937 static void print_lat_help_header(struct seq_file *m)
26939 - seq_puts(m, "# _------=> CPU# \n"
26940 - "# / _-----=> irqs-off \n"
26941 - "# | / _----=> need-resched \n"
26942 - "# || / _---=> hardirq/softirq \n"
26943 - "# ||| / _--=> preempt-depth \n"
26944 - "# |||| / delay \n"
26945 - "# cmd pid ||||| time | caller \n"
26946 - "# \\ / ||||| \\ | / \n");
26947 + seq_puts(m, "# _--------=> CPU# \n"
26948 + "# / _-------=> irqs-off \n"
26949 + "# | / _------=> need-resched \n"
26950 + "# || / _-----=> need-resched_lazy \n"
26951 + "# ||| / _----=> hardirq/softirq \n"
26952 + "# |||| / _---=> preempt-depth \n"
26953 + "# ||||| / _--=> preempt-lazy-depth\n"
26954 + "# |||||| / _-=> migrate-disable \n"
26955 + "# ||||||| / delay \n"
26956 + "# cmd pid |||||||| time | caller \n"
26957 + "# \\ / |||||||| \\ | / \n");
26960 static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
26961 @@ -3385,15 +3400,17 @@
26962 tgid ? tgid_space : space);
26963 seq_printf(m, "# %s / _----=> need-resched\n",
26964 tgid ? tgid_space : space);
26965 - seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
26966 + seq_printf(m, "# %s| / _----=> need-resched_lazy\n",
26967 tgid ? tgid_space : space);
26968 - seq_printf(m, "# %s|| / _--=> preempt-depth\n",
26969 + seq_printf(m, "# %s|| / _---=> hardirq/softirq\n",
26970 tgid ? tgid_space : space);
26971 - seq_printf(m, "# %s||| / delay\n",
26972 + seq_printf(m, "# %s||| / _--=> preempt-depth\n",
26973 tgid ? tgid_space : space);
26974 - seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
26975 + seq_printf(m, "# %s|||| / delay\n",
26976 + tgid ? tgid_space : space);
26977 + seq_printf(m, "# TASK-PID %sCPU# ||||| TIMESTAMP FUNCTION\n",
26978 tgid ? " TGID " : space);
26979 - seq_printf(m, "# | | %s | |||| | |\n",
26980 + seq_printf(m, "# | | %s | ||||| | |\n",
26981 tgid ? " | " : space);
26984 @@ -4531,6 +4548,9 @@
26985 #ifdef CONFIG_X86_64
26986 " x86-tsc: TSC cycle counter\n"
26988 + "\n timestamp_mode\t- view the mode used to timestamp events\n"
26989 + " delta: Delta difference against a buffer-wide timestamp\n"
26990 + " absolute: Absolute (standalone) timestamp\n"
26991 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
26992 "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
26993 " tracing_cpumask\t- Limit which CPUs to trace\n"
26994 @@ -4707,8 +4727,9 @@
26995 "\t .sym display an address as a symbol\n"
26996 "\t .sym-offset display an address as a symbol and offset\n"
26997 "\t .execname display a common_pid as a program name\n"
26998 - "\t .syscall display a syscall id as a syscall name\n\n"
26999 - "\t .log2 display log2 value rather than raw number\n\n"
27000 + "\t .syscall display a syscall id as a syscall name\n"
27001 + "\t .log2 display log2 value rather than raw number\n"
27002 + "\t .usecs display a common_timestamp in microseconds\n\n"
27003 "\t The 'pause' parameter can be used to pause an existing hist\n"
27004 "\t trigger or to start a hist trigger but not log any events\n"
27005 "\t until told to do so. 'continue' can be used to start or\n"
27006 @@ -6218,7 +6239,7 @@
27010 -static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27011 +int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27015 @@ -6298,6 +6319,71 @@
27019 +static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
27021 + struct trace_array *tr = m->private;
27023 + mutex_lock(&trace_types_lock);
27025 + if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
27026 + seq_puts(m, "delta [absolute]\n");
27028 + seq_puts(m, "[delta] absolute\n");
27030 + mutex_unlock(&trace_types_lock);
27035 +static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
27037 + struct trace_array *tr = inode->i_private;
27040 + if (tracing_disabled)
27043 + if (trace_array_get(tr))
27046 + ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
27048 + trace_array_put(tr);
27053 +int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
27057 + mutex_lock(&trace_types_lock);
27059 + if (abs && tr->time_stamp_abs_ref++)
27063 + if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
27068 + if (--tr->time_stamp_abs_ref)
27072 + ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
27074 +#ifdef CONFIG_TRACER_MAX_TRACE
27075 + if (tr->max_buffer.buffer)
27076 + ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
27079 + mutex_unlock(&trace_types_lock);
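A short sketch (an assumption, not taken from the patch) of how a consumer such as a hist trigger using common_timestamp would pair the refcounted helper above: only the first true call switches the buffers to absolute timestamps, and only the last false call switches them back to delta mode.

/* Hypothetical caller; "tr" is assumed to be a valid trace_array. */
static int my_feature_enable(struct trace_array *tr)
{
	return tracing_set_time_stamp_abs(tr, true);	/* takes a reference */
}

static void my_feature_disable(struct trace_array *tr)
{
	tracing_set_time_stamp_abs(tr, false);		/* drops the reference */
}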
27084 struct ftrace_buffer_info {
27085 struct trace_iterator iter;
27087 @@ -6545,6 +6631,13 @@
27088 .write = tracing_clock_write,
27091 +static const struct file_operations trace_time_stamp_mode_fops = {
27092 + .open = tracing_time_stamp_mode_open,
27093 + .read = seq_read,
27094 + .llseek = seq_lseek,
27095 + .release = tracing_single_release_tr,
27098 #ifdef CONFIG_TRACER_SNAPSHOT
27099 static const struct file_operations snapshot_fops = {
27100 .open = tracing_snapshot_open,
27101 @@ -7682,6 +7775,7 @@
27102 struct trace_array *tr;
27105 + mutex_lock(&event_mutex);
27106 mutex_lock(&trace_types_lock);
27109 @@ -7714,6 +7808,7 @@
27111 INIT_LIST_HEAD(&tr->systems);
27112 INIT_LIST_HEAD(&tr->events);
27113 + INIT_LIST_HEAD(&tr->hist_vars);
27115 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
27117 @@ -7737,6 +7832,7 @@
27118 list_add(&tr->list, &ftrace_trace_arrays);
27120 mutex_unlock(&trace_types_lock);
27121 + mutex_unlock(&event_mutex);
27125 @@ -7748,6 +7844,7 @@
27128 mutex_unlock(&trace_types_lock);
27129 + mutex_unlock(&event_mutex);
27133 @@ -7760,6 +7857,7 @@
27137 + mutex_lock(&event_mutex);
27138 mutex_lock(&trace_types_lock);
27141 @@ -7805,6 +7903,7 @@
27144 mutex_unlock(&trace_types_lock);
27145 + mutex_unlock(&event_mutex);
27149 @@ -7862,6 +7961,9 @@
27150 trace_create_file("tracing_on", 0644, d_tracer,
27151 tr, &rb_simple_fops);
27153 + trace_create_file("timestamp_mode", 0444, d_tracer, tr,
27154 + &trace_time_stamp_mode_fops);
27156 create_trace_options_dir(tr);
27158 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
27159 @@ -8271,6 +8373,92 @@
27161 EXPORT_SYMBOL_GPL(ftrace_dump);
27163 +int trace_run_command(const char *buf, int (*createfn)(int, char **))
27170 + argv = argv_split(GFP_KERNEL, buf, &argc);
27175 + ret = createfn(argc, argv);
27182 +#define WRITE_BUFSIZE 4096
27184 +ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
27185 + size_t count, loff_t *ppos,
27186 + int (*createfn)(int, char **))
27188 + char *kbuf, *buf, *tmp;
27193 + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
27197 + while (done < count) {
27198 + size = count - done;
27200 + if (size >= WRITE_BUFSIZE)
27201 + size = WRITE_BUFSIZE - 1;
27203 + if (copy_from_user(kbuf, buffer + done, size)) {
27207 + kbuf[size] = '\0';
27210 + tmp = strchr(buf, '\n');
27213 + size = tmp - buf + 1;
27215 + size = strlen(buf);
27216 + if (done + size < count) {
27219 + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
27220 + pr_warn("Line length is too long: Should be less than %d\n",
27221 + WRITE_BUFSIZE - 2);
27228 + /* Remove comments */
27229 + tmp = strchr(buf, '#');
27234 + ret = trace_run_command(buf, createfn);
27239 + } while (done < count);
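trace_parse_run_command() buffers the user write, splits it into newline-terminated lines, strips '#' comments, and hands each line to the supplied callback as an argc/argv pair via trace_run_command(). A minimal sketch of such a callback follows (the name and behaviour are hypothetical); synth_events_write() further down wires this same mechanism to create_synth_event().

/* Hypothetical command handler matching the createfn signature above:
 * argv[0] is the first token of the parsed line. */
static int example_createfn(int argc, char **argv)
{
	if (argc < 1)
		return -EINVAL;

	pr_info("example cmd: %s (%d extra args)\n", argv[0], argc - 1);
	return 0;
}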
27249 __init static int tracer_alloc_buffers(void)
27252 @@ -8371,6 +8559,7 @@
27254 INIT_LIST_HEAD(&global_trace.systems);
27255 INIT_LIST_HEAD(&global_trace.events);
27256 + INIT_LIST_HEAD(&global_trace.hist_vars);
27257 list_add(&global_trace.list, &ftrace_trace_arrays);
27259 apply_trace_boot_options();
27260 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_events.c linux-4.14/kernel/trace/trace_events.c
27261 --- linux-4.14.orig/kernel/trace/trace_events.c 2018-09-05 11:03:22.000000000 +0200
27262 +++ linux-4.14/kernel/trace/trace_events.c 2018-09-05 11:05:07.000000000 +0200
27263 @@ -187,6 +187,8 @@
27264 __common_field(unsigned char, flags);
27265 __common_field(unsigned char, preempt_count);
27266 __common_field(int, pid);
27267 + __common_field(unsigned short, migrate_disable);
27268 + __common_field(unsigned short, padding);
27272 @@ -1406,8 +1408,8 @@
27275 /* Make sure the system still exists */
27276 - mutex_lock(&trace_types_lock);
27277 mutex_lock(&event_mutex);
27278 + mutex_lock(&trace_types_lock);
27279 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
27280 list_for_each_entry(dir, &tr->systems, list) {
27281 if (dir == inode->i_private) {
27282 @@ -1421,8 +1423,8 @@
27286 - mutex_unlock(&event_mutex);
27287 mutex_unlock(&trace_types_lock);
27288 + mutex_unlock(&event_mutex);
27292 @@ -2308,15 +2310,15 @@
27293 int trace_add_event_call(struct trace_event_call *call)
27296 - mutex_lock(&trace_types_lock);
27297 mutex_lock(&event_mutex);
27298 + mutex_lock(&trace_types_lock);
27300 ret = __register_event(call, NULL);
27302 __add_event_to_tracers(call);
27304 - mutex_unlock(&event_mutex);
27305 mutex_unlock(&trace_types_lock);
27306 + mutex_unlock(&event_mutex);
27310 @@ -2370,13 +2372,13 @@
27314 - mutex_lock(&trace_types_lock);
27315 mutex_lock(&event_mutex);
27316 + mutex_lock(&trace_types_lock);
27317 down_write(&trace_event_sem);
27318 ret = probe_remove_event_call(call);
27319 up_write(&trace_event_sem);
27320 - mutex_unlock(&event_mutex);
27321 mutex_unlock(&trace_types_lock);
27322 + mutex_unlock(&event_mutex);
27326 @@ -2438,8 +2440,8 @@
27328 struct module *mod = data;
27330 - mutex_lock(&trace_types_lock);
27331 mutex_lock(&event_mutex);
27332 + mutex_lock(&trace_types_lock);
27334 case MODULE_STATE_COMING:
27335 trace_module_add_events(mod);
27336 @@ -2448,8 +2450,8 @@
27337 trace_module_remove_events(mod);
27340 - mutex_unlock(&event_mutex);
27341 mutex_unlock(&trace_types_lock);
27342 + mutex_unlock(&event_mutex);
27346 @@ -2964,24 +2966,24 @@
27347 * creates the event hierarchy in the @parent/events directory.
27349 * Returns 0 on success.
27351 + * Must be called with event_mutex held.
27353 int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
27357 - mutex_lock(&event_mutex);
27358 + lockdep_assert_held(&event_mutex);
27360 ret = create_event_toplevel_files(parent, tr);
27365 down_write(&trace_event_sem);
27366 __trace_add_event_dirs(tr);
27367 up_write(&trace_event_sem);
27370 - mutex_unlock(&event_mutex);
27376 @@ -3010,9 +3012,10 @@
27380 +/* Must be called with event_mutex held */
27381 int event_trace_del_tracer(struct trace_array *tr)
27383 - mutex_lock(&event_mutex);
27384 + lockdep_assert_held(&event_mutex);
27386 /* Disable any event triggers and associated soft-disabled events */
27387 clear_event_triggers(tr);
27388 @@ -3033,8 +3036,6 @@
27390 tr->event_dir = NULL;
27392 - mutex_unlock(&event_mutex);
27397 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_events_hist.c linux-4.14/kernel/trace/trace_events_hist.c
27398 --- linux-4.14.orig/kernel/trace/trace_events_hist.c 2018-09-05 11:03:22.000000000 +0200
27399 +++ linux-4.14/kernel/trace/trace_events_hist.c 2018-09-05 11:05:07.000000000 +0200
27400 @@ -20,13 +20,39 @@
27401 #include <linux/slab.h>
27402 #include <linux/stacktrace.h>
27403 #include <linux/rculist.h>
27404 +#include <linux/tracefs.h>
27406 #include "tracing_map.h"
27409 +#define SYNTH_SYSTEM "synthetic"
27410 +#define SYNTH_FIELDS_MAX 16
27412 +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
27416 -typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
27417 +typedef u64 (*hist_field_fn_t) (struct hist_field *field,
27418 + struct tracing_map_elt *elt,
27419 + struct ring_buffer_event *rbe,
27422 +#define HIST_FIELD_OPERANDS_MAX 2
27423 +#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
27424 +#define HIST_ACTIONS_MAX 8
27426 +enum field_op_id {
27430 + FIELD_OP_UNARY_MINUS,
27435 + struct hist_trigger_data *hist_data;
27436 + unsigned int idx;
27439 struct hist_field {
27440 struct ftrace_event_field *field;
27441 @@ -34,26 +60,50 @@
27442 hist_field_fn_t fn;
27444 unsigned int offset;
27445 + unsigned int is_signed;
27446 + const char *type;
27447 + struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
27448 + struct hist_trigger_data *hist_data;
27449 + struct hist_var var;
27450 + enum field_op_id operator;
27452 + char *event_name;
27454 + unsigned int var_idx;
27455 + unsigned int var_ref_idx;
27459 -static u64 hist_field_none(struct hist_field *field, void *event)
27460 +static u64 hist_field_none(struct hist_field *field,
27461 + struct tracing_map_elt *elt,
27462 + struct ring_buffer_event *rbe,
27468 -static u64 hist_field_counter(struct hist_field *field, void *event)
27469 +static u64 hist_field_counter(struct hist_field *field,
27470 + struct tracing_map_elt *elt,
27471 + struct ring_buffer_event *rbe,
27477 -static u64 hist_field_string(struct hist_field *hist_field, void *event)
27478 +static u64 hist_field_string(struct hist_field *hist_field,
27479 + struct tracing_map_elt *elt,
27480 + struct ring_buffer_event *rbe,
27483 char *addr = (char *)(event + hist_field->field->offset);
27485 return (u64)(unsigned long)addr;
27488 -static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
27489 +static u64 hist_field_dynstring(struct hist_field *hist_field,
27490 + struct tracing_map_elt *elt,
27491 + struct ring_buffer_event *rbe,
27494 u32 str_item = *(u32 *)(event + hist_field->field->offset);
27495 int str_loc = str_item & 0xffff;
27496 @@ -62,22 +112,74 @@
27497 return (u64)(unsigned long)addr;
27500 -static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
27501 +static u64 hist_field_pstring(struct hist_field *hist_field,
27502 + struct tracing_map_elt *elt,
27503 + struct ring_buffer_event *rbe,
27506 char **addr = (char **)(event + hist_field->field->offset);
27508 return (u64)(unsigned long)*addr;
27511 -static u64 hist_field_log2(struct hist_field *hist_field, void *event)
27512 +static u64 hist_field_log2(struct hist_field *hist_field,
27513 + struct tracing_map_elt *elt,
27514 + struct ring_buffer_event *rbe,
27517 - u64 val = *(u64 *)(event + hist_field->field->offset);
27518 + struct hist_field *operand = hist_field->operands[0];
27520 + u64 val = operand->fn(operand, elt, rbe, event);
27522 return (u64) ilog2(roundup_pow_of_two(val));
27525 +static u64 hist_field_plus(struct hist_field *hist_field,
27526 + struct tracing_map_elt *elt,
27527 + struct ring_buffer_event *rbe,
27530 + struct hist_field *operand1 = hist_field->operands[0];
27531 + struct hist_field *operand2 = hist_field->operands[1];
27533 + u64 val1 = operand1->fn(operand1, elt, rbe, event);
27534 + u64 val2 = operand2->fn(operand2, elt, rbe, event);
27536 + return val1 + val2;
27539 +static u64 hist_field_minus(struct hist_field *hist_field,
27540 + struct tracing_map_elt *elt,
27541 + struct ring_buffer_event *rbe,
27544 + struct hist_field *operand1 = hist_field->operands[0];
27545 + struct hist_field *operand2 = hist_field->operands[1];
27547 + u64 val1 = operand1->fn(operand1, elt, rbe, event);
27548 + u64 val2 = operand2->fn(operand2, elt, rbe, event);
27550 + return val1 - val2;
27553 +static u64 hist_field_unary_minus(struct hist_field *hist_field,
27554 + struct tracing_map_elt *elt,
27555 + struct ring_buffer_event *rbe,
27558 + struct hist_field *operand = hist_field->operands[0];
27560 + s64 sval = (s64)operand->fn(operand, elt, rbe, event);
27561 + u64 val = (u64)-sval;
27566 #define DEFINE_HIST_FIELD_FN(type) \
27567 -static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
27568 + static u64 hist_field_##type(struct hist_field *hist_field, \
27569 + struct tracing_map_elt *elt, \
27570 + struct ring_buffer_event *rbe, \
27573 type *addr = (type *)(event + hist_field->field->offset); \
27575 @@ -110,16 +212,29 @@
27576 #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
27578 enum hist_field_flags {
27579 - HIST_FIELD_FL_HITCOUNT = 1,
27580 - HIST_FIELD_FL_KEY = 2,
27581 - HIST_FIELD_FL_STRING = 4,
27582 - HIST_FIELD_FL_HEX = 8,
27583 - HIST_FIELD_FL_SYM = 16,
27584 - HIST_FIELD_FL_SYM_OFFSET = 32,
27585 - HIST_FIELD_FL_EXECNAME = 64,
27586 - HIST_FIELD_FL_SYSCALL = 128,
27587 - HIST_FIELD_FL_STACKTRACE = 256,
27588 - HIST_FIELD_FL_LOG2 = 512,
27589 + HIST_FIELD_FL_HITCOUNT = 1 << 0,
27590 + HIST_FIELD_FL_KEY = 1 << 1,
27591 + HIST_FIELD_FL_STRING = 1 << 2,
27592 + HIST_FIELD_FL_HEX = 1 << 3,
27593 + HIST_FIELD_FL_SYM = 1 << 4,
27594 + HIST_FIELD_FL_SYM_OFFSET = 1 << 5,
27595 + HIST_FIELD_FL_EXECNAME = 1 << 6,
27596 + HIST_FIELD_FL_SYSCALL = 1 << 7,
27597 + HIST_FIELD_FL_STACKTRACE = 1 << 8,
27598 + HIST_FIELD_FL_LOG2 = 1 << 9,
27599 + HIST_FIELD_FL_TIMESTAMP = 1 << 10,
27600 + HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11,
27601 + HIST_FIELD_FL_VAR = 1 << 12,
27602 + HIST_FIELD_FL_EXPR = 1 << 13,
27603 + HIST_FIELD_FL_VAR_REF = 1 << 14,
27604 + HIST_FIELD_FL_CPU = 1 << 15,
27605 + HIST_FIELD_FL_ALIAS = 1 << 16,
27609 + unsigned int n_vars;
27610 + char *name[TRACING_MAP_VARS_MAX];
27611 + char *expr[TRACING_MAP_VARS_MAX];
27614 struct hist_trigger_attrs {
27615 @@ -127,25 +242,1474 @@
27617 char *sort_key_str;
27623 + bool ts_in_usecs;
27624 unsigned int map_bits;
27626 + char *assignment_str[TRACING_MAP_VARS_MAX];
27627 + unsigned int n_assignments;
27629 + char *action_str[HIST_ACTIONS_MAX];
27630 + unsigned int n_actions;
27632 + struct var_defs var_defs;
27635 +struct field_var {
27636 + struct hist_field *var;
27637 + struct hist_field *val;
27640 +struct field_var_hist {
27641 + struct hist_trigger_data *hist_data;
27645 struct hist_trigger_data {
27646 - struct hist_field *fields[TRACING_MAP_FIELDS_MAX];
27647 + struct hist_field *fields[HIST_FIELDS_MAX];
27648 unsigned int n_vals;
27649 unsigned int n_keys;
27650 unsigned int n_fields;
27651 + unsigned int n_vars;
27652 unsigned int key_size;
27653 struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
27654 unsigned int n_sort_keys;
27655 struct trace_event_file *event_file;
27656 struct hist_trigger_attrs *attrs;
27657 struct tracing_map *map;
27658 + bool enable_timestamps;
27660 + struct hist_field *var_refs[TRACING_MAP_VARS_MAX];
27661 + unsigned int n_var_refs;
27663 + struct action_data *actions[HIST_ACTIONS_MAX];
27664 + unsigned int n_actions;
27666 + struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX];
27667 + unsigned int n_synth_var_refs;
27668 + struct field_var *field_vars[SYNTH_FIELDS_MAX];
27669 + unsigned int n_field_vars;
27670 + unsigned int n_field_var_str;
27671 + struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
27672 + unsigned int n_field_var_hists;
27674 + struct field_var *max_vars[SYNTH_FIELDS_MAX];
27675 + unsigned int n_max_vars;
27676 + unsigned int n_max_var_str;
27679 +struct synth_field {
27687 +struct synth_event {
27688 + struct list_head list;
27691 + struct synth_field **fields;
27692 + unsigned int n_fields;
27693 + unsigned int n_u64;
27694 + struct trace_event_class class;
27695 + struct trace_event_call call;
27696 + struct tracepoint *tp;
27699 +struct action_data;
27701 +typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
27702 + struct tracing_map_elt *elt, void *rec,
27703 + struct ring_buffer_event *rbe,
27704 + struct action_data *data, u64 *var_ref_vals);
27706 +struct action_data {
27708 + unsigned int n_params;
27709 + char *params[SYNTH_FIELDS_MAX];
27713 + unsigned int var_ref_idx;
27714 + char *match_event;
27715 + char *match_event_system;
27716 + char *synth_event_name;
27717 + struct synth_event *synth_event;
27723 + unsigned int max_var_ref_idx;
27724 + struct hist_field *max_var;
27725 + struct hist_field *var;
27731 +static char last_hist_cmd[MAX_FILTER_STR_VAL];
27732 +static char hist_err_str[MAX_FILTER_STR_VAL];
27734 +static void last_cmd_set(char *str)
27739 + strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
27742 +static void hist_err(char *str, char *var)
27744 + int maxlen = MAX_FILTER_STR_VAL - 1;
27749 + if (strlen(hist_err_str))
27755 + if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
27758 + strcat(hist_err_str, str);
27759 + strcat(hist_err_str, var);
27762 +static void hist_err_event(char *str, char *system, char *event, char *var)
27764 + char err[MAX_FILTER_STR_VAL];
27766 + if (system && var)
27767 + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
27769 + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
27771 + strncpy(err, var, MAX_FILTER_STR_VAL);
27773 + hist_err(str, err);
27776 +static void hist_err_clear(void)
27778 + hist_err_str[0] = '\0';
27781 +static bool have_hist_err(void)
27783 + if (strlen(hist_err_str))
27789 +static LIST_HEAD(synth_event_list);
27790 +static DEFINE_MUTEX(synth_event_mutex);
27792 +struct synth_trace_event {
27793 + struct trace_entry ent;
27797 +static int synth_event_define_fields(struct trace_event_call *call)
27799 + struct synth_trace_event trace;
27800 + int offset = offsetof(typeof(trace), fields);
27801 + struct synth_event *event = call->data;
27802 + unsigned int i, size, n_u64;
27803 + char *name, *type;
27807 + for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
27808 + size = event->fields[i]->size;
27809 + is_signed = event->fields[i]->is_signed;
27810 + type = event->fields[i]->type;
27811 + name = event->fields[i]->name;
27812 + ret = trace_define_field(call, type, name, offset, size,
27813 + is_signed, FILTER_OTHER);
27817 + if (event->fields[i]->is_string) {
27818 + offset += STR_VAR_LEN_MAX;
27819 + n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
27821 + offset += sizeof(u64);
27826 + event->n_u64 = n_u64;
27831 +static bool synth_field_signed(char *type)
27833 + if (strncmp(type, "u", 1) == 0)
27839 +static int synth_field_is_string(char *type)
27841 + if (strstr(type, "char[") != NULL)
27847 +static int synth_field_string_size(char *type)
27849 + char buf[4], *end, *start;
27850 + unsigned int len;
27853 + start = strstr(type, "char[");
27854 + if (start == NULL)
27856 + start += strlen("char[");
27858 + end = strchr(type, ']');
27859 + if (!end || end < start)
27862 + len = end - start;
27866 + strncpy(buf, start, len);
27869 + err = kstrtouint(buf, 0, &size);
27873 + if (size > STR_VAR_LEN_MAX)
27879 +static int synth_field_size(char *type)
27883 + if (strcmp(type, "s64") == 0)
27884 + size = sizeof(s64);
27885 + else if (strcmp(type, "u64") == 0)
27886 + size = sizeof(u64);
27887 + else if (strcmp(type, "s32") == 0)
27888 + size = sizeof(s32);
27889 + else if (strcmp(type, "u32") == 0)
27890 + size = sizeof(u32);
27891 + else if (strcmp(type, "s16") == 0)
27892 + size = sizeof(s16);
27893 + else if (strcmp(type, "u16") == 0)
27894 + size = sizeof(u16);
27895 + else if (strcmp(type, "s8") == 0)
27896 + size = sizeof(s8);
27897 + else if (strcmp(type, "u8") == 0)
27898 + size = sizeof(u8);
27899 + else if (strcmp(type, "char") == 0)
27900 + size = sizeof(char);
27901 + else if (strcmp(type, "unsigned char") == 0)
27902 + size = sizeof(unsigned char);
27903 + else if (strcmp(type, "int") == 0)
27904 + size = sizeof(int);
27905 + else if (strcmp(type, "unsigned int") == 0)
27906 + size = sizeof(unsigned int);
27907 + else if (strcmp(type, "long") == 0)
27908 + size = sizeof(long);
27909 + else if (strcmp(type, "unsigned long") == 0)
27910 + size = sizeof(unsigned long);
27911 + else if (strcmp(type, "pid_t") == 0)
27912 + size = sizeof(pid_t);
27913 + else if (synth_field_is_string(type))
27914 + size = synth_field_string_size(type);
27919 +static const char *synth_field_fmt(char *type)
27921 + const char *fmt = "%llu";
27923 + if (strcmp(type, "s64") == 0)
27925 + else if (strcmp(type, "u64") == 0)
27927 + else if (strcmp(type, "s32") == 0)
27929 + else if (strcmp(type, "u32") == 0)
27931 + else if (strcmp(type, "s16") == 0)
27933 + else if (strcmp(type, "u16") == 0)
27935 + else if (strcmp(type, "s8") == 0)
27937 + else if (strcmp(type, "u8") == 0)
27939 + else if (strcmp(type, "char") == 0)
27941 + else if (strcmp(type, "unsigned char") == 0)
27943 + else if (strcmp(type, "int") == 0)
27945 + else if (strcmp(type, "unsigned int") == 0)
27947 + else if (strcmp(type, "long") == 0)
27949 + else if (strcmp(type, "unsigned long") == 0)
27951 + else if (strcmp(type, "pid_t") == 0)
27953 + else if (synth_field_is_string(type))
27959 +static enum print_line_t print_synth_event(struct trace_iterator *iter,
27961 + struct trace_event *event)
27963 + struct trace_array *tr = iter->tr;
27964 + struct trace_seq *s = &iter->seq;
27965 + struct synth_trace_event *entry;
27966 + struct synth_event *se;
27967 + unsigned int i, n_u64;
27968 + char print_fmt[32];
27971 + entry = (struct synth_trace_event *)iter->ent;
27972 + se = container_of(event, struct synth_event, call.event);
27974 + trace_seq_printf(s, "%s: ", se->name);
27976 + for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
27977 + if (trace_seq_has_overflowed(s))
27980 + fmt = synth_field_fmt(se->fields[i]->type);
27982 + /* parameter types */
27983 + if (tr->trace_flags & TRACE_ITER_VERBOSE)
27984 + trace_seq_printf(s, "%s ", fmt);
27986 + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
27988 + /* parameter values */
27989 + if (se->fields[i]->is_string) {
27990 + trace_seq_printf(s, print_fmt, se->fields[i]->name,
27991 + (char *)&entry->fields[n_u64],
27992 + i == se->n_fields - 1 ? "" : " ");
27993 + n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
27995 + trace_seq_printf(s, print_fmt, se->fields[i]->name,
27996 + entry->fields[n_u64],
27997 + i == se->n_fields - 1 ? "" : " ");
28002 + trace_seq_putc(s, '\n');
28004 + return trace_handle_return(s);
28007 +static struct trace_event_functions synth_event_funcs = {
28008 + .trace = print_synth_event
28011 +static notrace void trace_event_raw_event_synth(void *__data,
28012 + u64 *var_ref_vals,
28013 + unsigned int var_ref_idx)
28015 + struct trace_event_file *trace_file = __data;
28016 + struct synth_trace_event *entry;
28017 + struct trace_event_buffer fbuffer;
28018 + struct ring_buffer *buffer;
28019 + struct synth_event *event;
28020 + unsigned int i, n_u64;
28021 + int fields_size = 0;
28023 + event = trace_file->event_call->data;
28025 + if (trace_trigger_soft_disabled(trace_file))
28028 + fields_size = event->n_u64 * sizeof(u64);
28031 + * Avoid ring buffer recursion detection, as this event
28032 + * is being performed within another event.
28034 + buffer = trace_file->tr->trace_buffer.buffer;
28035 + ring_buffer_nest_start(buffer);
28037 + entry = trace_event_buffer_reserve(&fbuffer, trace_file,
28038 + sizeof(*entry) + fields_size);
28042 + for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
28043 + if (event->fields[i]->is_string) {
28044 + char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
28045 + char *str_field = (char *)&entry->fields[n_u64];
28047 + strscpy(str_field, str_val, STR_VAR_LEN_MAX);
28048 + n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28050 + entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
28055 + trace_event_buffer_commit(&fbuffer);
28057 + ring_buffer_nest_end(buffer);
28060 +static void free_synth_event_print_fmt(struct trace_event_call *call)
28063 + kfree(call->print_fmt);
28064 + call->print_fmt = NULL;
28068 +static int __set_synth_event_print_fmt(struct synth_event *event,
28069 + char *buf, int len)
28075 + /* When len=0, we just calculate the needed length */
28076 +#define LEN_OR_ZERO (len ? len - pos : 0)
28078 + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
28079 + for (i = 0; i < event->n_fields; i++) {
28080 + fmt = synth_field_fmt(event->fields[i]->type);
28081 + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
28082 + event->fields[i]->name, fmt,
28083 + i == event->n_fields - 1 ? "" : ", ");
28085 + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
28087 + for (i = 0; i < event->n_fields; i++) {
28088 + pos += snprintf(buf + pos, LEN_OR_ZERO,
28089 + ", REC->%s", event->fields[i]->name);
28092 +#undef LEN_OR_ZERO
28094 + /* return the length of print_fmt */
28098 +static int set_synth_event_print_fmt(struct trace_event_call *call)
28100 + struct synth_event *event = call->data;
28104 + /* First: called with 0 length to calculate the needed length */
28105 + len = __set_synth_event_print_fmt(event, NULL, 0);
28107 + print_fmt = kmalloc(len + 1, GFP_KERNEL);
28111 + /* Second: actually write the @print_fmt */
28112 + __set_synth_event_print_fmt(event, print_fmt, len + 1);
28113 + call->print_fmt = print_fmt;
28118 +static void free_synth_field(struct synth_field *field)
28120 + kfree(field->type);
28121 + kfree(field->name);
28125 +static struct synth_field *parse_synth_field(char *field_type,
28126 + char *field_name)
28128 + struct synth_field *field;
28129 + int len, ret = 0;
28132 + if (field_type[0] == ';')
28135 + len = strlen(field_name);
28136 + if (field_name[len - 1] == ';')
28137 + field_name[len - 1] = '\0';
28139 + field = kzalloc(sizeof(*field), GFP_KERNEL);
28141 + return ERR_PTR(-ENOMEM);
28143 + len = strlen(field_type) + 1;
28144 + array = strchr(field_name, '[');
28146 + len += strlen(array);
28147 + field->type = kzalloc(len, GFP_KERNEL);
28148 + if (!field->type) {
28152 + strcat(field->type, field_type);
28154 + strcat(field->type, array);
28158 + field->size = synth_field_size(field->type);
28159 + if (!field->size) {
28164 + if (synth_field_is_string(field->type))
28165 + field->is_string = true;
28167 + field->is_signed = synth_field_signed(field->type);
28169 + field->name = kstrdup(field_name, GFP_KERNEL);
28170 + if (!field->name) {
28177 + free_synth_field(field);
28178 + field = ERR_PTR(ret);
28182 +static void free_synth_tracepoint(struct tracepoint *tp)
28191 +static struct tracepoint *alloc_synth_tracepoint(char *name)
28193 + struct tracepoint *tp;
28195 + tp = kzalloc(sizeof(*tp), GFP_KERNEL);
28197 + return ERR_PTR(-ENOMEM);
28199 + tp->name = kstrdup(name, GFP_KERNEL);
28202 + return ERR_PTR(-ENOMEM);
28208 +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
28209 + unsigned int var_ref_idx);
28211 +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
28212 + unsigned int var_ref_idx)
28214 + struct tracepoint *tp = event->tp;
28216 + if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
28217 + struct tracepoint_func *probe_func_ptr;
28218 + synth_probe_func_t probe_func;
28221 + if (!(cpu_online(raw_smp_processor_id())))
28224 + probe_func_ptr = rcu_dereference_sched((tp)->funcs);
28225 + if (probe_func_ptr) {
28227 + probe_func = probe_func_ptr->func;
28228 + __data = probe_func_ptr->data;
28229 + probe_func(__data, var_ref_vals, var_ref_idx);
28230 + } while ((++probe_func_ptr)->func);
28235 +static struct synth_event *find_synth_event(const char *name)
28237 + struct synth_event *event;
28239 + list_for_each_entry(event, &synth_event_list, list) {
28240 + if (strcmp(event->name, name) == 0)
28247 +static int register_synth_event(struct synth_event *event)
28249 + struct trace_event_call *call = &event->call;
28252 + event->call.class = &event->class;
28253 + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
28254 + if (!event->class.system) {
28259 + event->tp = alloc_synth_tracepoint(event->name);
28260 + if (IS_ERR(event->tp)) {
28261 + ret = PTR_ERR(event->tp);
28262 + event->tp = NULL;
28266 + INIT_LIST_HEAD(&call->class->fields);
28267 + call->event.funcs = &synth_event_funcs;
28268 + call->class->define_fields = synth_event_define_fields;
28270 + ret = register_trace_event(&call->event);
28275 + call->flags = TRACE_EVENT_FL_TRACEPOINT;
28276 + call->class->reg = trace_event_reg;
28277 + call->class->probe = trace_event_raw_event_synth;
28278 + call->data = event;
28279 + call->tp = event->tp;
28281 + ret = trace_add_event_call(call);
28283 + pr_warn("Failed to register synthetic event: %s\n",
28284 + trace_event_name(call));
28288 + ret = set_synth_event_print_fmt(call);
28290 + trace_remove_event_call(call);
28296 + unregister_trace_event(&call->event);
28300 +static int unregister_synth_event(struct synth_event *event)
28302 + struct trace_event_call *call = &event->call;
28305 + ret = trace_remove_event_call(call);
28310 +static void free_synth_event(struct synth_event *event)
28317 + for (i = 0; i < event->n_fields; i++)
28318 + free_synth_field(event->fields[i]);
28320 + kfree(event->fields);
28321 + kfree(event->name);
28322 + kfree(event->class.system);
28323 + free_synth_tracepoint(event->tp);
28324 + free_synth_event_print_fmt(&event->call);
28328 +static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
28329 + struct synth_field **fields)
28331 + struct synth_event *event;
28334 + event = kzalloc(sizeof(*event), GFP_KERNEL);
28336 + event = ERR_PTR(-ENOMEM);
28340 + event->name = kstrdup(event_name, GFP_KERNEL);
28341 + if (!event->name) {
28343 + event = ERR_PTR(-ENOMEM);
28347 + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
28348 + if (!event->fields) {
28349 + free_synth_event(event);
28350 + event = ERR_PTR(-ENOMEM);
28354 + for (i = 0; i < n_fields; i++)
28355 + event->fields[i] = fields[i];
28357 + event->n_fields = n_fields;
28362 +static void action_trace(struct hist_trigger_data *hist_data,
28363 + struct tracing_map_elt *elt, void *rec,
28364 + struct ring_buffer_event *rbe,
28365 + struct action_data *data, u64 *var_ref_vals)
28367 + struct synth_event *event = data->onmatch.synth_event;
28369 + trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
28372 +struct hist_var_data {
28373 + struct list_head list;
28374 + struct hist_trigger_data *hist_data;
28377 +static void add_or_delete_synth_event(struct synth_event *event, int delete)
28380 + free_synth_event(event);
28382 + mutex_lock(&synth_event_mutex);
28383 + if (!find_synth_event(event->name))
28384 + list_add(&event->list, &synth_event_list);
28386 + free_synth_event(event);
28387 + mutex_unlock(&synth_event_mutex);
28391 +static int create_synth_event(int argc, char **argv)
28393 + struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
28394 + struct synth_event *event = NULL;
28395 + bool delete_event = false;
28396 + int i, n_fields = 0, ret = 0;
28399 + mutex_lock(&synth_event_mutex);
28402 + * Argument syntax:
28403 + * - Add synthetic event: <event_name> field[;field] ...
28404 + * - Remove synthetic event: !<event_name> field[;field] ...
28405 + * where 'field' = type field_name
28413 + if (name[0] == '!') {
28414 + delete_event = true;
28418 + event = find_synth_event(name);
28420 + if (delete_event) {
28421 + if (event->ref) {
28426 + list_del(&event->list);
28432 + } else if (delete_event)
28440 + for (i = 1; i < argc - 1; i++) {
28441 + if (strcmp(argv[i], ";") == 0)
28443 + if (n_fields == SYNTH_FIELDS_MAX) {
28448 + field = parse_synth_field(argv[i], argv[i + 1]);
28449 + if (IS_ERR(field)) {
28450 + ret = PTR_ERR(field);
28453 + fields[n_fields] = field;
28462 + event = alloc_synth_event(name, n_fields, fields);
28463 + if (IS_ERR(event)) {
28464 + ret = PTR_ERR(event);
28469 + mutex_unlock(&synth_event_mutex);
28472 + if (delete_event) {
28473 + ret = unregister_synth_event(event);
28474 + add_or_delete_synth_event(event, !ret);
28476 + ret = register_synth_event(event);
28477 + add_or_delete_synth_event(event, ret);
28483 + mutex_unlock(&synth_event_mutex);
28485 + for (i = 0; i < n_fields; i++)
28486 + free_synth_field(fields[i]);
28487 + free_synth_event(event);
28492 +static int release_all_synth_events(void)
28494 + struct list_head release_events;
28495 + struct synth_event *event, *e;
28498 + INIT_LIST_HEAD(&release_events);
28500 + mutex_lock(&synth_event_mutex);
28502 + list_for_each_entry(event, &synth_event_list, list) {
28503 + if (event->ref) {
28504 + mutex_unlock(&synth_event_mutex);
28509 + list_splice_init(&event->list, &release_events);
28511 + mutex_unlock(&synth_event_mutex);
28513 + list_for_each_entry_safe(event, e, &release_events, list) {
28514 + list_del(&event->list);
28516 + ret = unregister_synth_event(event);
28517 + add_or_delete_synth_event(event, !ret);
28524 +static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
28526 + mutex_lock(&synth_event_mutex);
28528 + return seq_list_start(&synth_event_list, *pos);
28531 +static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
28533 + return seq_list_next(v, &synth_event_list, pos);
28536 +static void synth_events_seq_stop(struct seq_file *m, void *v)
28538 + mutex_unlock(&synth_event_mutex);
28541 +static int synth_events_seq_show(struct seq_file *m, void *v)
28543 + struct synth_field *field;
28544 + struct synth_event *event = v;
28547 + seq_printf(m, "%s\t", event->name);
28549 + for (i = 0; i < event->n_fields; i++) {
28550 + field = event->fields[i];
28552 + /* parameter values */
28553 + seq_printf(m, "%s %s%s", field->type, field->name,
28554 + i == event->n_fields - 1 ? "" : "; ");
28557 + seq_putc(m, '\n');
28562 +static const struct seq_operations synth_events_seq_op = {
28563 + .start = synth_events_seq_start,
28564 + .next = synth_events_seq_next,
28565 + .stop = synth_events_seq_stop,
28566 + .show = synth_events_seq_show
28569 +static int synth_events_open(struct inode *inode, struct file *file)
28573 + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
28574 + ret = release_all_synth_events();
28579 + return seq_open(file, &synth_events_seq_op);
28582 +static ssize_t synth_events_write(struct file *file,
28583 + const char __user *buffer,
28584 + size_t count, loff_t *ppos)
28586 + return trace_parse_run_command(file, buffer, count, ppos,
28587 + create_synth_event);
28590 +static const struct file_operations synth_events_fops = {
28591 + .open = synth_events_open,
28592 + .write = synth_events_write,
28593 + .read = seq_read,
28594 + .llseek = seq_lseek,
28595 + .release = seq_release,
28598 +static u64 hist_field_timestamp(struct hist_field *hist_field,
28599 + struct tracing_map_elt *elt,
28600 + struct ring_buffer_event *rbe,
28603 + struct hist_trigger_data *hist_data = hist_field->hist_data;
28604 + struct trace_array *tr = hist_data->event_file->tr;
28606 + u64 ts = ring_buffer_event_time_stamp(rbe);
28608 + if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
28609 + ts = ns2usecs(ts);
28614 +static u64 hist_field_cpu(struct hist_field *hist_field,
28615 + struct tracing_map_elt *elt,
28616 + struct ring_buffer_event *rbe,
28619 + int cpu = smp_processor_id();
28624 +static struct hist_field *
28625 +check_field_for_var_ref(struct hist_field *hist_field,
28626 + struct hist_trigger_data *var_data,
28627 + unsigned int var_idx)
28629 + struct hist_field *found = NULL;
28631 + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
28632 + if (hist_field->var.idx == var_idx &&
28633 + hist_field->var.hist_data == var_data) {
28634 + found = hist_field;
28641 +static struct hist_field *
28642 +check_field_for_var_refs(struct hist_trigger_data *hist_data,
28643 + struct hist_field *hist_field,
28644 + struct hist_trigger_data *var_data,
28645 + unsigned int var_idx,
28646 + unsigned int level)
28648 + struct hist_field *found = NULL;
28657 + found = check_field_for_var_ref(hist_field, var_data, var_idx);
28661 + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
28662 + struct hist_field *operand;
28664 + operand = hist_field->operands[i];
28665 + found = check_field_for_var_refs(hist_data, operand, var_data,
28666 + var_idx, level + 1);
28674 +static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
28675 + struct hist_trigger_data *var_data,
28676 + unsigned int var_idx)
28678 + struct hist_field *hist_field, *found = NULL;
28681 + for_each_hist_field(i, hist_data) {
28682 + hist_field = hist_data->fields[i];
28683 + found = check_field_for_var_refs(hist_data, hist_field,
28684 + var_data, var_idx, 0);
28689 + for (i = 0; i < hist_data->n_synth_var_refs; i++) {
28690 + hist_field = hist_data->synth_var_refs[i];
28691 + found = check_field_for_var_refs(hist_data, hist_field,
28692 + var_data, var_idx, 0);
28700 +static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
28701 + unsigned int var_idx)
28703 + struct trace_array *tr = hist_data->event_file->tr;
28704 + struct hist_field *found = NULL;
28705 + struct hist_var_data *var_data;
28707 + list_for_each_entry(var_data, &tr->hist_vars, list) {
28708 + if (var_data->hist_data == hist_data)
28710 + found = find_var_ref(var_data->hist_data, hist_data, var_idx);
28718 +static bool check_var_refs(struct hist_trigger_data *hist_data)
28720 + struct hist_field *field;
28721 + bool found = false;
28724 + for_each_hist_field(i, hist_data) {
28725 + field = hist_data->fields[i];
28726 + if (field && field->flags & HIST_FIELD_FL_VAR) {
28727 + if (find_any_var_ref(hist_data, field->var.idx)) {
28737 +static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
28739 + struct trace_array *tr = hist_data->event_file->tr;
28740 + struct hist_var_data *var_data, *found = NULL;
28742 + list_for_each_entry(var_data, &tr->hist_vars, list) {
28743 + if (var_data->hist_data == hist_data) {
28744 + found = var_data;
28752 +static bool field_has_hist_vars(struct hist_field *hist_field,
28753 + unsigned int level)
28763 + if (hist_field->flags & HIST_FIELD_FL_VAR ||
28764 + hist_field->flags & HIST_FIELD_FL_VAR_REF)
28767 + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
28768 + struct hist_field *operand;
28770 + operand = hist_field->operands[i];
28771 + if (field_has_hist_vars(operand, level + 1))
28778 +static bool has_hist_vars(struct hist_trigger_data *hist_data)
28780 + struct hist_field *hist_field;
28783 + for_each_hist_field(i, hist_data) {
28784 + hist_field = hist_data->fields[i];
28785 + if (field_has_hist_vars(hist_field, 0))
28792 +static int save_hist_vars(struct hist_trigger_data *hist_data)
28794 + struct trace_array *tr = hist_data->event_file->tr;
28795 + struct hist_var_data *var_data;
28797 + var_data = find_hist_vars(hist_data);
28801 + if (trace_array_get(tr) < 0)
28804 + var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
28806 + trace_array_put(tr);
28810 + var_data->hist_data = hist_data;
28811 + list_add(&var_data->list, &tr->hist_vars);
28816 +static void remove_hist_vars(struct hist_trigger_data *hist_data)
28818 + struct trace_array *tr = hist_data->event_file->tr;
28819 + struct hist_var_data *var_data;
28821 + var_data = find_hist_vars(hist_data);
28825 + if (WARN_ON(check_var_refs(hist_data)))
28828 + list_del(&var_data->list);
28832 + trace_array_put(tr);
28835 +static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
28836 + const char *var_name)
28838 + struct hist_field *hist_field, *found = NULL;
28841 + for_each_hist_field(i, hist_data) {
28842 + hist_field = hist_data->fields[i];
28843 + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
28844 + strcmp(hist_field->var.name, var_name) == 0) {
28845 + found = hist_field;
28853 +static struct hist_field *find_var(struct hist_trigger_data *hist_data,
28854 + struct trace_event_file *file,
28855 + const char *var_name)
28857 + struct hist_trigger_data *test_data;
28858 + struct event_trigger_data *test;
28859 + struct hist_field *hist_field;
28861 + hist_field = find_var_field(hist_data, var_name);
28863 + return hist_field;
28865 + list_for_each_entry_rcu(test, &file->triggers, list) {
28866 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
28867 + test_data = test->private_data;
28868 + hist_field = find_var_field(test_data, var_name);
28870 + return hist_field;
28877 +static struct trace_event_file *find_var_file(struct trace_array *tr,
28879 + char *event_name,
28882 + struct hist_trigger_data *var_hist_data;
28883 + struct hist_var_data *var_data;
28884 + struct trace_event_file *file, *found = NULL;
28887 + return find_event_file(tr, system, event_name);
28889 + list_for_each_entry(var_data, &tr->hist_vars, list) {
28890 + var_hist_data = var_data->hist_data;
28891 + file = var_hist_data->event_file;
28892 + if (file == found)
28895 + if (find_var_field(var_hist_data, var_name)) {
28897 + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
28908 +static struct hist_field *find_file_var(struct trace_event_file *file,
28909 + const char *var_name)
28911 + struct hist_trigger_data *test_data;
28912 + struct event_trigger_data *test;
28913 + struct hist_field *hist_field;
28915 + list_for_each_entry_rcu(test, &file->triggers, list) {
28916 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
28917 + test_data = test->private_data;
28918 + hist_field = find_var_field(test_data, var_name);
28920 + return hist_field;
28927 +static struct hist_field *
28928 +find_match_var(struct hist_trigger_data *hist_data, char *var_name)
28930 + struct trace_array *tr = hist_data->event_file->tr;
28931 + struct hist_field *hist_field, *found = NULL;
28932 + struct trace_event_file *file;
28935 + for (i = 0; i < hist_data->n_actions; i++) {
28936 + struct action_data *data = hist_data->actions[i];
28938 + if (data->fn == action_trace) {
28939 + char *system = data->onmatch.match_event_system;
28940 + char *event_name = data->onmatch.match_event;
28942 + file = find_var_file(tr, system, event_name, var_name);
28945 + hist_field = find_file_var(file, var_name);
28946 + if (hist_field) {
28948 + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
28949 + return ERR_PTR(-EINVAL);
28952 + found = hist_field;
28959 +static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
28961 + char *event_name,
28964 + struct trace_array *tr = hist_data->event_file->tr;
28965 + struct hist_field *hist_field = NULL;
28966 + struct trace_event_file *file;
28968 + if (!system || !event_name) {
28969 + hist_field = find_match_var(hist_data, var_name);
28970 + if (IS_ERR(hist_field))
28973 + return hist_field;
28976 + file = find_var_file(tr, system, event_name, var_name);
28980 + hist_field = find_file_var(file, var_name);
28982 + return hist_field;
28985 +struct hist_elt_data {
28987 + u64 *var_ref_vals;
28988 + char *field_var_str[SYNTH_FIELDS_MAX];
28991 +static u64 hist_field_var_ref(struct hist_field *hist_field,
28992 + struct tracing_map_elt *elt,
28993 + struct ring_buffer_event *rbe,
28996 + struct hist_elt_data *elt_data;
28999 + elt_data = elt->private_data;
29000 + var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
29005 +static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
29006 + u64 *var_ref_vals, bool self)
29008 + struct hist_trigger_data *var_data;
29009 + struct tracing_map_elt *var_elt;
29010 + struct hist_field *hist_field;
29011 + unsigned int i, var_idx;
29012 + bool resolved = true;
29015 + for (i = 0; i < hist_data->n_var_refs; i++) {
29016 + hist_field = hist_data->var_refs[i];
29017 + var_idx = hist_field->var.idx;
29018 + var_data = hist_field->var.hist_data;
29020 + if (var_data == NULL) {
29021 + resolved = false;
29025 + if ((self && var_data != hist_data) ||
29026 + (!self && var_data == hist_data))
29029 + var_elt = tracing_map_lookup(var_data->map, key);
29031 + resolved = false;
29035 + if (!tracing_map_var_set(var_elt, var_idx)) {
29036 + resolved = false;
29040 + if (self || !hist_field->read_once)
29041 + var_val = tracing_map_read_var(var_elt, var_idx);
29043 + var_val = tracing_map_read_var_once(var_elt, var_idx);
29045 + var_ref_vals[i] = var_val;
29051 +static const char *hist_field_name(struct hist_field *field,
29052 + unsigned int level)
29054 + const char *field_name = "";
29057 + return field_name;
29059 + if (field->field)
29060 + field_name = field->field->name;
29061 + else if (field->flags & HIST_FIELD_FL_LOG2 ||
29062 + field->flags & HIST_FIELD_FL_ALIAS)
29063 + field_name = hist_field_name(field->operands[0], ++level);
29064 + else if (field->flags & HIST_FIELD_FL_CPU)
29065 + field_name = "cpu";
29066 + else if (field->flags & HIST_FIELD_FL_EXPR ||
29067 + field->flags & HIST_FIELD_FL_VAR_REF) {
29068 + if (field->system) {
29069 + static char full_name[MAX_FILTER_STR_VAL];
29071 + strcat(full_name, field->system);
29072 + strcat(full_name, ".");
29073 + strcat(full_name, field->event_name);
29074 + strcat(full_name, ".");
29075 + strcat(full_name, field->name);
29076 + field_name = full_name;
29078 + field_name = field->name;
29079 + } else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
29080 + field_name = "common_timestamp";
29082 + if (field_name == NULL)
29085 + return field_name;
29088 static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
29090 hist_field_fn_t fn = NULL;
29091 @@ -207,16 +1771,119 @@
29093 static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
29100 + for (i = 0; i < attrs->n_assignments; i++)
29101 + kfree(attrs->assignment_str[i]);
29103 + for (i = 0; i < attrs->n_actions; i++)
29104 + kfree(attrs->action_str[i]);
29106 kfree(attrs->name);
29107 kfree(attrs->sort_key_str);
29108 kfree(attrs->keys_str);
29109 kfree(attrs->vals_str);
29110 + kfree(attrs->clock);
29114 +static int parse_action(char *str, struct hist_trigger_attrs *attrs)
29116 + int ret = -EINVAL;
29118 + if (attrs->n_actions >= HIST_ACTIONS_MAX)
29121 + if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
29122 + (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
29123 + attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
29124 + if (!attrs->action_str[attrs->n_actions]) {
29128 + attrs->n_actions++;
29135 +static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
29139 + if ((strncmp(str, "key=", strlen("key=")) == 0) ||
29140 + (strncmp(str, "keys=", strlen("keys=")) == 0)) {
29141 + attrs->keys_str = kstrdup(str, GFP_KERNEL);
29142 + if (!attrs->keys_str) {
29146 + } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
29147 + (strncmp(str, "vals=", strlen("vals=")) == 0) ||
29148 + (strncmp(str, "values=", strlen("values=")) == 0)) {
29149 + attrs->vals_str = kstrdup(str, GFP_KERNEL);
29150 + if (!attrs->vals_str) {
29154 + } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
29155 + attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
29156 + if (!attrs->sort_key_str) {
29160 + } else if (strncmp(str, "name=", strlen("name=")) == 0) {
29161 + attrs->name = kstrdup(str, GFP_KERNEL);
29162 + if (!attrs->name) {
29166 + } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
29167 + strsep(&str, "=");
29173 + str = strstrip(str);
29174 + attrs->clock = kstrdup(str, GFP_KERNEL);
29175 + if (!attrs->clock) {
29179 + } else if (strncmp(str, "size=", strlen("size=")) == 0) {
29180 + int map_bits = parse_map_size(str);
29182 + if (map_bits < 0) {
29186 + attrs->map_bits = map_bits;
29188 + char *assignment;
29190 + if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
29191 + hist_err("Too many variables defined: ", str);
29196 + assignment = kstrdup(str, GFP_KERNEL);
29197 + if (!assignment) {
29202 + attrs->assignment_str[attrs->n_assignments++] = assignment;
29208 static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
29210 struct hist_trigger_attrs *attrs;
29211 @@ -229,35 +1896,21 @@
29212 while (trigger_str) {
29213 char *str = strsep(&trigger_str, ":");
29215 - if ((strncmp(str, "key=", strlen("key=")) == 0) ||
29216 - (strncmp(str, "keys=", strlen("keys=")) == 0))
29217 - attrs->keys_str = kstrdup(str, GFP_KERNEL);
29218 - else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
29219 - (strncmp(str, "vals=", strlen("vals=")) == 0) ||
29220 - (strncmp(str, "values=", strlen("values=")) == 0))
29221 - attrs->vals_str = kstrdup(str, GFP_KERNEL);
29222 - else if (strncmp(str, "sort=", strlen("sort=")) == 0)
29223 - attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
29224 - else if (strncmp(str, "name=", strlen("name=")) == 0)
29225 - attrs->name = kstrdup(str, GFP_KERNEL);
29226 - else if (strcmp(str, "pause") == 0)
29227 + if (strchr(str, '=')) {
29228 + ret = parse_assignment(str, attrs);
29231 + } else if (strcmp(str, "pause") == 0)
29232 attrs->pause = true;
29233 else if ((strcmp(str, "cont") == 0) ||
29234 (strcmp(str, "continue") == 0))
29235 attrs->cont = true;
29236 else if (strcmp(str, "clear") == 0)
29237 attrs->clear = true;
29238 - else if (strncmp(str, "size=", strlen("size=")) == 0) {
29239 - int map_bits = parse_map_size(str);
29241 - if (map_bits < 0) {
29244 + ret = parse_action(str, attrs);
29248 - attrs->map_bits = map_bits;
29255 @@ -266,6 +1919,14 @@
29259 + if (!attrs->clock) {
29260 + attrs->clock = kstrdup("global", GFP_KERNEL);
29261 + if (!attrs->clock) {
29269 destroy_hist_trigger_attrs(attrs);
29270 @@ -288,65 +1949,222 @@
29271 memcpy(comm, task->comm, TASK_COMM_LEN);
29274 -static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
29275 +static void hist_elt_data_free(struct hist_elt_data *elt_data)
29277 - kfree((char *)elt->private_data);
29280 + for (i = 0; i < SYNTH_FIELDS_MAX; i++)
29281 + kfree(elt_data->field_var_str[i]);
29283 + kfree(elt_data->comm);
29287 -static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
29288 +static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
29290 + struct hist_elt_data *elt_data = elt->private_data;
29292 + hist_elt_data_free(elt_data);
29295 +static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
29297 struct hist_trigger_data *hist_data = elt->map->private_data;
29298 + unsigned int size = TASK_COMM_LEN;
29299 + struct hist_elt_data *elt_data;
29300 struct hist_field *key_field;
29302 + unsigned int i, n_str;
29304 + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
29308 for_each_hist_key_field(i, hist_data) {
29309 key_field = hist_data->fields[i];
29311 if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
29312 - unsigned int size = TASK_COMM_LEN + 1;
29314 - elt->private_data = kzalloc(size, GFP_KERNEL);
29315 - if (!elt->private_data)
29316 + elt_data->comm = kzalloc(size, GFP_KERNEL);
29317 + if (!elt_data->comm) {
29325 + n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
29327 + size = STR_VAR_LEN_MAX;
29329 + for (i = 0; i < n_str; i++) {
29330 + elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
29331 + if (!elt_data->field_var_str[i]) {
29332 + hist_elt_data_free(elt_data);
29337 + elt->private_data = elt_data;
29342 -static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
29343 - struct tracing_map_elt *from)
29344 +static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
29346 - char *comm_from = from->private_data;
29347 - char *comm_to = to->private_data;
29348 + struct hist_elt_data *elt_data = elt->private_data;
29351 - memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
29352 + if (elt_data->comm)
29353 + save_comm(elt_data->comm, current);
29356 -static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
29357 +static const struct tracing_map_ops hist_trigger_elt_data_ops = {
29358 + .elt_alloc = hist_trigger_elt_data_alloc,
29359 + .elt_free = hist_trigger_elt_data_free,
29360 + .elt_init = hist_trigger_elt_data_init,
29363 +static const char *get_hist_field_flags(struct hist_field *hist_field)
29365 - char *comm = elt->private_data;
29366 + const char *flags_str = NULL;
29369 - save_comm(comm, current);
29370 + if (hist_field->flags & HIST_FIELD_FL_HEX)
29371 + flags_str = "hex";
29372 + else if (hist_field->flags & HIST_FIELD_FL_SYM)
29373 + flags_str = "sym";
29374 + else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
29375 + flags_str = "sym-offset";
29376 + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
29377 + flags_str = "execname";
29378 + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
29379 + flags_str = "syscall";
29380 + else if (hist_field->flags & HIST_FIELD_FL_LOG2)
29381 + flags_str = "log2";
29382 + else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
29383 + flags_str = "usecs";
29385 + return flags_str;
29388 -static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
29389 - .elt_alloc = hist_trigger_elt_comm_alloc,
29390 - .elt_copy = hist_trigger_elt_comm_copy,
29391 - .elt_free = hist_trigger_elt_comm_free,
29392 - .elt_init = hist_trigger_elt_comm_init,
29394 +static void expr_field_str(struct hist_field *field, char *expr)
29396 + if (field->flags & HIST_FIELD_FL_VAR_REF)
29397 + strcat(expr, "$");
29399 -static void destroy_hist_field(struct hist_field *hist_field)
29400 + strcat(expr, hist_field_name(field, 0));
29402 + if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
29403 + const char *flags_str = get_hist_field_flags(field);
29406 + strcat(expr, ".");
29407 + strcat(expr, flags_str);
29412 +static char *expr_str(struct hist_field *field, unsigned int level)
29419 + expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
29423 + if (!field->operands[0]) {
29424 + expr_field_str(field, expr);
29428 + if (field->operator == FIELD_OP_UNARY_MINUS) {
29431 + strcat(expr, "-(");
29432 + subexpr = expr_str(field->operands[0], ++level);
29437 + strcat(expr, subexpr);
29438 + strcat(expr, ")");
29445 + expr_field_str(field->operands[0], expr);
29447 + switch (field->operator) {
29448 + case FIELD_OP_MINUS:
29449 + strcat(expr, "-");
29451 + case FIELD_OP_PLUS:
29452 + strcat(expr, "+");
29459 + expr_field_str(field->operands[1], expr);
29464 +static int contains_operator(char *str)
29466 + enum field_op_id field_op = FIELD_OP_NONE;
29469 + op = strpbrk(str, "+-");
29471 + return FIELD_OP_NONE;
29476 + field_op = FIELD_OP_UNARY_MINUS;
29478 + field_op = FIELD_OP_MINUS;
29481 + field_op = FIELD_OP_PLUS;
29490 +static void destroy_hist_field(struct hist_field *hist_field,
29491 + unsigned int level)
29501 + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
29502 + destroy_hist_field(hist_field->operands[i], level + 1);
29504 + kfree(hist_field->var.name);
29505 + kfree(hist_field->name);
29506 + kfree(hist_field->type);
29511 -static struct hist_field *create_hist_field(struct ftrace_event_field *field,
29512 - unsigned long flags)
29513 +static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
29514 + struct ftrace_event_field *field,
29515 + unsigned long flags,
29518 struct hist_field *hist_field;
29520 @@ -357,8 +2175,22 @@
29524 + hist_field->hist_data = hist_data;
29526 + if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
29527 + goto out; /* caller will populate */
29529 + if (flags & HIST_FIELD_FL_VAR_REF) {
29530 + hist_field->fn = hist_field_var_ref;
29534 if (flags & HIST_FIELD_FL_HITCOUNT) {
29535 hist_field->fn = hist_field_counter;
29536 + hist_field->size = sizeof(u64);
29537 + hist_field->type = kstrdup("u64", GFP_KERNEL);
29538 + if (!hist_field->type)
29543 @@ -368,7 +2200,31 @@
29546 if (flags & HIST_FIELD_FL_LOG2) {
29547 + unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
29548 hist_field->fn = hist_field_log2;
29549 + hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
29550 + hist_field->size = hist_field->operands[0]->size;
29551 + hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
29552 + if (!hist_field->type)
29557 + if (flags & HIST_FIELD_FL_TIMESTAMP) {
29558 + hist_field->fn = hist_field_timestamp;
29559 + hist_field->size = sizeof(u64);
29560 + hist_field->type = kstrdup("u64", GFP_KERNEL);
29561 + if (!hist_field->type)
29566 + if (flags & HIST_FIELD_FL_CPU) {
29567 + hist_field->fn = hist_field_cpu;
29568 + hist_field->size = sizeof(int);
29569 + hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
29570 + if (!hist_field->type)
29575 @@ -378,6 +2234,11 @@
29576 if (is_string_field(field)) {
29577 flags |= HIST_FIELD_FL_STRING;
29579 + hist_field->size = MAX_FILTER_STR_VAL;
29580 + hist_field->type = kstrdup(field->type, GFP_KERNEL);
29581 + if (!hist_field->type)
29584 if (field->filter_type == FILTER_STATIC_STRING)
29585 hist_field->fn = hist_field_string;
29586 else if (field->filter_type == FILTER_DYN_STRING)
29587 @@ -385,10 +2246,16 @@
29589 hist_field->fn = hist_field_pstring;
29591 + hist_field->size = field->size;
29592 + hist_field->is_signed = field->is_signed;
29593 + hist_field->type = kstrdup(field->type, GFP_KERNEL);
29594 + if (!hist_field->type)
29597 hist_field->fn = select_value_fn(field->size,
29599 if (!hist_field->fn) {
29600 - destroy_hist_field(hist_field);
29601 + destroy_hist_field(hist_field, 0);
29605 @@ -396,84 +2263,1636 @@
29606 hist_field->field = field;
29607 hist_field->flags = flags;
29610 + hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
29611 + if (!hist_field->var.name)
29617 + destroy_hist_field(hist_field, 0);
29621 static void destroy_hist_fields(struct hist_trigger_data *hist_data)
29625 - for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
29626 + for (i = 0; i < HIST_FIELDS_MAX; i++) {
29627 if (hist_data->fields[i]) {
29628 - destroy_hist_field(hist_data->fields[i]);
29629 + destroy_hist_field(hist_data->fields[i], 0);
29630 hist_data->fields[i] = NULL;
29635 -static int create_hitcount_val(struct hist_trigger_data *hist_data)
29636 +static int init_var_ref(struct hist_field *ref_field,
29637 + struct hist_field *var_field,
29638 + char *system, char *event_name)
29640 - hist_data->fields[HITCOUNT_IDX] =
29641 - create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
29642 - if (!hist_data->fields[HITCOUNT_IDX])
29646 - hist_data->n_vals++;
29647 + ref_field->var.idx = var_field->var.idx;
29648 + ref_field->var.hist_data = var_field->hist_data;
29649 + ref_field->size = var_field->size;
29650 + ref_field->is_signed = var_field->is_signed;
29651 + ref_field->flags |= var_field->flags &
29652 + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
29654 - if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
29656 + ref_field->system = kstrdup(system, GFP_KERNEL);
29657 + if (!ref_field->system)
29661 + if (event_name) {
29662 + ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
29663 + if (!ref_field->event_name) {
29669 + if (var_field->var.name) {
29670 + ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
29671 + if (!ref_field->name) {
29675 + } else if (var_field->name) {
29676 + ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
29677 + if (!ref_field->name) {
29683 + ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
29684 + if (!ref_field->type) {
29691 + kfree(ref_field->system);
29692 + kfree(ref_field->event_name);
29693 + kfree(ref_field->name);
29698 +static struct hist_field *create_var_ref(struct hist_field *var_field,
29699 + char *system, char *event_name)
29701 + unsigned long flags = HIST_FIELD_FL_VAR_REF;
29702 + struct hist_field *ref_field;
29704 + ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
29706 + if (init_var_ref(ref_field, var_field, system, event_name)) {
29707 + destroy_hist_field(ref_field, 0);
29712 + return ref_field;
29715 +static bool is_var_ref(char *var_name)
29717 + if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
29723 +static char *field_name_from_var(struct hist_trigger_data *hist_data,
29726 + char *name, *field;
29729 + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
29730 + name = hist_data->attrs->var_defs.name[i];
29732 + if (strcmp(var_name, name) == 0) {
29733 + field = hist_data->attrs->var_defs.expr[i];
29734 + if (contains_operator(field) || is_var_ref(field))
29743 +static char *local_field_var_ref(struct hist_trigger_data *hist_data,
29744 + char *system, char *event_name,
29747 + struct trace_event_call *call;
29749 + if (system && event_name) {
29750 + call = hist_data->event_file->event_call;
29752 + if (strcmp(system, call->class->system) != 0)
29755 + if (strcmp(event_name, trace_event_name(call)) != 0)
29759 + if (!!system != !!event_name)
29762 + if (!is_var_ref(var_name))
29767 + return field_name_from_var(hist_data, var_name);
29770 +static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
29771 + char *system, char *event_name,
29774 + struct hist_field *var_field = NULL, *ref_field = NULL;
29776 + if (!is_var_ref(var_name))
29781 + var_field = find_event_var(hist_data, system, event_name, var_name);
29783 + ref_field = create_var_ref(var_field, system, event_name);
29786 + hist_err_event("Couldn't find variable: $",
29787 + system, event_name, var_name);
29789 + return ref_field;
29792 +static struct ftrace_event_field *
29793 +parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
29794 + char *field_str, unsigned long *flags)
29796 + struct ftrace_event_field *field = NULL;
29797 + char *field_name, *modifier, *str;
29799 + modifier = str = kstrdup(field_str, GFP_KERNEL);
29801 + return ERR_PTR(-ENOMEM);
29803 + field_name = strsep(&modifier, ".");
29805 + if (strcmp(modifier, "hex") == 0)
29806 + *flags |= HIST_FIELD_FL_HEX;
29807 + else if (strcmp(modifier, "sym") == 0)
29808 + *flags |= HIST_FIELD_FL_SYM;
29809 + else if (strcmp(modifier, "sym-offset") == 0)
29810 + *flags |= HIST_FIELD_FL_SYM_OFFSET;
29811 + else if ((strcmp(modifier, "execname") == 0) &&
29812 + (strcmp(field_name, "common_pid") == 0))
29813 + *flags |= HIST_FIELD_FL_EXECNAME;
29814 + else if (strcmp(modifier, "syscall") == 0)
29815 + *flags |= HIST_FIELD_FL_SYSCALL;
29816 + else if (strcmp(modifier, "log2") == 0)
29817 + *flags |= HIST_FIELD_FL_LOG2;
29818 + else if (strcmp(modifier, "usecs") == 0)
29819 + *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
29821 + hist_err("Invalid field modifier: ", modifier);
29822 + field = ERR_PTR(-EINVAL);
29827 + if (strcmp(field_name, "common_timestamp") == 0) {
29828 + *flags |= HIST_FIELD_FL_TIMESTAMP;
29829 + hist_data->enable_timestamps = true;
29830 + if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
29831 + hist_data->attrs->ts_in_usecs = true;
29832 + } else if (strcmp(field_name, "cpu") == 0)
29833 + *flags |= HIST_FIELD_FL_CPU;
29835 + field = trace_find_event_field(file->event_call, field_name);
29836 + if (!field || !field->size) {
29837 + hist_err("Couldn't find field: ", field_name);
29838 + field = ERR_PTR(-EINVAL);
29848 +static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
29849 + struct hist_field *var_ref,
29852 + struct hist_field *alias = NULL;
29853 + unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
29855 + alias = create_hist_field(hist_data, NULL, flags, var_name);
29859 + alias->fn = var_ref->fn;
29860 + alias->operands[0] = var_ref;
29862 + if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
29863 + destroy_hist_field(alias, 0);
29870 +static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
29871 + struct trace_event_file *file, char *str,
29872 + unsigned long *flags, char *var_name)
29874 + char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
29875 + struct ftrace_event_field *field = NULL;
29876 + struct hist_field *hist_field = NULL;
29879 + s = strchr(str, '.');
29881 + s = strchr(++s, '.');
29883 + ref_system = strsep(&str, ".");
29888 + ref_event = strsep(&str, ".");
29897 + s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
29899 + hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
29900 + if (hist_field) {
29901 + hist_data->var_refs[hist_data->n_var_refs] = hist_field;
29902 + hist_field->var_ref_idx = hist_data->n_var_refs++;
29904 + hist_field = create_alias(hist_data, hist_field, var_name);
29905 + if (!hist_field) {
29910 + return hist_field;
29915 + field = parse_field(hist_data, file, str, flags);
29916 + if (IS_ERR(field)) {
29917 + ret = PTR_ERR(field);
29921 + hist_field = create_hist_field(hist_data, field, *flags, var_name);
29922 + if (!hist_field) {
29927 + return hist_field;
29929 + return ERR_PTR(ret);
29932 +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
29933 + struct trace_event_file *file,
29934 + char *str, unsigned long flags,
29935 + char *var_name, unsigned int level);
29937 +static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
29938 + struct trace_event_file *file,
29939 + char *str, unsigned long flags,
29940 + char *var_name, unsigned int level)
29942 + struct hist_field *operand1, *expr = NULL;
29943 + unsigned long operand_flags;
29947 + /* we support only -(xxx) i.e. explicit parens required */
29950 + hist_err("Too many subexpressions (3 max): ", str);
29955 + str++; /* skip leading '-' */
29957 + s = strchr(str, '(');
29965 + s = strrchr(str, ')');
29969 + ret = -EINVAL; /* no closing ')' */
29973 + flags |= HIST_FIELD_FL_EXPR;
29974 + expr = create_hist_field(hist_data, NULL, flags, var_name);
29980 + operand_flags = 0;
29981 + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
29982 + if (IS_ERR(operand1)) {
29983 + ret = PTR_ERR(operand1);
29987 + expr->flags |= operand1->flags &
29988 + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
29989 + expr->fn = hist_field_unary_minus;
29990 + expr->operands[0] = operand1;
29991 + expr->operator = FIELD_OP_UNARY_MINUS;
29992 + expr->name = expr_str(expr, 0);
29993 + expr->type = kstrdup(operand1->type, GFP_KERNEL);
29994 + if (!expr->type) {
30001 + destroy_hist_field(expr, 0);
30002 + return ERR_PTR(ret);
30005 +static int check_expr_operands(struct hist_field *operand1,
30006 + struct hist_field *operand2)
30008 + unsigned long operand1_flags = operand1->flags;
30009 + unsigned long operand2_flags = operand2->flags;
30011 + if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
30012 + (operand1_flags & HIST_FIELD_FL_ALIAS)) {
30013 + struct hist_field *var;
30015 + var = find_var_field(operand1->var.hist_data, operand1->name);
30018 + operand1_flags = var->flags;
30021 + if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
30022 + (operand2_flags & HIST_FIELD_FL_ALIAS)) {
30023 + struct hist_field *var;
30025 + var = find_var_field(operand2->var.hist_data, operand2->name);
30028 + operand2_flags = var->flags;
30031 + if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
30032 + (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
30033 + hist_err("Timestamp units in expression don't match", NULL);
30040 -static int create_val_field(struct hist_trigger_data *hist_data,
30041 - unsigned int val_idx,
30042 - struct trace_event_file *file,
30044 +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
30045 + struct trace_event_file *file,
30046 + char *str, unsigned long flags,
30047 + char *var_name, unsigned int level)
30049 - struct ftrace_event_field *field = NULL;
30050 - unsigned long flags = 0;
30051 - char *field_name;
30052 + struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
30053 + unsigned long operand_flags;
30054 + int field_op, ret = -EINVAL;
30055 + char *sep, *operand1_str;
30058 + hist_err("Too many subexpressions (3 max): ", str);
30059 + return ERR_PTR(-EINVAL);
30062 + field_op = contains_operator(str);
30064 + if (field_op == FIELD_OP_NONE)
30065 + return parse_atom(hist_data, file, str, &flags, var_name);
30067 + if (field_op == FIELD_OP_UNARY_MINUS)
30068 + return parse_unary(hist_data, file, str, flags, var_name, ++level);
30070 + switch (field_op) {
30071 + case FIELD_OP_MINUS:
30074 + case FIELD_OP_PLUS:
30081 + operand1_str = strsep(&str, sep);
30082 + if (!operand1_str || !str)
30085 + operand_flags = 0;
30086 + operand1 = parse_atom(hist_data, file, operand1_str,
30087 + &operand_flags, NULL);
30088 + if (IS_ERR(operand1)) {
30089 + ret = PTR_ERR(operand1);
30094 + /* rest of string could be another expression e.g. b+c in a+b+c */
30095 + operand_flags = 0;
30096 + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
30097 + if (IS_ERR(operand2)) {
30098 + ret = PTR_ERR(operand2);
30103 + ret = check_expr_operands(operand1, operand2);
30107 + flags |= HIST_FIELD_FL_EXPR;
30109 + flags |= operand1->flags &
30110 + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
30112 + expr = create_hist_field(hist_data, NULL, flags, var_name);
30118 + operand1->read_once = true;
30119 + operand2->read_once = true;
30121 + expr->operands[0] = operand1;
30122 + expr->operands[1] = operand2;
30123 + expr->operator = field_op;
30124 + expr->name = expr_str(expr, 0);
30125 + expr->type = kstrdup(operand1->type, GFP_KERNEL);
30126 + if (!expr->type) {
30131 + switch (field_op) {
30132 + case FIELD_OP_MINUS:
30133 + expr->fn = hist_field_minus;
30135 + case FIELD_OP_PLUS:
30136 + expr->fn = hist_field_plus;
30145 + destroy_hist_field(operand1, 0);
30146 + destroy_hist_field(operand2, 0);
30147 + destroy_hist_field(expr, 0);
30149 + return ERR_PTR(ret);
30152 +static char *find_trigger_filter(struct hist_trigger_data *hist_data,
30153 + struct trace_event_file *file)
30155 + struct event_trigger_data *test;
30157 + list_for_each_entry_rcu(test, &file->triggers, list) {
30158 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
30159 + if (test->private_data == hist_data)
30160 + return test->filter_str;
30167 +static struct event_command trigger_hist_cmd;
30168 +static int event_hist_trigger_func(struct event_command *cmd_ops,
30169 + struct trace_event_file *file,
30170 + char *glob, char *cmd, char *param);
30172 +static bool compatible_keys(struct hist_trigger_data *target_hist_data,
30173 + struct hist_trigger_data *hist_data,
30174 + unsigned int n_keys)
30176 + struct hist_field *target_hist_field, *hist_field;
30177 + unsigned int n, i, j;
30179 + if (hist_data->n_fields - hist_data->n_vals != n_keys)
30182 + i = hist_data->n_vals;
30183 + j = target_hist_data->n_vals;
30185 + for (n = 0; n < n_keys; n++) {
30186 + hist_field = hist_data->fields[i + n];
30187 + target_hist_field = target_hist_data->fields[j + n];
30189 + if (strcmp(hist_field->type, target_hist_field->type) != 0)
30191 + if (hist_field->size != target_hist_field->size)
30193 + if (hist_field->is_signed != target_hist_field->is_signed)
30200 +static struct hist_trigger_data *
30201 +find_compatible_hist(struct hist_trigger_data *target_hist_data,
30202 + struct trace_event_file *file)
30204 + struct hist_trigger_data *hist_data;
30205 + struct event_trigger_data *test;
30206 + unsigned int n_keys;
30208 + n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
30210 + list_for_each_entry_rcu(test, &file->triggers, list) {
30211 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
30212 + hist_data = test->private_data;
30214 + if (compatible_keys(target_hist_data, hist_data, n_keys))
30215 + return hist_data;
30222 +static struct trace_event_file *event_file(struct trace_array *tr,
30223 + char *system, char *event_name)
30225 + struct trace_event_file *file;
30227 + file = find_event_file(tr, system, event_name);
30229 + return ERR_PTR(-EINVAL);
30234 +static struct hist_field *
30235 +find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
30236 + char *system, char *event_name, char *field_name)
30238 + struct hist_field *event_var;
30239 + char *synthetic_name;
30241 + synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
30242 + if (!synthetic_name)
30243 + return ERR_PTR(-ENOMEM);
30245 + strcpy(synthetic_name, "synthetic_");
30246 + strcat(synthetic_name, field_name);
30248 + event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
30250 + kfree(synthetic_name);
30252 + return event_var;
30256 + * create_field_var_hist - Automatically create a histogram and var for a field
30257 + * @target_hist_data: The target hist trigger
30258 + * @subsys_name: Optional subsystem name
30259 + * @event_name: Optional event name
30260 + * @field_name: The name of the field (and the resulting variable)
30262 + * Hist trigger actions fetch data from variables, not directly from
30263 + * events. However, for convenience, users are allowed to directly
30264 + * specify an event field in an action, which will be automatically
30265 + * converted into a variable on their behalf.
30267 + * If a user specifies a field on an event that isn't the event the
30268 + * histogram currently being defined (the target event histogram), the
30269 + * only way that can be accomplished is if a new hist trigger is
30270 + * created and the field variable defined on that.
30272 + * This function creates a new histogram compatible with the target
30273 + * event (meaning a histogram with the same key as the target
30274 + * histogram), and creates a variable for the specified field, but
30275 + * with 'synthetic_' prepended to the variable name in order to avoid
30276 + * collision with normal field variables.
30278 + * Return: The variable created for the field.
30280 +static struct hist_field *
30281 +create_field_var_hist(struct hist_trigger_data *target_hist_data,
30282 + char *subsys_name, char *event_name, char *field_name)
30284 + struct trace_array *tr = target_hist_data->event_file->tr;
30285 + struct hist_field *event_var = ERR_PTR(-EINVAL);
30286 + struct hist_trigger_data *hist_data;
30287 + unsigned int i, n, first = true;
30288 + struct field_var_hist *var_hist;
30289 + struct trace_event_file *file;
30290 + struct hist_field *key_field;
30291 + char *saved_filter;
30295 + if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
30296 + hist_err_event("onmatch: Too many field variables defined: ",
30297 + subsys_name, event_name, field_name);
30298 + return ERR_PTR(-EINVAL);
30301 + file = event_file(tr, subsys_name, event_name);
30303 + if (IS_ERR(file)) {
30304 + hist_err_event("onmatch: Event file not found: ",
30305 + subsys_name, event_name, field_name);
30306 + ret = PTR_ERR(file);
30307 + return ERR_PTR(ret);
30311 + * Look for a histogram compatible with target. We'll use the
30312 + * found histogram specification to create a new matching
30313 + * histogram with our variable on it. target_hist_data is not
30314 + * yet a registered histogram so we can't use that.
30316 + hist_data = find_compatible_hist(target_hist_data, file);
30317 + if (!hist_data) {
30318 + hist_err_event("onmatch: Matching event histogram not found: ",
30319 + subsys_name, event_name, field_name);
30320 + return ERR_PTR(-EINVAL);
30323 + /* See if a synthetic field variable has already been created */
30324 + event_var = find_synthetic_field_var(target_hist_data, subsys_name,
30325 + event_name, field_name);
30326 + if (!IS_ERR_OR_NULL(event_var))
30327 + return event_var;
30329 + var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
30331 + return ERR_PTR(-ENOMEM);
30333 + cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
30336 + return ERR_PTR(-ENOMEM);
30339 + /* Use the same keys as the compatible histogram */
30340 + strcat(cmd, "keys=");
30342 + for_each_hist_key_field(i, hist_data) {
30343 + key_field = hist_data->fields[i];
30345 + strcat(cmd, ",");
30346 + strcat(cmd, key_field->field->name);
30350 + /* Create the synthetic field variable specification */
30351 + strcat(cmd, ":synthetic_");
30352 + strcat(cmd, field_name);
30353 + strcat(cmd, "=");
30354 + strcat(cmd, field_name);
30356 + /* Use the same filter as the compatible histogram */
30357 + saved_filter = find_trigger_filter(hist_data, file);
30358 + if (saved_filter) {
30359 + strcat(cmd, " if ");
30360 + strcat(cmd, saved_filter);
30363 + var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
30364 + if (!var_hist->cmd) {
30367 + return ERR_PTR(-ENOMEM);
30370 + /* Save the compatible histogram information */
30371 + var_hist->hist_data = hist_data;
30373 + /* Create the new histogram with our variable */
30374 + ret = event_hist_trigger_func(&trigger_hist_cmd, file,
30375 + "", "hist", cmd);
30378 + kfree(var_hist->cmd);
30380 + hist_err_event("onmatch: Couldn't create histogram for field: ",
30381 + subsys_name, event_name, field_name);
30382 + return ERR_PTR(ret);
30387 + /* If we can't find the variable, something went wrong */
30388 + event_var = find_synthetic_field_var(target_hist_data, subsys_name,
30389 + event_name, field_name);
30390 + if (IS_ERR_OR_NULL(event_var)) {
30391 + kfree(var_hist->cmd);
30393 + hist_err_event("onmatch: Couldn't find synthetic variable: ",
30394 + subsys_name, event_name, field_name);
30395 + return ERR_PTR(-EINVAL);
30398 + n = target_hist_data->n_field_var_hists;
30399 + target_hist_data->field_var_hists[n] = var_hist;
30400 + target_hist_data->n_field_var_hists++;
30402 + return event_var;
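As a sketch of the command assembled above (key, field and filter names are hypothetical), for a compatible histogram keyed on 'pid' with filter 'prio < 100', requesting the field 'prev_prio' yields roughly

    keys=pid:synthetic_prev_prio=prev_prio if prio < 100

which is registered on the other event through event_hist_trigger_func() and then looked up again via find_synthetic_field_var().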
30405 +static struct hist_field *
30406 +find_target_event_var(struct hist_trigger_data *hist_data,
30407 + char *subsys_name, char *event_name, char *var_name)
30409 + struct trace_event_file *file = hist_data->event_file;
30410 + struct hist_field *hist_field = NULL;
30412 + if (subsys_name) {
30413 + struct trace_event_call *call;
30418 + call = file->event_call;
30420 + if (strcmp(subsys_name, call->class->system) != 0)
30423 + if (strcmp(event_name, trace_event_name(call)) != 0)
30427 + hist_field = find_var_field(hist_data, var_name);
30429 + return hist_field;
30432 +static inline void __update_field_vars(struct tracing_map_elt *elt,
30433 + struct ring_buffer_event *rbe,
30435 + struct field_var **field_vars,
30436 + unsigned int n_field_vars,
30437 + unsigned int field_var_str_start)
30439 + struct hist_elt_data *elt_data = elt->private_data;
30440 + unsigned int i, j, var_idx;
30443 + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
30444 + struct field_var *field_var = field_vars[i];
30445 + struct hist_field *var = field_var->var;
30446 + struct hist_field *val = field_var->val;
30448 + var_val = val->fn(val, elt, rbe, rec);
30449 + var_idx = var->var.idx;
30451 + if (val->flags & HIST_FIELD_FL_STRING) {
30452 + char *str = elt_data->field_var_str[j++];
30453 + char *val_str = (char *)(uintptr_t)var_val;
30455 + strscpy(str, val_str, STR_VAR_LEN_MAX);
30456 + var_val = (u64)(uintptr_t)str;
30458 + tracing_map_set_var(elt, var_idx, var_val);
30462 +static void update_field_vars(struct hist_trigger_data *hist_data,
30463 + struct tracing_map_elt *elt,
30464 + struct ring_buffer_event *rbe,
30467 + __update_field_vars(elt, rbe, rec, hist_data->field_vars,
30468 + hist_data->n_field_vars, 0);
30471 +static void update_max_vars(struct hist_trigger_data *hist_data,
30472 + struct tracing_map_elt *elt,
30473 + struct ring_buffer_event *rbe,
30476 + __update_field_vars(elt, rbe, rec, hist_data->max_vars,
30477 + hist_data->n_max_vars, hist_data->n_field_var_str);
30480 +static struct hist_field *create_var(struct hist_trigger_data *hist_data,
30481 + struct trace_event_file *file,
30482 + char *name, int size, const char *type)
30484 + struct hist_field *var;
30487 + if (find_var(hist_data, file, name) && !hist_data->remove) {
30488 + var = ERR_PTR(-EINVAL);
30492 + var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
30494 + var = ERR_PTR(-ENOMEM);
30498 + idx = tracing_map_add_var(hist_data->map);
30501 + var = ERR_PTR(-EINVAL);
30505 + var->flags = HIST_FIELD_FL_VAR;
30506 + var->var.idx = idx;
30507 + var->var.hist_data = var->hist_data = hist_data;
30508 + var->size = size;
30509 + var->var.name = kstrdup(name, GFP_KERNEL);
30510 + var->type = kstrdup(type, GFP_KERNEL);
30511 + if (!var->var.name || !var->type) {
30512 + kfree(var->var.name);
30513 + kfree(var->type);
30515 + var = ERR_PTR(-ENOMEM);
30521 +static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
30522 + struct trace_event_file *file,
30523 + char *field_name)
30525 + struct hist_field *val = NULL, *var = NULL;
30526 + unsigned long flags = HIST_FIELD_FL_VAR;
30527 + struct field_var *field_var;
30530 - if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
30531 + if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
30532 + hist_err("Too many field variables defined: ", field_name);
30537 + val = parse_atom(hist_data, file, field_name, &flags, NULL);
30538 + if (IS_ERR(val)) {
30539 + hist_err("Couldn't parse field variable: ", field_name);
30540 + ret = PTR_ERR(val);
30544 + var = create_var(hist_data, file, field_name, val->size, val->type);
30545 + if (IS_ERR(var)) {
30546 + hist_err("Couldn't create or find variable: ", field_name);
30548 + ret = PTR_ERR(var);
30552 + field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
30553 + if (!field_var) {
30560 + field_var->var = var;
30561 + field_var->val = val;
30563 + return field_var;
30565 + field_var = ERR_PTR(ret);
30570 + * create_target_field_var - Automatically create a variable for a field
30571 + * @target_hist_data: The target hist trigger
30572 + * @subsys_name: Optional subsystem name
30573 + * @event_name: Optional event name
30574 + * @var_name: The name of the field (and the resulting variable)
30576 + * Hist trigger actions fetch data from variables, not directly from
30577 + * events. However, for convenience, users are allowed to directly
30578 + * specify an event field in an action, which will be automatically
30579 + * converted into a variable on their behalf.
30581 + * This function creates a field variable with the name var_name on
30582 + * the hist trigger currently being defined on the target event. If
30583 + * subsys_name and event_name are specified, this function simply
30584 + * verifies that they do in fact match the target event subsystem and
30587 + * Return: The variable created for the field.
30589 +static struct field_var *
30590 +create_target_field_var(struct hist_trigger_data *target_hist_data,
30591 + char *subsys_name, char *event_name, char *var_name)
30593 + struct trace_event_file *file = target_hist_data->event_file;
30595 + if (subsys_name) {
30596 + struct trace_event_call *call;
30601 + call = file->event_call;
30603 + if (strcmp(subsys_name, call->class->system) != 0)
30606 + if (strcmp(event_name, trace_event_name(call)) != 0)
30610 + return create_field_var(target_hist_data, file, var_name);
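As a hypothetical example of the above: for a trigger being defined on sched/sched_switch, create_target_field_var(hist_data, "sched", "sched_switch", "prev_prio") confirms that the qualifier matches the target event and creates a 'prev_prio' field variable on it, whereas a mismatching system.event makes it bail out so that onmatch_create_field_var() can fall back to create_field_var_hist().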
30613 +static void onmax_print(struct seq_file *m,
30614 + struct hist_trigger_data *hist_data,
30615 + struct tracing_map_elt *elt,
30616 + struct action_data *data)
30618 + unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
30620 + seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
30622 + for (i = 0; i < hist_data->n_max_vars; i++) {
30623 + struct hist_field *save_val = hist_data->max_vars[i]->val;
30624 + struct hist_field *save_var = hist_data->max_vars[i]->var;
30627 + save_var_idx = save_var->var.idx;
30629 + val = tracing_map_read_var(elt, save_var_idx);
30631 + if (save_val->flags & HIST_FIELD_FL_STRING) {
30632 + seq_printf(m, " %s: %-32s", save_var->var.name,
30633 + (char *)(uintptr_t)(val));
30635 + seq_printf(m, " %s: %10llu", save_var->var.name, val);
30639 +static void onmax_save(struct hist_trigger_data *hist_data,
30640 + struct tracing_map_elt *elt, void *rec,
30641 + struct ring_buffer_event *rbe,
30642 + struct action_data *data, u64 *var_ref_vals)
30644 + unsigned int max_idx = data->onmax.max_var->var.idx;
30645 + unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
30647 + u64 var_val, max_val;
30649 + var_val = var_ref_vals[max_var_ref_idx];
30650 + max_val = tracing_map_read_var(elt, max_idx);
30652 + if (var_val <= max_val)
30655 + tracing_map_set_var(elt, max_idx, var_val);
30657 + update_max_vars(hist_data, elt, rbe, rec);
30660 +static void onmax_destroy(struct action_data *data)
30664 + destroy_hist_field(data->onmax.max_var, 0);
30665 + destroy_hist_field(data->onmax.var, 0);
30667 + kfree(data->onmax.var_str);
30668 + kfree(data->onmax.fn_name);
30670 + for (i = 0; i < data->n_params; i++)
30671 + kfree(data->params[i]);
30676 +static int onmax_create(struct hist_trigger_data *hist_data,
30677 + struct action_data *data)
30679 + struct trace_event_file *file = hist_data->event_file;
30680 + struct hist_field *var_field, *ref_field, *max_var;
30681 + unsigned int var_ref_idx = hist_data->n_var_refs;
30682 + struct field_var *field_var;
30683 + char *onmax_var_str, *param;
30684 + unsigned long flags;
30688 + onmax_var_str = data->onmax.var_str;
30689 + if (onmax_var_str[0] != '$') {
30690 + hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
30695 - field_name = strsep(&field_str, ".");
30697 - if (strcmp(field_str, "hex") == 0)
30698 - flags |= HIST_FIELD_FL_HEX;
30700 + var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
30701 + if (!var_field) {
30702 + hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
30706 + flags = HIST_FIELD_FL_VAR_REF;
30707 + ref_field = create_hist_field(hist_data, NULL, flags, NULL);
30711 + if (init_var_ref(ref_field, var_field, NULL, NULL)) {
30712 + destroy_hist_field(ref_field, 0);
30716 + hist_data->var_refs[hist_data->n_var_refs] = ref_field;
30717 + ref_field->var_ref_idx = hist_data->n_var_refs++;
30718 + data->onmax.var = ref_field;
30720 + data->fn = onmax_save;
30721 + data->onmax.max_var_ref_idx = var_ref_idx;
30722 + max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
30723 + if (IS_ERR(max_var)) {
30724 + hist_err("onmax: Couldn't create onmax variable: ", "max");
30725 + ret = PTR_ERR(max_var);
30728 + data->onmax.max_var = max_var;
30730 + for (i = 0; i < data->n_params; i++) {
30731 + param = kstrdup(data->params[i], GFP_KERNEL);
30737 + field_var = create_target_field_var(hist_data, NULL, NULL, param);
30738 + if (IS_ERR(field_var)) {
30739 + hist_err("onmax: Couldn't create field variable: ", param);
30740 + ret = PTR_ERR(field_var);
30745 + hist_data->max_vars[hist_data->n_max_vars++] = field_var;
30746 + if (field_var->val->flags & HIST_FIELD_FL_STRING)
30747 + hist_data->n_max_var_str++;
30755 +static int parse_action_params(char *params, struct action_data *data)
30757 + char *param, *saved_param;
30761 + if (data->n_params >= SYNTH_FIELDS_MAX)
30764 + param = strsep(&params, ",");
30770 + param = strstrip(param);
30771 + if (strlen(param) < 2) {
30772 + hist_err("Invalid action param: ", param);
30777 + saved_param = kstrdup(param, GFP_KERNEL);
30778 + if (!saved_param) {
30783 + data->params[data->n_params++] = saved_param;
30789 - field = trace_find_event_field(file->event_call, field_name);
30790 - if (!field || !field->size) {
30791 +static struct action_data *onmax_parse(char *str)
30793 + char *onmax_fn_name, *onmax_var_str;
30794 + struct action_data *data;
30795 + int ret = -EINVAL;
30797 + data = kzalloc(sizeof(*data), GFP_KERNEL);
30799 + return ERR_PTR(-ENOMEM);
30801 + onmax_var_str = strsep(&str, ")");
30802 + if (!onmax_var_str || !str) {
30808 - hist_data->fields[val_idx] = create_hist_field(field, flags);
30809 - if (!hist_data->fields[val_idx]) {
30810 + data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
30811 + if (!data->onmax.var_str) {
30816 + strsep(&str, ".");
30820 + onmax_fn_name = strsep(&str, "(");
30821 + if (!onmax_fn_name || !str)
30824 + if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
30825 + char *params = strsep(&str, ")");
30832 + ret = parse_action_params(params, data);
30838 + data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
30839 + if (!data->onmax.fn_name) {
30846 + onmax_destroy(data);
30847 + data = ERR_PTR(ret);
30851 +static void onmatch_destroy(struct action_data *data)
30855 + mutex_lock(&synth_event_mutex);
30857 + kfree(data->onmatch.match_event);
30858 + kfree(data->onmatch.match_event_system);
30859 + kfree(data->onmatch.synth_event_name);
30861 + for (i = 0; i < data->n_params; i++)
30862 + kfree(data->params[i]);
30864 + if (data->onmatch.synth_event)
30865 + data->onmatch.synth_event->ref--;
30869 + mutex_unlock(&synth_event_mutex);
30872 +static void destroy_field_var(struct field_var *field_var)
30877 + destroy_hist_field(field_var->var, 0);
30878 + destroy_hist_field(field_var->val, 0);
30880 + kfree(field_var);
30883 +static void destroy_field_vars(struct hist_trigger_data *hist_data)
30887 + for (i = 0; i < hist_data->n_field_vars; i++)
30888 + destroy_field_var(hist_data->field_vars[i]);
30891 +static void save_field_var(struct hist_trigger_data *hist_data,
30892 + struct field_var *field_var)
30894 + hist_data->field_vars[hist_data->n_field_vars++] = field_var;
30896 + if (field_var->val->flags & HIST_FIELD_FL_STRING)
30897 + hist_data->n_field_var_str++;
30901 +static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
30905 + for (i = 0; i < hist_data->n_synth_var_refs; i++)
30906 + destroy_hist_field(hist_data->synth_var_refs[i], 0);
30909 +static void save_synth_var_ref(struct hist_trigger_data *hist_data,
30910 + struct hist_field *var_ref)
30912 + hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
30914 + hist_data->var_refs[hist_data->n_var_refs] = var_ref;
30915 + var_ref->var_ref_idx = hist_data->n_var_refs++;
30918 +static int check_synth_field(struct synth_event *event,
30919 + struct hist_field *hist_field,
30920 + unsigned int field_pos)
30922 + struct synth_field *field;
30924 + if (field_pos >= event->n_fields)
30927 + field = event->fields[field_pos];
30929 + if (strcmp(field->type, hist_field->type) != 0)
30935 +static struct hist_field *
30936 +onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
30937 + char *system, char *event, char *var)
30939 + struct hist_field *hist_field;
30941 + var++; /* skip '$' */
30943 + hist_field = find_target_event_var(hist_data, system, event, var);
30944 + if (!hist_field) {
30946 + system = data->onmatch.match_event_system;
30947 + event = data->onmatch.match_event;
30950 + hist_field = find_event_var(hist_data, system, event, var);
30954 + hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
30956 + return hist_field;
30959 +static struct hist_field *
30960 +onmatch_create_field_var(struct hist_trigger_data *hist_data,
30961 + struct action_data *data, char *system,
30962 + char *event, char *var)
30964 + struct hist_field *hist_field = NULL;
30965 + struct field_var *field_var;
30968 + * First try to create a field var on the target event (the event
30969 + * currently being defined). This will create a variable for
30970 + * unqualified fields on the target event, or if qualified,
30971 + * target fields that have qualified names matching the target.
30973 + field_var = create_target_field_var(hist_data, system, event, var);
30975 + if (field_var && !IS_ERR(field_var)) {
30976 + save_field_var(hist_data, field_var);
30977 + hist_field = field_var->var;
30979 + field_var = NULL;
30981 + * If no explicit system.event is specified, default to
30982 + * looking for fields on the onmatch(system.event.xxx)
30986 + system = data->onmatch.match_event_system;
30987 + event = data->onmatch.match_event;
30991 + * At this point, we're looking at a field on another
30992 + * event. Because we can't modify a hist trigger on
30993 + * another event to add a variable for a field, we need
30994 + * to create a new trigger on that event and create the
30995 + * variable at the same time.
30997 + hist_field = create_field_var_hist(hist_data, system, event, var);
30998 + if (IS_ERR(hist_field))
31002 + return hist_field;
31004 + destroy_field_var(field_var);
31005 + hist_field = NULL;
31009 +static int onmatch_create(struct hist_trigger_data *hist_data,
31010 + struct trace_event_file *file,
31011 + struct action_data *data)
31013 + char *event_name, *param, *system = NULL;
31014 + struct hist_field *hist_field, *var_ref;
31015 + unsigned int i, var_ref_idx;
31016 + unsigned int field_pos = 0;
31017 + struct synth_event *event;
31020 + mutex_lock(&synth_event_mutex);
31021 + event = find_synth_event(data->onmatch.synth_event_name);
31023 + hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
31024 + mutex_unlock(&synth_event_mutex);
31028 + mutex_unlock(&synth_event_mutex);
31030 + var_ref_idx = hist_data->n_var_refs;
31032 + for (i = 0; i < data->n_params; i++) {
31035 + p = param = kstrdup(data->params[i], GFP_KERNEL);
31041 + system = strsep(&param, ".");
31043 + param = (char *)system;
31044 + system = event_name = NULL;
31046 + event_name = strsep(&param, ".");
31054 + if (param[0] == '$')
31055 + hist_field = onmatch_find_var(hist_data, data, system,
31056 + event_name, param);
31058 + hist_field = onmatch_create_field_var(hist_data, data,
31063 + if (!hist_field) {
31069 + if (check_synth_field(event, hist_field, field_pos) == 0) {
31070 + var_ref = create_var_ref(hist_field, system, event_name);
31077 + save_synth_var_ref(hist_data, var_ref);
31083 + hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
31084 + system, event_name, param);
31090 + if (field_pos != event->n_fields) {
31091 + hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
31096 + data->fn = action_trace;
31097 + data->onmatch.synth_event = event;
31098 + data->onmatch.var_ref_idx = var_ref_idx;
31102 + mutex_lock(&synth_event_mutex);
31104 + mutex_unlock(&synth_event_mutex);
31109 +static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
31111 + char *match_event, *match_event_system;
31112 + char *synth_event_name, *params;
31113 + struct action_data *data;
31114 + int ret = -EINVAL;
31116 + data = kzalloc(sizeof(*data), GFP_KERNEL);
31118 + return ERR_PTR(-ENOMEM);
31120 + match_event = strsep(&str, ")");
31121 + if (!match_event || !str) {
31122 + hist_err("onmatch: Missing closing paren: ", match_event);
31126 + match_event_system = strsep(&match_event, ".");
31127 + if (!match_event) {
31128 + hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
31132 + if (IS_ERR(event_file(tr, match_event_system, match_event))) {
31133 + hist_err_event("onmatch: Invalid subsystem or event name: ",
31134 + match_event_system, match_event, NULL);
31138 + data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
31139 + if (!data->onmatch.match_event) {
31144 + data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
31145 + if (!data->onmatch.match_event_system) {
31150 + strsep(&str, ".");
31152 + hist_err("onmatch: Missing . after onmatch(): ", str);
31156 + synth_event_name = strsep(&str, "(");
31157 + if (!synth_event_name || !str) {
31158 + hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
31162 + data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
31163 + if (!data->onmatch.synth_event_name) {
31168 + params = strsep(&str, ")");
31169 + if (!params || !str || (str && strlen(str))) {
31170 + hist_err("onmatch: Missing closing paramlist paren: ", params);
31174 + ret = parse_action_params(params, data);
31180 + onmatch_destroy(data);
31181 + data = ERR_PTR(ret);
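For reference, the two action parsers above handle actions of the form (event, variable and field names are illustrative)

    onmax($wakeup_lat).save(comm,prio)
    onmatch(sched.sched_switch).wakeup_latency($wakeup_lat,next_pid)

onmax_parse() splits out the '$wakeup_lat' variable and the 'save(...)' parameter list, onmatch_parse() splits out the 'sched.sched_switch' match event, the 'wakeup_latency' synthetic event name and its parameters, and both hand the comma-separated parameters to parse_action_params().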
31185 +static int create_hitcount_val(struct hist_trigger_data *hist_data)
31187 + hist_data->fields[HITCOUNT_IDX] =
31188 + create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
31189 + if (!hist_data->fields[HITCOUNT_IDX])
31192 + hist_data->n_vals++;
31193 + hist_data->n_fields++;
31195 + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
31201 +static int __create_val_field(struct hist_trigger_data *hist_data,
31202 + unsigned int val_idx,
31203 + struct trace_event_file *file,
31204 + char *var_name, char *field_str,
31205 + unsigned long flags)
31207 + struct hist_field *hist_field;
31210 + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
31211 + if (IS_ERR(hist_field)) {
31212 + ret = PTR_ERR(hist_field);
31216 + hist_data->fields[val_idx] = hist_field;
31218 ++hist_data->n_vals;
31219 + ++hist_data->n_fields;
31221 - if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
31222 + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
31228 +static int create_val_field(struct hist_trigger_data *hist_data,
31229 + unsigned int val_idx,
31230 + struct trace_event_file *file,
31233 + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
31236 + return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
31239 +static int create_var_field(struct hist_trigger_data *hist_data,
31240 + unsigned int val_idx,
31241 + struct trace_event_file *file,
31242 + char *var_name, char *expr_str)
31244 + unsigned long flags = 0;
31246 + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
31249 + if (find_var(hist_data, file, var_name) && !hist_data->remove) {
31250 + hist_err("Variable already defined: ", var_name);
31254 + flags |= HIST_FIELD_FL_VAR;
31255 + hist_data->n_vars++;
31256 + if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
31259 + return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
31262 static int create_val_fields(struct hist_trigger_data *hist_data,
31263 struct trace_event_file *file)
31265 char *fields_str, *field_str;
31266 - unsigned int i, j;
31267 + unsigned int i, j = 1;
31270 ret = create_hitcount_val(hist_data);
31271 @@ -493,12 +3912,15 @@
31272 field_str = strsep(&fields_str, ",");
31276 if (strcmp(field_str, "hitcount") == 0)
31279 ret = create_val_field(hist_data, j++, file, field_str);
31284 if (fields_str && (strcmp(fields_str, "hitcount") != 0))
31287 @@ -511,12 +3933,13 @@
31288 struct trace_event_file *file,
31291 - struct ftrace_event_field *field = NULL;
31292 + struct hist_field *hist_field = NULL;
31294 unsigned long flags = 0;
31295 unsigned int key_size;
31298 - if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
31299 + if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
31302 flags |= HIST_FIELD_FL_KEY;
31303 @@ -524,57 +3947,40 @@
31304 if (strcmp(field_str, "stacktrace") == 0) {
31305 flags |= HIST_FIELD_FL_STACKTRACE;
31306 key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
31307 + hist_field = create_hist_field(hist_data, NULL, flags, NULL);
31309 - char *field_name = strsep(&field_str, ".");
31312 - if (strcmp(field_str, "hex") == 0)
31313 - flags |= HIST_FIELD_FL_HEX;
31314 - else if (strcmp(field_str, "sym") == 0)
31315 - flags |= HIST_FIELD_FL_SYM;
31316 - else if (strcmp(field_str, "sym-offset") == 0)
31317 - flags |= HIST_FIELD_FL_SYM_OFFSET;
31318 - else if ((strcmp(field_str, "execname") == 0) &&
31319 - (strcmp(field_name, "common_pid") == 0))
31320 - flags |= HIST_FIELD_FL_EXECNAME;
31321 - else if (strcmp(field_str, "syscall") == 0)
31322 - flags |= HIST_FIELD_FL_SYSCALL;
31323 - else if (strcmp(field_str, "log2") == 0)
31324 - flags |= HIST_FIELD_FL_LOG2;
31329 + hist_field = parse_expr(hist_data, file, field_str, flags,
31331 + if (IS_ERR(hist_field)) {
31332 + ret = PTR_ERR(hist_field);
31336 - field = trace_find_event_field(file->event_call, field_name);
31337 - if (!field || !field->size) {
31338 + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
31339 + hist_err("Using variable references as keys not supported: ", field_str);
31340 + destroy_hist_field(hist_field, 0);
31345 - if (is_string_field(field))
31346 - key_size = MAX_FILTER_STR_VAL;
31348 - key_size = field->size;
31349 + key_size = hist_field->size;
31352 - hist_data->fields[key_idx] = create_hist_field(field, flags);
31353 - if (!hist_data->fields[key_idx]) {
31357 + hist_data->fields[key_idx] = hist_field;
31359 key_size = ALIGN(key_size, sizeof(u64));
31360 hist_data->fields[key_idx]->size = key_size;
31361 hist_data->fields[key_idx]->offset = key_offset;
31363 hist_data->key_size += key_size;
31365 if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
31370 hist_data->n_keys++;
31371 + hist_data->n_fields++;
31373 if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
31375 @@ -618,21 +4024,113 @@
31379 +static int create_var_fields(struct hist_trigger_data *hist_data,
31380 + struct trace_event_file *file)
31382 + unsigned int i, j = hist_data->n_vals;
31385 + unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
31387 + for (i = 0; i < n_vars; i++) {
31388 + char *var_name = hist_data->attrs->var_defs.name[i];
31389 + char *expr = hist_data->attrs->var_defs.expr[i];
31391 + ret = create_var_field(hist_data, j++, file, var_name, expr);
31399 +static void free_var_defs(struct hist_trigger_data *hist_data)
31403 + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
31404 + kfree(hist_data->attrs->var_defs.name[i]);
31405 + kfree(hist_data->attrs->var_defs.expr[i]);
31408 + hist_data->attrs->var_defs.n_vars = 0;
31411 +static int parse_var_defs(struct hist_trigger_data *hist_data)
31413 + char *s, *str, *var_name, *field_str;
31414 + unsigned int i, j, n_vars = 0;
31417 + for (i = 0; i < hist_data->attrs->n_assignments; i++) {
31418 + str = hist_data->attrs->assignment_str[i];
31419 + for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
31420 + field_str = strsep(&str, ",");
31424 + var_name = strsep(&field_str, "=");
31425 + if (!var_name || !field_str) {
31426 + hist_err("Malformed assignment: ", var_name);
31431 + if (n_vars == TRACING_MAP_VARS_MAX) {
31432 + hist_err("Too many variables defined: ", var_name);
31437 + s = kstrdup(var_name, GFP_KERNEL);
31442 + hist_data->attrs->var_defs.name[n_vars] = s;
31444 + s = kstrdup(field_str, GFP_KERNEL);
31446 + kfree(hist_data->attrs->var_defs.name[n_vars]);
31450 + hist_data->attrs->var_defs.expr[n_vars++] = s;
31452 + hist_data->attrs->var_defs.n_vars = n_vars;
31458 + free_var_defs(hist_data);
31463 static int create_hist_fields(struct hist_trigger_data *hist_data,
31464 struct trace_event_file *file)
31468 + ret = parse_var_defs(hist_data);
31472 ret = create_val_fields(hist_data, file);
31476 - ret = create_key_fields(hist_data, file);
31477 + ret = create_var_fields(hist_data, file);
31481 - hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
31482 + ret = create_key_fields(hist_data, file);
31486 + free_var_defs(hist_data);
31491 @@ -653,10 +4151,9 @@
31492 static int create_sort_keys(struct hist_trigger_data *hist_data)
31494 char *fields_str = hist_data->attrs->sort_key_str;
31495 - struct ftrace_event_field *field = NULL;
31496 struct tracing_map_sort_key *sort_key;
31497 int descending, ret = 0;
31498 - unsigned int i, j;
31499 + unsigned int i, j, k;
31501 hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
31503 @@ -670,7 +4167,9 @@
31506 for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
31507 + struct hist_field *hist_field;
31508 char *field_str, *field_name;
31509 + const char *test_name;
31511 sort_key = &hist_data->sort_keys[i];
31513 @@ -702,10 +4201,19 @@
31517 - for (j = 1; j < hist_data->n_fields; j++) {
31518 - field = hist_data->fields[j]->field;
31519 - if (field && (strcmp(field_name, field->name) == 0)) {
31520 - sort_key->field_idx = j;
31521 + for (j = 1, k = 1; j < hist_data->n_fields; j++) {
31522 + unsigned int idx;
31524 + hist_field = hist_data->fields[j];
31525 + if (hist_field->flags & HIST_FIELD_FL_VAR)
31530 + test_name = hist_field_name(hist_field, 0);
31532 + if (strcmp(field_name, test_name) == 0) {
31533 + sort_key->field_idx = idx;
31534 descending = is_descending(field_str);
31535 if (descending < 0) {
31537 @@ -720,16 +4228,230 @@
31542 hist_data->n_sort_keys = i;
31547 +static void destroy_actions(struct hist_trigger_data *hist_data)
31551 + for (i = 0; i < hist_data->n_actions; i++) {
31552 + struct action_data *data = hist_data->actions[i];
31554 + if (data->fn == action_trace)
31555 + onmatch_destroy(data);
31556 + else if (data->fn == onmax_save)
31557 + onmax_destroy(data);
31563 +static int parse_actions(struct hist_trigger_data *hist_data)
31565 + struct trace_array *tr = hist_data->event_file->tr;
31566 + struct action_data *data;
31571 + for (i = 0; i < hist_data->attrs->n_actions; i++) {
31572 + str = hist_data->attrs->action_str[i];
31574 + if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
31575 + char *action_str = str + strlen("onmatch(");
31577 + data = onmatch_parse(tr, action_str);
31578 + if (IS_ERR(data)) {
31579 + ret = PTR_ERR(data);
31582 + data->fn = action_trace;
31583 + } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
31584 + char *action_str = str + strlen("onmax(");
31586 + data = onmax_parse(action_str);
31587 + if (IS_ERR(data)) {
31588 + ret = PTR_ERR(data);
31591 + data->fn = onmax_save;
31597 + hist_data->actions[hist_data->n_actions++] = data;
31603 +static int create_actions(struct hist_trigger_data *hist_data,
31604 + struct trace_event_file *file)
31606 + struct action_data *data;
31610 + for (i = 0; i < hist_data->attrs->n_actions; i++) {
31611 + data = hist_data->actions[i];
31613 + if (data->fn == action_trace) {
31614 + ret = onmatch_create(hist_data, file, data);
31617 + } else if (data->fn == onmax_save) {
31618 + ret = onmax_create(hist_data, data);
31627 +static void print_actions(struct seq_file *m,
31628 + struct hist_trigger_data *hist_data,
31629 + struct tracing_map_elt *elt)
31633 + for (i = 0; i < hist_data->n_actions; i++) {
31634 + struct action_data *data = hist_data->actions[i];
31636 + if (data->fn == onmax_save)
31637 + onmax_print(m, hist_data, elt, data);
31641 +static void print_onmax_spec(struct seq_file *m,
31642 + struct hist_trigger_data *hist_data,
31643 + struct action_data *data)
31647 + seq_puts(m, ":onmax(");
31648 + seq_printf(m, "%s", data->onmax.var_str);
31649 + seq_printf(m, ").%s(", data->onmax.fn_name);
31651 + for (i = 0; i < hist_data->n_max_vars; i++) {
31652 + seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
31653 + if (i < hist_data->n_max_vars - 1)
31654 + seq_puts(m, ",");
31656 + seq_puts(m, ")");
31659 +static void print_onmatch_spec(struct seq_file *m,
31660 + struct hist_trigger_data *hist_data,
31661 + struct action_data *data)
31665 + seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
31666 + data->onmatch.match_event);
31668 + seq_printf(m, "%s(", data->onmatch.synth_event->name);
31670 + for (i = 0; i < data->n_params; i++) {
31672 + seq_puts(m, ",");
31673 + seq_printf(m, "%s", data->params[i]);
31676 + seq_puts(m, ")");
31679 +static bool actions_match(struct hist_trigger_data *hist_data,
31680 + struct hist_trigger_data *hist_data_test)
31682 + unsigned int i, j;
31684 + if (hist_data->n_actions != hist_data_test->n_actions)
31687 + for (i = 0; i < hist_data->n_actions; i++) {
31688 + struct action_data *data = hist_data->actions[i];
31689 + struct action_data *data_test = hist_data_test->actions[i];
31691 + if (data->fn != data_test->fn)
31694 + if (data->n_params != data_test->n_params)
31697 + for (j = 0; j < data->n_params; j++) {
31698 + if (strcmp(data->params[j], data_test->params[j]) != 0)
31702 + if (data->fn == action_trace) {
31703 + if (strcmp(data->onmatch.synth_event_name,
31704 + data_test->onmatch.synth_event_name) != 0)
31706 + if (strcmp(data->onmatch.match_event_system,
31707 + data_test->onmatch.match_event_system) != 0)
31709 + if (strcmp(data->onmatch.match_event,
31710 + data_test->onmatch.match_event) != 0)
31712 + } else if (data->fn == onmax_save) {
31713 + if (strcmp(data->onmax.var_str,
31714 + data_test->onmax.var_str) != 0)
31716 + if (strcmp(data->onmax.fn_name,
31717 + data_test->onmax.fn_name) != 0)
31726 +static void print_actions_spec(struct seq_file *m,
31727 + struct hist_trigger_data *hist_data)
31731 + for (i = 0; i < hist_data->n_actions; i++) {
31732 + struct action_data *data = hist_data->actions[i];
31734 + if (data->fn == action_trace)
31735 + print_onmatch_spec(m, hist_data, data);
31736 + else if (data->fn == onmax_save)
31737 + print_onmax_spec(m, hist_data, data);
31741 +static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
31745 + for (i = 0; i < hist_data->n_field_var_hists; i++) {
31746 + kfree(hist_data->field_var_hists[i]->cmd);
31747 + kfree(hist_data->field_var_hists[i]);
31751 static void destroy_hist_data(struct hist_trigger_data *hist_data)
31756 destroy_hist_trigger_attrs(hist_data->attrs);
31757 destroy_hist_fields(hist_data);
31758 tracing_map_destroy(hist_data->map);
31760 + destroy_actions(hist_data);
31761 + destroy_field_vars(hist_data);
31762 + destroy_field_var_hists(hist_data);
31763 + destroy_synth_var_refs(hist_data);
31768 @@ -738,7 +4460,7 @@
31769 struct tracing_map *map = hist_data->map;
31770 struct ftrace_event_field *field;
31771 struct hist_field *hist_field;
31775 for_each_hist_field(i, hist_data) {
31776 hist_field = hist_data->fields[i];
31777 @@ -749,6 +4471,9 @@
31779 if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
31780 cmp_fn = tracing_map_cmp_none;
31782 + cmp_fn = tracing_map_cmp_num(hist_field->size,
31783 + hist_field->is_signed);
31784 else if (is_string_field(field))
31785 cmp_fn = tracing_map_cmp_string;
31787 @@ -757,36 +4482,29 @@
31788 idx = tracing_map_add_key_field(map,
31789 hist_field->offset,
31793 + } else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
31794 idx = tracing_map_add_sum_field(map);
31803 -static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
31805 - struct hist_field *key_field;
31808 - for_each_hist_key_field(i, hist_data) {
31809 - key_field = hist_data->fields[i];
31811 - if (key_field->flags & HIST_FIELD_FL_EXECNAME)
31813 + if (hist_field->flags & HIST_FIELD_FL_VAR) {
31814 + idx = tracing_map_add_var(map);
31817 + hist_field->var.idx = idx;
31818 + hist_field->var.hist_data = hist_data;
31826 static struct hist_trigger_data *
31827 create_hist_data(unsigned int map_bits,
31828 struct hist_trigger_attrs *attrs,
31829 - struct trace_event_file *file)
31830 + struct trace_event_file *file,
31833 const struct tracing_map_ops *map_ops = NULL;
31834 struct hist_trigger_data *hist_data;
31835 @@ -797,6 +4515,12 @@
31836 return ERR_PTR(-ENOMEM);
31838 hist_data->attrs = attrs;
31839 + hist_data->remove = remove;
31840 + hist_data->event_file = file;
31842 + ret = parse_actions(hist_data);
31846 ret = create_hist_fields(hist_data, file);
31848 @@ -806,8 +4530,7 @@
31852 - if (need_tracing_map_ops(hist_data))
31853 - map_ops = &hist_trigger_elt_comm_ops;
31854 + map_ops = &hist_trigger_elt_data_ops;
31856 hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
31857 map_ops, hist_data);
31858 @@ -820,12 +4543,6 @@
31859 ret = create_tracing_map_fields(hist_data);
31863 - ret = tracing_map_init(hist_data->map);
31867 - hist_data->event_file = file;
31871 @@ -839,18 +4556,39 @@
31874 static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
31875 - struct tracing_map_elt *elt,
31877 + struct tracing_map_elt *elt, void *rec,
31878 + struct ring_buffer_event *rbe,
31879 + u64 *var_ref_vals)
31881 + struct hist_elt_data *elt_data;
31882 struct hist_field *hist_field;
31884 + unsigned int i, var_idx;
31887 + elt_data = elt->private_data;
31888 + elt_data->var_ref_vals = var_ref_vals;
31890 for_each_hist_val_field(i, hist_data) {
31891 hist_field = hist_data->fields[i];
31892 - hist_val = hist_field->fn(hist_field, rec);
31893 + hist_val = hist_field->fn(hist_field, elt, rbe, rec);
31894 + if (hist_field->flags & HIST_FIELD_FL_VAR) {
31895 + var_idx = hist_field->var.idx;
31896 + tracing_map_set_var(elt, var_idx, hist_val);
31899 tracing_map_update_sum(elt, i, hist_val);
31902 + for_each_hist_key_field(i, hist_data) {
31903 + hist_field = hist_data->fields[i];
31904 + if (hist_field->flags & HIST_FIELD_FL_VAR) {
31905 + hist_val = hist_field->fn(hist_field, elt, rbe, rec);
31906 + var_idx = hist_field->var.idx;
31907 + tracing_map_set_var(elt, var_idx, hist_val);
31911 + update_field_vars(hist_data, elt, rbe, rec);
31914 static inline void add_to_key(char *compound_key, void *key,
31915 @@ -877,15 +4615,31 @@
31916 memcpy(compound_key + key_field->offset, key, size);
31919 -static void event_hist_trigger(struct event_trigger_data *data, void *rec)
31921 +hist_trigger_actions(struct hist_trigger_data *hist_data,
31922 + struct tracing_map_elt *elt, void *rec,
31923 + struct ring_buffer_event *rbe, u64 *var_ref_vals)
31925 + struct action_data *data;
31928 + for (i = 0; i < hist_data->n_actions; i++) {
31929 + data = hist_data->actions[i];
31930 + data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
31934 +static void event_hist_trigger(struct event_trigger_data *data, void *rec,
31935 + struct ring_buffer_event *rbe)
31937 struct hist_trigger_data *hist_data = data->private_data;
31938 bool use_compound_key = (hist_data->n_keys > 1);
31939 unsigned long entries[HIST_STACKTRACE_DEPTH];
31940 + u64 var_ref_vals[TRACING_MAP_VARS_MAX];
31941 char compound_key[HIST_KEY_SIZE_MAX];
31942 + struct tracing_map_elt *elt = NULL;
31943 struct stack_trace stacktrace;
31944 struct hist_field *key_field;
31945 - struct tracing_map_elt *elt;
31946 u64 field_contents;
31949 @@ -906,7 +4660,7 @@
31953 - field_contents = key_field->fn(key_field, rec);
31954 + field_contents = key_field->fn(key_field, elt, rbe, rec);
31955 if (key_field->flags & HIST_FIELD_FL_STRING) {
31956 key = (void *)(unsigned long)field_contents;
31957 use_compound_key = true;
31958 @@ -921,9 +4675,18 @@
31959 if (use_compound_key)
31960 key = compound_key;
31962 + if (hist_data->n_var_refs &&
31963 + !resolve_var_refs(hist_data, key, var_ref_vals, false))
31966 elt = tracing_map_insert(hist_data->map, key);
31968 - hist_trigger_elt_update(hist_data, elt, rec);
31972 + hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
31974 + if (resolve_var_refs(hist_data, key, var_ref_vals, true))
31975 + hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
31978 static void hist_trigger_stacktrace_print(struct seq_file *m,
31979 @@ -952,6 +4715,7 @@
31980 struct hist_field *key_field;
31981 char str[KSYM_SYMBOL_LEN];
31982 bool multiline = false;
31983 + const char *field_name;
31987 @@ -963,26 +4727,33 @@
31988 if (i > hist_data->n_vals)
31991 + field_name = hist_field_name(key_field, 0);
31993 if (key_field->flags & HIST_FIELD_FL_HEX) {
31994 uval = *(u64 *)(key + key_field->offset);
31995 - seq_printf(m, "%s: %llx",
31996 - key_field->field->name, uval);
31997 + seq_printf(m, "%s: %llx", field_name, uval);
31998 } else if (key_field->flags & HIST_FIELD_FL_SYM) {
31999 uval = *(u64 *)(key + key_field->offset);
32000 sprint_symbol_no_offset(str, uval);
32001 - seq_printf(m, "%s: [%llx] %-45s",
32002 - key_field->field->name, uval, str);
32003 + seq_printf(m, "%s: [%llx] %-45s", field_name,
32005 } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
32006 uval = *(u64 *)(key + key_field->offset);
32007 sprint_symbol(str, uval);
32008 - seq_printf(m, "%s: [%llx] %-55s",
32009 - key_field->field->name, uval, str);
32010 + seq_printf(m, "%s: [%llx] %-55s", field_name,
32012 } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
32013 - char *comm = elt->private_data;
32014 + struct hist_elt_data *elt_data = elt->private_data;
32017 + if (WARN_ON_ONCE(!elt_data))
32020 + comm = elt_data->comm;
32022 uval = *(u64 *)(key + key_field->offset);
32023 - seq_printf(m, "%s: %-16s[%10llu]",
32024 - key_field->field->name, comm, uval);
32025 + seq_printf(m, "%s: %-16s[%10llu]", field_name,
32027 } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
32028 const char *syscall_name;
32030 @@ -991,8 +4762,8 @@
32032 syscall_name = "unknown_syscall";
32034 - seq_printf(m, "%s: %-30s[%3llu]",
32035 - key_field->field->name, syscall_name, uval);
32036 + seq_printf(m, "%s: %-30s[%3llu]", field_name,
32037 + syscall_name, uval);
32038 } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
32039 seq_puts(m, "stacktrace:\n");
32040 hist_trigger_stacktrace_print(m,
32041 @@ -1000,15 +4771,14 @@
32042 HIST_STACKTRACE_DEPTH);
32044 } else if (key_field->flags & HIST_FIELD_FL_LOG2) {
32045 - seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
32046 + seq_printf(m, "%s: ~ 2^%-2llu", field_name,
32047 *(u64 *)(key + key_field->offset));
32048 } else if (key_field->flags & HIST_FIELD_FL_STRING) {
32049 - seq_printf(m, "%s: %-50s", key_field->field->name,
32050 + seq_printf(m, "%s: %-50s", field_name,
32051 (char *)(key + key_field->offset));
32053 uval = *(u64 *)(key + key_field->offset);
32054 - seq_printf(m, "%s: %10llu", key_field->field->name,
32056 + seq_printf(m, "%s: %10llu", field_name, uval);
32060 @@ -1021,17 +4791,23 @@
32061 tracing_map_read_sum(elt, HITCOUNT_IDX));
32063 for (i = 1; i < hist_data->n_vals; i++) {
32064 + field_name = hist_field_name(hist_data->fields[i], 0);
32066 + if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
32067 + hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
32070 if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
32071 - seq_printf(m, " %s: %10llx",
32072 - hist_data->fields[i]->field->name,
32073 + seq_printf(m, " %s: %10llx", field_name,
32074 tracing_map_read_sum(elt, i));
32076 - seq_printf(m, " %s: %10llu",
32077 - hist_data->fields[i]->field->name,
32078 + seq_printf(m, " %s: %10llu", field_name,
32079 tracing_map_read_sum(elt, i));
32083 + print_actions(m, hist_data, elt);
32088 @@ -1102,6 +4878,11 @@
32089 hist_trigger_show(m, data, n++);
32092 + if (have_hist_err()) {
32093 + seq_printf(m, "\nERROR: %s\n", hist_err_str);
32094 + seq_printf(m, " Last command: %s\n", last_hist_cmd);
32098 mutex_unlock(&event_mutex);
32100 @@ -1120,34 +4901,31 @@
32101 .release = single_release,
32104 -static const char *get_hist_field_flags(struct hist_field *hist_field)
32105 +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
32107 - const char *flags_str = NULL;
32108 + const char *field_name = hist_field_name(hist_field, 0);
32110 - if (hist_field->flags & HIST_FIELD_FL_HEX)
32111 - flags_str = "hex";
32112 - else if (hist_field->flags & HIST_FIELD_FL_SYM)
32113 - flags_str = "sym";
32114 - else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
32115 - flags_str = "sym-offset";
32116 - else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
32117 - flags_str = "execname";
32118 - else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
32119 - flags_str = "syscall";
32120 - else if (hist_field->flags & HIST_FIELD_FL_LOG2)
32121 - flags_str = "log2";
32122 + if (hist_field->var.name)
32123 + seq_printf(m, "%s=", hist_field->var.name);
32125 - return flags_str;
32127 + if (hist_field->flags & HIST_FIELD_FL_CPU)
32128 + seq_puts(m, "cpu");
32129 + else if (field_name) {
32130 + if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
32131 + hist_field->flags & HIST_FIELD_FL_ALIAS)
32132 + seq_putc(m, '$');
32133 + seq_printf(m, "%s", field_name);
32134 + } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
32135 + seq_puts(m, "common_timestamp");
32137 -static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
32139 - seq_printf(m, "%s", hist_field->field->name);
32140 if (hist_field->flags) {
32141 - const char *flags_str = get_hist_field_flags(hist_field);
32142 + if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
32143 + !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
32144 + const char *flags = get_hist_field_flags(hist_field);
32147 - seq_printf(m, ".%s", flags_str);
32149 + seq_printf(m, ".%s", flags);
32154 @@ -1156,7 +4934,8 @@
32155 struct event_trigger_data *data)
32157 struct hist_trigger_data *hist_data = data->private_data;
32158 - struct hist_field *key_field;
32159 + struct hist_field *field;
32160 + bool have_var = false;
32163 seq_puts(m, "hist:");
32164 @@ -1167,25 +4946,47 @@
32165 seq_puts(m, "keys=");
32167 for_each_hist_key_field(i, hist_data) {
32168 - key_field = hist_data->fields[i];
32169 + field = hist_data->fields[i];
32171 if (i > hist_data->n_vals)
32174 - if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
32175 + if (field->flags & HIST_FIELD_FL_STACKTRACE)
32176 seq_puts(m, "stacktrace");
32178 - hist_field_print(m, key_field);
32179 + hist_field_print(m, field);
32182 seq_puts(m, ":vals=");
32184 for_each_hist_val_field(i, hist_data) {
32185 + field = hist_data->fields[i];
32186 + if (field->flags & HIST_FIELD_FL_VAR) {
32191 if (i == HITCOUNT_IDX)
32192 seq_puts(m, "hitcount");
32195 - hist_field_print(m, hist_data->fields[i]);
32196 + hist_field_print(m, field);
32201 + unsigned int n = 0;
32203 + seq_puts(m, ":");
32205 + for_each_hist_val_field(i, hist_data) {
32206 + field = hist_data->fields[i];
32208 + if (field->flags & HIST_FIELD_FL_VAR) {
32210 + seq_puts(m, ",");
32211 + hist_field_print(m, field);
32216 @@ -1193,28 +4994,36 @@
32218 for (i = 0; i < hist_data->n_sort_keys; i++) {
32219 struct tracing_map_sort_key *sort_key;
32220 + unsigned int idx, first_key_idx;
32222 + /* skip VAR vals */
32223 + first_key_idx = hist_data->n_vals - hist_data->n_vars;
32225 sort_key = &hist_data->sort_keys[i];
32226 + idx = sort_key->field_idx;
32228 + if (WARN_ON(idx >= HIST_FIELDS_MAX))
32234 - if (sort_key->field_idx == HITCOUNT_IDX)
32235 + if (idx == HITCOUNT_IDX)
32236 seq_puts(m, "hitcount");
32238 - unsigned int idx = sort_key->field_idx;
32240 - if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
32243 + if (idx >= first_key_idx)
32244 + idx += hist_data->n_vars;
32245 hist_field_print(m, hist_data->fields[idx]);
32248 if (sort_key->descending)
32249 seq_puts(m, ".descending");
32252 seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
32253 + if (hist_data->enable_timestamps)
32254 + seq_printf(m, ":clock=%s", hist_data->attrs->clock);
32256 + print_actions_spec(m, hist_data);
32258 if (data->filter_str)
32259 seq_printf(m, " if %s", data->filter_str);
32260 @@ -1242,6 +5051,21 @@
32264 +static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
32266 + struct trace_event_file *file;
32271 + for (i = 0; i < hist_data->n_field_var_hists; i++) {
32272 + file = hist_data->field_var_hists[i]->hist_data->event_file;
32273 + cmd = hist_data->field_var_hists[i]->cmd;
32274 + ret = event_hist_trigger_func(&trigger_hist_cmd, file,
32275 + "!hist", "hist", cmd);
32279 static void event_hist_trigger_free(struct event_trigger_ops *ops,
32280 struct event_trigger_data *data)
32282 @@ -1254,7 +5078,13 @@
32285 del_named_trigger(data);
32287 trigger_data_free(data);
32289 + remove_hist_vars(hist_data);
32291 + unregister_field_var_hists(hist_data);
32293 destroy_hist_data(hist_data);
32296 @@ -1381,6 +5211,15 @@
32298 if (key_field->offset != key_field_test->offset)
32300 + if (key_field->size != key_field_test->size)
32302 + if (key_field->is_signed != key_field_test->is_signed)
32304 + if (!!key_field->var.name != !!key_field_test->var.name)
32306 + if (key_field->var.name &&
32307 + strcmp(key_field->var.name, key_field_test->var.name) != 0)
32311 for (i = 0; i < hist_data->n_sort_keys; i++) {
32312 @@ -1396,6 +5235,9 @@
32313 (strcmp(data->filter_str, data_test->filter_str) != 0))
32316 + if (!actions_match(hist_data, hist_data_test))
32322 @@ -1412,6 +5254,7 @@
32324 if (!hist_trigger_match(data, named_data, named_data,
32326 + hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
32330 @@ -1431,13 +5274,16 @@
32331 test->paused = false;
32332 else if (hist_data->attrs->clear)
32336 + hist_err("Hist trigger already exists", NULL);
32343 if (hist_data->attrs->cont || hist_data->attrs->clear) {
32344 + hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
32348 @@ -1446,7 +5292,6 @@
32349 data->paused = true;
32352 - destroy_hist_data(data->private_data);
32353 data->private_data = named_data->private_data;
32354 set_named_trigger_data(data, named_data);
32355 data->ops = &event_hist_trigger_named_ops;
32356 @@ -1458,8 +5303,32 @@
32360 - list_add_rcu(&data->list, &file->triggers);
32361 + if (hist_data->enable_timestamps) {
32362 + char *clock = hist_data->attrs->clock;
32364 + ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
32366 + hist_err("Couldn't set trace_clock: ", clock);
32370 + tracing_set_time_stamp_abs(file->tr, true);
32374 + destroy_hist_data(hist_data);
32381 +static int hist_trigger_enable(struct event_trigger_data *data,
32382 + struct trace_event_file *file)
32386 + list_add_tail_rcu(&data->list, &file->triggers);
32388 update_cond_flag(file);
32390 @@ -1468,10 +5337,55 @@
32391 update_cond_flag(file);
32399 +static bool have_hist_trigger_match(struct event_trigger_data *data,
32400 + struct trace_event_file *file)
32402 + struct hist_trigger_data *hist_data = data->private_data;
32403 + struct event_trigger_data *test, *named_data = NULL;
32404 + bool match = false;
32406 + if (hist_data->attrs->name)
32407 + named_data = find_named_trigger(hist_data->attrs->name);
32409 + list_for_each_entry_rcu(test, &file->triggers, list) {
32410 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32411 + if (hist_trigger_match(data, test, named_data, false)) {
32421 +static bool hist_trigger_check_refs(struct event_trigger_data *data,
32422 + struct trace_event_file *file)
32424 + struct hist_trigger_data *hist_data = data->private_data;
32425 + struct event_trigger_data *test, *named_data = NULL;
32427 + if (hist_data->attrs->name)
32428 + named_data = find_named_trigger(hist_data->attrs->name);
32430 + list_for_each_entry_rcu(test, &file->triggers, list) {
32431 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32432 + if (!hist_trigger_match(data, test, named_data, false))
32434 + hist_data = test->private_data;
32435 + if (check_var_refs(hist_data))
32444 static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
32445 struct event_trigger_data *data,
32446 struct trace_event_file *file)
32447 @@ -1497,17 +5411,55 @@
32449 if (unregistered && test->ops->free)
32450 test->ops->free(test->ops, test);
32452 + if (hist_data->enable_timestamps) {
32453 + if (!hist_data->remove || unregistered)
32454 + tracing_set_time_stamp_abs(file->tr, false);
32458 +static bool hist_file_check_refs(struct trace_event_file *file)
32460 + struct hist_trigger_data *hist_data;
32461 + struct event_trigger_data *test;
32463 + list_for_each_entry_rcu(test, &file->triggers, list) {
32464 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32465 + hist_data = test->private_data;
32466 + if (check_var_refs(hist_data))
32474 static void hist_unreg_all(struct trace_event_file *file)
32476 struct event_trigger_data *test, *n;
32477 + struct hist_trigger_data *hist_data;
32478 + struct synth_event *se;
32479 + const char *se_name;
32481 + if (hist_file_check_refs(file))
32484 list_for_each_entry_safe(test, n, &file->triggers, list) {
32485 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32486 + hist_data = test->private_data;
32487 list_del_rcu(&test->list);
32488 trace_event_trigger_enable_disable(file, 0);
32490 + mutex_lock(&synth_event_mutex);
32491 + se_name = trace_event_name(file->event_call);
32492 + se = find_synth_event(se_name);
32495 + mutex_unlock(&synth_event_mutex);
32497 update_cond_flag(file);
32498 + if (hist_data->enable_timestamps)
32499 + tracing_set_time_stamp_abs(file->tr, false);
32500 if (test->ops->free)
32501 test->ops->free(test->ops, test);
32503 @@ -1523,16 +5475,54 @@
32504 struct hist_trigger_attrs *attrs;
32505 struct event_trigger_ops *trigger_ops;
32506 struct hist_trigger_data *hist_data;
32508 + struct synth_event *se;
32509 + const char *se_name;
32510 + bool remove = false;
32511 + char *trigger, *p;
32514 + if (glob && strlen(glob)) {
32515 + last_cmd_set(param);
32516 + hist_err_clear();
32522 - /* separate the trigger from the filter (k:v [if filter]) */
32523 - trigger = strsep(&param, " \t");
32526 + if (glob[0] == '!')
32530 + * separate the trigger from the filter (k:v [if filter])
32531 + * allowing for whitespace in the trigger
32533 + p = trigger = param;
32535 + p = strstr(p, "if");
32540 + if (*(p - 1) != ' ' && *(p - 1) != '\t') {
32544 + if (p >= param + strlen(param) - strlen("if") - 1)
32546 + if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
32557 + param = strstrip(p);
32558 + trigger = strstrip(trigger);
32561 attrs = parse_hist_trigger_attrs(trigger);
32563 @@ -1541,7 +5531,7 @@
32564 if (attrs->map_bits)
32565 hist_trigger_bits = attrs->map_bits;
32567 - hist_data = create_hist_data(hist_trigger_bits, attrs, file);
32568 + hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
32569 if (IS_ERR(hist_data)) {
32570 destroy_hist_trigger_attrs(attrs);
32571 return PTR_ERR(hist_data);
32572 @@ -1549,10 +5539,11 @@
32574 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
32577 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
32578 - if (!trigger_data)
32579 + if (!trigger_data) {
32584 trigger_data->count = -1;
32585 trigger_data->ops = trigger_ops;
32586 @@ -1570,8 +5561,24 @@
32590 - if (glob[0] == '!') {
32592 + if (!have_hist_trigger_match(trigger_data, file))
32595 + if (hist_trigger_check_refs(trigger_data, file)) {
32600 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
32602 + mutex_lock(&synth_event_mutex);
32603 + se_name = trace_event_name(file->event_call);
32604 + se = find_synth_event(se_name);
32607 + mutex_unlock(&synth_event_mutex);
32612 @@ -1588,14 +5595,47 @@
32614 } else if (ret < 0)
32617 + if (get_named_trigger_data(trigger_data))
32620 + if (has_hist_vars(hist_data))
32621 + save_hist_vars(hist_data);
32623 + ret = create_actions(hist_data, file);
32627 + ret = tracing_map_init(hist_data->map);
32631 + ret = hist_trigger_enable(trigger_data, file);
32635 + mutex_lock(&synth_event_mutex);
32636 + se_name = trace_event_name(file->event_call);
32637 + se = find_synth_event(se_name);
32640 + mutex_unlock(&synth_event_mutex);
32642 /* Just return zero, not the number of registered triggers */
32646 + hist_err_clear();
32650 + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
32652 if (cmd_ops->set_filter)
32653 cmd_ops->set_filter(NULL, trigger_data, NULL);
32655 + remove_hist_vars(hist_data);
32657 kfree(trigger_data);
32659 destroy_hist_data(hist_data);
32660 @@ -1625,7 +5665,8 @@
32664 -hist_enable_trigger(struct event_trigger_data *data, void *rec)
32665 +hist_enable_trigger(struct event_trigger_data *data, void *rec,
32666 + struct ring_buffer_event *event)
32668 struct enable_trigger_data *enable_data = data->private_data;
32669 struct event_trigger_data *test;
32670 @@ -1641,7 +5682,8 @@
32674 -hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
32675 +hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
32676 + struct ring_buffer_event *event)
32680 @@ -1649,7 +5691,7 @@
32681 if (data->count != -1)
32684 - hist_enable_trigger(data, rec);
32685 + hist_enable_trigger(data, rec, event);
32688 static struct event_trigger_ops hist_enable_trigger_ops = {
32689 @@ -1754,3 +5796,31 @@
32694 +static __init int trace_events_hist_init(void)
32696 + struct dentry *entry = NULL;
32697 + struct dentry *d_tracer;
32700 + d_tracer = tracing_init_dentry();
32701 + if (IS_ERR(d_tracer)) {
32702 + err = PTR_ERR(d_tracer);
32706 + entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
32707 + NULL, &synth_events_fops);
32715 + pr_warn("Could not create tracefs 'synthetic_events' entry\n");
32720 +fs_initcall(trace_events_hist_init);
32721 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_events_trigger.c linux-4.14/kernel/trace/trace_events_trigger.c
32722 --- linux-4.14.orig/kernel/trace/trace_events_trigger.c 2018-09-05 11:03:22.000000000 +0200
32723 +++ linux-4.14/kernel/trace/trace_events_trigger.c 2018-09-05 11:05:07.000000000 +0200
32725 * any trigger that should be deferred, ETT_NONE if nothing to defer.
32727 enum event_trigger_type
32728 -event_triggers_call(struct trace_event_file *file, void *rec)
32729 +event_triggers_call(struct trace_event_file *file, void *rec,
32730 + struct ring_buffer_event *event)
32732 struct event_trigger_data *data;
32733 enum event_trigger_type tt = ETT_NONE;
32738 - data->ops->func(data, rec);
32739 + data->ops->func(data, rec, event);
32742 filter = rcu_dereference_sched(data->filter);
32744 tt |= data->cmd_ops->trigger_type;
32747 - data->ops->func(data, rec);
32748 + data->ops->func(data, rec, event);
32752 @@ -108,7 +109,7 @@
32754 event_triggers_post_call(struct trace_event_file *file,
32755 enum event_trigger_type tt,
32757 + void *rec, struct ring_buffer_event *event)
32759 struct event_trigger_data *data;
32761 @@ -116,7 +117,7 @@
32764 if (data->cmd_ops->trigger_type & tt)
32765 - data->ops->func(data, rec);
32766 + data->ops->func(data, rec, event);
32769 EXPORT_SYMBOL_GPL(event_triggers_post_call);
32770 @@ -914,8 +915,15 @@
32771 data->named_data = named_data;
32774 +struct event_trigger_data *
32775 +get_named_trigger_data(struct event_trigger_data *data)
32777 + return data->named_data;
32781 -traceon_trigger(struct event_trigger_data *data, void *rec)
32782 +traceon_trigger(struct event_trigger_data *data, void *rec,
32783 + struct ring_buffer_event *event)
32785 if (tracing_is_on())
32787 @@ -924,7 +932,8 @@
32791 -traceon_count_trigger(struct event_trigger_data *data, void *rec)
32792 +traceon_count_trigger(struct event_trigger_data *data, void *rec,
32793 + struct ring_buffer_event *event)
32795 if (tracing_is_on())
32797 @@ -939,7 +948,8 @@
32801 -traceoff_trigger(struct event_trigger_data *data, void *rec)
32802 +traceoff_trigger(struct event_trigger_data *data, void *rec,
32803 + struct ring_buffer_event *event)
32805 if (!tracing_is_on())
32807 @@ -948,7 +958,8 @@
32811 -traceoff_count_trigger(struct event_trigger_data *data, void *rec)
32812 +traceoff_count_trigger(struct event_trigger_data *data, void *rec,
32813 + struct ring_buffer_event *event)
32815 if (!tracing_is_on())
32817 @@ -1045,7 +1056,8 @@
32819 #ifdef CONFIG_TRACER_SNAPSHOT
32821 -snapshot_trigger(struct event_trigger_data *data, void *rec)
32822 +snapshot_trigger(struct event_trigger_data *data, void *rec,
32823 + struct ring_buffer_event *event)
32825 struct trace_event_file *file = data->private_data;
32827 @@ -1056,7 +1068,8 @@
32831 -snapshot_count_trigger(struct event_trigger_data *data, void *rec)
32832 +snapshot_count_trigger(struct event_trigger_data *data, void *rec,
32833 + struct ring_buffer_event *event)
32837 @@ -1064,7 +1077,7 @@
32838 if (data->count != -1)
32841 - snapshot_trigger(data, rec);
32842 + snapshot_trigger(data, rec, event);
32846 @@ -1143,13 +1156,15 @@
32847 #define STACK_SKIP 3
32850 -stacktrace_trigger(struct event_trigger_data *data, void *rec)
32851 +stacktrace_trigger(struct event_trigger_data *data, void *rec,
32852 + struct ring_buffer_event *event)
32854 trace_dump_stack(STACK_SKIP);
32858 -stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
32859 +stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
32860 + struct ring_buffer_event *event)
32864 @@ -1157,7 +1172,7 @@
32865 if (data->count != -1)
32868 - stacktrace_trigger(data, rec);
32869 + stacktrace_trigger(data, rec, event);
32873 @@ -1219,7 +1234,8 @@
32877 -event_enable_trigger(struct event_trigger_data *data, void *rec)
32878 +event_enable_trigger(struct event_trigger_data *data, void *rec,
32879 + struct ring_buffer_event *event)
32881 struct enable_trigger_data *enable_data = data->private_data;
32883 @@ -1230,7 +1246,8 @@
32887 -event_enable_count_trigger(struct event_trigger_data *data, void *rec)
32888 +event_enable_count_trigger(struct event_trigger_data *data, void *rec,
32889 + struct ring_buffer_event *event)
32891 struct enable_trigger_data *enable_data = data->private_data;
32893 @@ -1244,7 +1261,7 @@
32894 if (data->count != -1)
32897 - event_enable_trigger(data, rec);
32898 + event_enable_trigger(data, rec, event);
32901 int event_enable_trigger_print(struct seq_file *m,
32902 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace.h linux-4.14/kernel/trace/trace.h
32903 --- linux-4.14.orig/kernel/trace/trace.h 2018-09-05 11:03:22.000000000 +0200
32904 +++ linux-4.14/kernel/trace/trace.h 2018-09-05 11:05:07.000000000 +0200
32905 @@ -127,6 +127,7 @@
32906 * NEED_RESCHED - reschedule is requested
32907 * HARDIRQ - inside an interrupt handler
32908 * SOFTIRQ - inside a softirq handler
32909 + * NEED_RESCHED_LAZY - lazy reschedule is requested
32911 enum trace_flag_type {
32912 TRACE_FLAG_IRQS_OFF = 0x01,
32913 @@ -136,6 +137,7 @@
32914 TRACE_FLAG_SOFTIRQ = 0x10,
32915 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
32916 TRACE_FLAG_NMI = 0x40,
32917 + TRACE_FLAG_NEED_RESCHED_LAZY = 0x80,
32920 #define TRACE_BUF_SIZE 1024
32921 @@ -273,6 +275,8 @@
32922 /* function tracing enabled */
32923 int function_enabled;
32925 + int time_stamp_abs_ref;
32926 + struct list_head hist_vars;
32930 @@ -286,6 +290,11 @@
32931 extern int trace_array_get(struct trace_array *tr);
32932 extern void trace_array_put(struct trace_array *tr);
32934 +extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
32935 +extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
32937 +extern bool trace_clock_in_ns(struct trace_array *tr);
32940 * The global tracer (top) should be the first trace array added,
32941 * but we check the flag anyway.
32942 @@ -1293,7 +1302,7 @@
32943 unsigned long eflags = file->flags;
32945 if (eflags & EVENT_FILE_FL_TRIGGER_COND)
32946 - *tt = event_triggers_call(file, entry);
32947 + *tt = event_triggers_call(file, entry, event);
32949 if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
32950 (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
32951 @@ -1330,7 +1339,7 @@
32952 trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
32955 - event_triggers_post_call(file, tt, entry);
32956 + event_triggers_post_call(file, tt, entry, event);
32960 @@ -1363,7 +1372,7 @@
32961 irq_flags, pc, regs);
32964 - event_triggers_post_call(file, tt, entry);
32965 + event_triggers_post_call(file, tt, entry, event);
32968 #define FILTER_PRED_INVALID ((unsigned short)-1)
32969 @@ -1545,6 +1554,8 @@
32970 extern void unpause_named_trigger(struct event_trigger_data *data);
32971 extern void set_named_trigger_data(struct event_trigger_data *data,
32972 struct event_trigger_data *named_data);
32973 +extern struct event_trigger_data *
32974 +get_named_trigger_data(struct event_trigger_data *data);
32975 extern int register_event_command(struct event_command *cmd);
32976 extern int unregister_event_command(struct event_command *cmd);
32977 extern int register_trigger_hist_enable_disable_cmds(void);
32978 @@ -1588,7 +1599,8 @@
32980 struct event_trigger_ops {
32981 void (*func)(struct event_trigger_data *data,
32984 + struct ring_buffer_event *rbe);
32985 int (*init)(struct event_trigger_ops *ops,
32986 struct event_trigger_data *data);
32987 void (*free)(struct event_trigger_ops *ops,
32988 @@ -1755,6 +1767,13 @@
32989 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
32990 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
32992 +#define MAX_EVENT_NAME_LEN 64
32994 +extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
32995 +extern ssize_t trace_parse_run_command(struct file *file,
32996 + const char __user *buffer, size_t count, loff_t *ppos,
32997 + int (*createfn)(int, char**));
33000 * Normal trace_printk() and friends allocates special buffers
33001 * to do the manipulation, as well as saves the print formats
33002 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_hwlat.c linux-4.14/kernel/trace/trace_hwlat.c
33003 --- linux-4.14.orig/kernel/trace/trace_hwlat.c 2017-11-12 19:46:13.000000000 +0100
33004 +++ linux-4.14/kernel/trace/trace_hwlat.c 2018-09-05 11:05:07.000000000 +0200
33005 @@ -279,7 +279,7 @@
33006 * of this thread, than stop migrating for the duration
33007 * of the current test.
33009 - if (!cpumask_equal(current_mask, &current->cpus_allowed))
33010 + if (!cpumask_equal(current_mask, current->cpus_ptr))
33014 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_kprobe.c linux-4.14/kernel/trace/trace_kprobe.c
33015 --- linux-4.14.orig/kernel/trace/trace_kprobe.c 2018-09-05 11:03:22.000000000 +0200
33016 +++ linux-4.14/kernel/trace/trace_kprobe.c 2018-09-05 11:05:07.000000000 +0200
33017 @@ -918,8 +918,8 @@
33018 static ssize_t probes_write(struct file *file, const char __user *buffer,
33019 size_t count, loff_t *ppos)
33021 - return traceprobe_probes_write(file, buffer, count, ppos,
33022 - create_trace_kprobe);
33023 + return trace_parse_run_command(file, buffer, count, ppos,
33024 + create_trace_kprobe);
33027 static const struct file_operations kprobe_events_ops = {
33028 @@ -1444,9 +1444,9 @@
33030 pr_info("Testing kprobe tracing: ");
33032 - ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
33033 - "$stack $stack0 +0($stack)",
33034 - create_trace_kprobe);
33035 + ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
33036 + "$stack $stack0 +0($stack)",
33037 + create_trace_kprobe);
33038 if (WARN_ON_ONCE(ret)) {
33039 pr_warn("error on probing function entry.\n");
33041 @@ -1466,8 +1466,8 @@
33045 - ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
33046 - "$retval", create_trace_kprobe);
33047 + ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
33048 + "$retval", create_trace_kprobe);
33049 if (WARN_ON_ONCE(ret)) {
33050 pr_warn("error on probing function return.\n");
33052 @@ -1537,13 +1537,13 @@
33053 disable_trace_kprobe(tk, file);
33056 - ret = traceprobe_command("-:testprobe", create_trace_kprobe);
33057 + ret = trace_run_command("-:testprobe", create_trace_kprobe);
33058 if (WARN_ON_ONCE(ret)) {
33059 pr_warn("error on deleting a probe.\n");
33063 - ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
33064 + ret = trace_run_command("-:testprobe2", create_trace_kprobe);
33065 if (WARN_ON_ONCE(ret)) {
33066 pr_warn("error on deleting a probe.\n");
33068 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_output.c linux-4.14/kernel/trace/trace_output.c
33069 --- linux-4.14.orig/kernel/trace/trace_output.c 2018-09-05 11:03:22.000000000 +0200
33070 +++ linux-4.14/kernel/trace/trace_output.c 2018-09-05 11:05:07.000000000 +0200
33071 @@ -447,6 +447,7 @@
33075 + char need_resched_lazy;
33079 @@ -477,6 +478,9 @@
33083 + need_resched_lazy =
33084 + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
33087 (nmi && hardirq) ? 'Z' :
33089 @@ -485,14 +489,25 @@
33093 - trace_seq_printf(s, "%c%c%c",
33094 - irqs_off, need_resched, hardsoft_irq);
33095 + trace_seq_printf(s, "%c%c%c%c",
33096 + irqs_off, need_resched, need_resched_lazy,
33099 if (entry->preempt_count)
33100 trace_seq_printf(s, "%x", entry->preempt_count);
33102 trace_seq_putc(s, '.');
33104 + if (entry->preempt_lazy_count)
33105 + trace_seq_printf(s, "%x", entry->preempt_lazy_count);
33107 + trace_seq_putc(s, '.');
33109 + if (entry->migrate_disable)
33110 + trace_seq_printf(s, "%x", entry->migrate_disable);
33112 + trace_seq_putc(s, '.');
33114 return !trace_seq_has_overflowed(s);
33117 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_probe.c linux-4.14/kernel/trace/trace_probe.c
33118 --- linux-4.14.orig/kernel/trace/trace_probe.c 2018-09-05 11:03:22.000000000 +0200
33119 +++ linux-4.14/kernel/trace/trace_probe.c 2018-09-05 11:05:07.000000000 +0200
33120 @@ -621,92 +621,6 @@
33124 -int traceprobe_command(const char *buf, int (*createfn)(int, char **))
33131 - argv = argv_split(GFP_KERNEL, buf, &argc);
33136 - ret = createfn(argc, argv);
33143 -#define WRITE_BUFSIZE 4096
33145 -ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
33146 - size_t count, loff_t *ppos,
33147 - int (*createfn)(int, char **))
33149 - char *kbuf, *buf, *tmp;
33154 - kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
33158 - while (done < count) {
33159 - size = count - done;
33161 - if (size >= WRITE_BUFSIZE)
33162 - size = WRITE_BUFSIZE - 1;
33164 - if (copy_from_user(kbuf, buffer + done, size)) {
33168 - kbuf[size] = '\0';
33171 - tmp = strchr(buf, '\n');
33174 - size = tmp - buf + 1;
33176 - size = strlen(buf);
33177 - if (done + size < count) {
33180 - /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
33181 - pr_warn("Line length is too long: Should be less than %d\n",
33182 - WRITE_BUFSIZE - 2);
33189 - /* Remove comments */
33190 - tmp = strchr(buf, '#');
33195 - ret = traceprobe_command(buf, createfn);
33200 - } while (done < count);
33210 static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
33213 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_probe.h linux-4.14/kernel/trace/trace_probe.h
33214 --- linux-4.14.orig/kernel/trace/trace_probe.h 2018-09-05 11:03:22.000000000 +0200
33215 +++ linux-4.14/kernel/trace/trace_probe.h 2018-09-05 11:05:07.000000000 +0200
33218 #define MAX_TRACE_ARGS 128
33219 #define MAX_ARGSTR_LEN 63
33220 -#define MAX_EVENT_NAME_LEN 64
33221 #define MAX_STRING_SIZE PATH_MAX
33223 /* Reserved field names */
33224 @@ -356,12 +355,6 @@
33226 extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
33228 -extern ssize_t traceprobe_probes_write(struct file *file,
33229 - const char __user *buffer, size_t count, loff_t *ppos,
33230 - int (*createfn)(int, char**));
33232 -extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
33234 /* Sum up total data length for dynamic arraies (strings) */
33235 static nokprobe_inline int
33236 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
33237 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_uprobe.c linux-4.14/kernel/trace/trace_uprobe.c
33238 --- linux-4.14.orig/kernel/trace/trace_uprobe.c 2018-09-05 11:03:22.000000000 +0200
33239 +++ linux-4.14/kernel/trace/trace_uprobe.c 2018-09-05 11:05:07.000000000 +0200
33240 @@ -647,7 +647,7 @@
33241 static ssize_t probes_write(struct file *file, const char __user *buffer,
33242 size_t count, loff_t *ppos)
33244 - return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
33245 + return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
33248 static const struct file_operations uprobe_events_ops = {
33249 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/tracing_map.c linux-4.14/kernel/trace/tracing_map.c
33250 --- linux-4.14.orig/kernel/trace/tracing_map.c 2017-11-12 19:46:13.000000000 +0100
33251 +++ linux-4.14/kernel/trace/tracing_map.c 2018-09-05 11:05:07.000000000 +0200
33253 return (u64)atomic64_read(&elt->fields[i].sum);
33257 + * tracing_map_set_var - Assign a tracing_map_elt's variable field
33258 + * @elt: The tracing_map_elt
33259 + * @i: The index of the given variable associated with the tracing_map_elt
33260 + * @n: The value to assign
33262 + * Assign n to variable i associated with the specified tracing_map_elt
33263 + * instance. The index i is the index returned by the call to
33264 + * tracing_map_add_var() when the tracing map was set up.
33266 +void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
33268 + atomic64_set(&elt->vars[i], n);
33269 + elt->var_set[i] = true;
33273 + * tracing_map_var_set - Return whether or not a variable has been set
33274 + * @elt: The tracing_map_elt
33275 + * @i: The index of the given variable associated with the tracing_map_elt
33277 + * Return true if the variable has been set, false otherwise. The
33278 + * index i is the index returned by the call to tracing_map_add_var()
33279 + * when the tracing map was set up.
33281 +bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
33283 + return elt->var_set[i];
33287 + * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
33288 + * @elt: The tracing_map_elt
33289 + * @i: The index of the given variable associated with the tracing_map_elt
33291 + * Retrieve the value of the variable i associated with the specified
33292 + * tracing_map_elt instance. The index i is the index returned by the
33293 + * call to tracing_map_add_var() when the tracing map was set up.
33296 + * Return: The variable value associated with field i for elt.
33298 +u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
33300 + return (u64)atomic64_read(&elt->vars[i]);
33304 + * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
33305 + * @elt: The tracing_map_elt
33306 + * @i: The index of the given variable associated with the tracing_map_elt
33308 + * Retrieve the value of the variable i associated with the specified
33309 + * tracing_map_elt instance, and reset the variable to the 'not set'
33310 + * state. The index i is the index returned by the call to
33311 + * tracing_map_add_var() when the tracing map was set up. The reset
33312 + * essentially makes the variable a read-once variable if it's only
33313 + * accessed using this function.
33315 + * Return: The variable value associated with field i for elt.
33317 +u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
33319 + elt->var_set[i] = false;
33320 + return (u64)atomic64_read(&elt->vars[i]);
33323 int tracing_map_cmp_string(void *val_a, void *val_b)
33326 @@ -171,6 +238,28 @@
33330 + * tracing_map_add_var - Add a field describing a tracing_map var
33331 + * @map: The tracing_map
33333 + * Add a var to the map and return the index identifying it in the map
33334 + * and associated tracing_map_elts. This is the index used for
33335 + * instance to update a var for a particular tracing_map_elt using
33336 + * tracing_map_update_var() or reading it via tracing_map_read_var().
33338 + * Return: The index identifying the var in the map and associated
33339 + * tracing_map_elts, or -EINVAL on error.
33341 +int tracing_map_add_var(struct tracing_map *map)
33343 + int ret = -EINVAL;
33345 + if (map->n_vars < TRACING_MAP_VARS_MAX)
33346 + ret = map->n_vars++;
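The variable accessors added above are what the hist-trigger code later in this patch builds on. As a rough, self-contained sketch (the example_* names and the latency value are invented for illustration and are not part of the patch), a client reserves a slot when the map is set up and then writes/reads it per element:

#include "tracing_map.h"	/* assumed reachable, as it is for tracing_map.c itself */

/* Illustrative only: latency_var_idx and the example_* helpers are made up. */
static int latency_var_idx;

static int example_map_setup(struct tracing_map *map)
{
	/* reserve one of the TRACING_MAP_VARS_MAX per-element variable slots */
	latency_var_idx = tracing_map_add_var(map);
	if (latency_var_idx < 0)
		return latency_var_idx;		/* -EINVAL: no slots left */
	return 0;
}

static void example_record(struct tracing_map_elt *elt, u64 latency)
{
	/* stores the value and marks the slot as set */
	tracing_map_set_var(elt, latency_var_idx, latency);
}

static u64 example_consume(struct tracing_map_elt *elt)
{
	if (!tracing_map_var_set(elt, latency_var_idx))
		return 0;			/* never written, or already consumed */
	/* read-once: returns the value and clears the 'set' flag */
	return tracing_map_read_var_once(elt, latency_var_idx);
}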
33352 * tracing_map_add_key_field - Add a field describing a tracing_map key
33353 * @map: The tracing_map
33354 * @offset: The offset within the key
33355 @@ -280,6 +369,11 @@
33356 if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
33357 atomic64_set(&elt->fields[i].sum, 0);
33359 + for (i = 0; i < elt->map->n_vars; i++) {
33360 + atomic64_set(&elt->vars[i], 0);
33361 + elt->var_set[i] = false;
33364 if (elt->map->ops && elt->map->ops->elt_clear)
33365 elt->map->ops->elt_clear(elt);
33367 @@ -306,6 +400,8 @@
33368 if (elt->map->ops && elt->map->ops->elt_free)
33369 elt->map->ops->elt_free(elt);
33370 kfree(elt->fields);
33371 + kfree(elt->vars);
33372 + kfree(elt->var_set);
33376 @@ -333,6 +429,18 @@
33380 + elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
33381 + if (!elt->vars) {
33386 + elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
33387 + if (!elt->var_set) {
33392 tracing_map_elt_init_fields(elt);
33394 if (map->ops && map->ops->elt_alloc) {
33395 @@ -414,7 +522,9 @@
33396 __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
33398 u32 idx, key_hash, test_key;
33400 struct tracing_map_entry *entry;
33401 + struct tracing_map_elt *val;
33403 key_hash = jhash(key, map->key_size, 0);
33405 @@ -426,10 +536,33 @@
33406 entry = TRACING_MAP_ENTRY(map->map, idx);
33407 test_key = entry->key;
33409 - if (test_key && test_key == key_hash && entry->val &&
33410 - keys_match(key, entry->val->key, map->key_size)) {
33411 - atomic64_inc(&map->hits);
33412 - return entry->val;
33413 + if (test_key && test_key == key_hash) {
33414 + val = READ_ONCE(entry->val);
33416 + keys_match(key, val->key, map->key_size)) {
33417 + if (!lookup_only)
33418 + atomic64_inc(&map->hits);
33420 + } else if (unlikely(!val)) {
33422 + * The key is present. But, val (pointer to elt
33423 + * struct) is still NULL. which means some other
33424 + * thread is in the process of inserting an
33427 + * On top of that, it's key_hash is same as the
33428 + * one being inserted right now. So, it's
33429 + * possible that the element has the same
33434 + if (dup_try > map->map_size) {
33435 + atomic64_inc(&map->drops);
33443 @@ -451,6 +584,13 @@
33444 atomic64_inc(&map->hits);
33449 + * cmpxchg() failed. Loop around once
33450 + * more to check what key was inserted.
33457 @@ -815,67 +955,15 @@
33461 -static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
33463 - struct tracing_map_elt *dup_elt;
33466 - dup_elt = tracing_map_elt_alloc(elt->map);
33467 - if (IS_ERR(dup_elt))
33470 - if (elt->map->ops && elt->map->ops->elt_copy)
33471 - elt->map->ops->elt_copy(dup_elt, elt);
33473 - dup_elt->private_data = elt->private_data;
33474 - memcpy(dup_elt->key, elt->key, elt->map->key_size);
33476 - for (i = 0; i < elt->map->n_fields; i++) {
33477 - atomic64_set(&dup_elt->fields[i].sum,
33478 - atomic64_read(&elt->fields[i].sum));
33479 - dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
33485 -static int merge_dup(struct tracing_map_sort_entry **sort_entries,
33486 - unsigned int target, unsigned int dup)
33488 - struct tracing_map_elt *target_elt, *elt;
33489 - bool first_dup = (target - dup) == 1;
33493 - elt = sort_entries[target]->elt;
33494 - target_elt = copy_elt(elt);
33497 - sort_entries[target]->elt = target_elt;
33498 - sort_entries[target]->elt_copied = true;
33500 - target_elt = sort_entries[target]->elt;
33502 - elt = sort_entries[dup]->elt;
33504 - for (i = 0; i < elt->map->n_fields; i++)
33505 - atomic64_add(atomic64_read(&elt->fields[i].sum),
33506 - &target_elt->fields[i].sum);
33508 - sort_entries[dup]->dup = true;
33513 -static int merge_dups(struct tracing_map_sort_entry **sort_entries,
33514 +static void detect_dups(struct tracing_map_sort_entry **sort_entries,
33515 int n_entries, unsigned int key_size)
33517 unsigned int dups = 0, total_dups = 0;
33523 - return total_dups;
33526 sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
33527 (int (*)(const void *, const void *))cmp_entries_dup, NULL);
33528 @@ -884,30 +972,14 @@
33529 for (i = 1; i < n_entries; i++) {
33530 if (!memcmp(sort_entries[i]->key, key, key_size)) {
33531 dups++; total_dups++;
33532 - err = merge_dup(sort_entries, i - dups, i);
33537 key = sort_entries[i]->key;
33542 - return total_dups;
33544 - for (i = 0, j = 0; i < n_entries; i++) {
33545 - if (!sort_entries[i]->dup) {
33546 - sort_entries[j] = sort_entries[i];
33548 - sort_entries[i] = NULL;
33550 - destroy_sort_entry(sort_entries[i]);
33551 - sort_entries[i] = NULL;
33555 - return total_dups;
33556 + WARN_ONCE(total_dups > 0,
33557 + "Duplicates detected: %d\n", total_dups);
33560 static bool is_key(struct tracing_map *map, unsigned int field_idx)
33561 @@ -1033,10 +1105,7 @@
33565 - ret = merge_dups(entries, n_entries, map->key_size);
33568 - n_entries -= ret;
33569 + detect_dups(entries, n_entries, map->key_size);
33571 if (is_key(map, sort_keys[0].field_idx))
33572 cmp_entries_fn = cmp_entries_key;
33573 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/tracing_map.h linux-4.14/kernel/trace/tracing_map.h
33574 --- linux-4.14.orig/kernel/trace/tracing_map.h 2017-11-12 19:46:13.000000000 +0100
33575 +++ linux-4.14/kernel/trace/tracing_map.h 2018-09-05 11:05:07.000000000 +0200
33577 #define TRACING_MAP_BITS_MAX 17
33578 #define TRACING_MAP_BITS_MIN 7
33580 -#define TRACING_MAP_KEYS_MAX 2
33581 +#define TRACING_MAP_KEYS_MAX 3
33582 #define TRACING_MAP_VALS_MAX 3
33583 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
33584 TRACING_MAP_VALS_MAX)
33585 +#define TRACING_MAP_VARS_MAX 16
33586 #define TRACING_MAP_SORT_KEYS_MAX 2
33588 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
33589 @@ -137,6 +138,8 @@
33590 struct tracing_map_elt {
33591 struct tracing_map *map;
33592 struct tracing_map_field *fields;
33593 + atomic64_t *vars;
33596 void *private_data;
33598 @@ -192,6 +195,7 @@
33599 int key_idx[TRACING_MAP_KEYS_MAX];
33600 unsigned int n_keys;
33601 struct tracing_map_sort_key sort_key;
33602 + unsigned int n_vars;
33606 @@ -215,11 +219,6 @@
33607 * Element allocation occurs before tracing begins, when the
33608 * tracing_map_init() call is made by client code.
33610 - * @elt_copy: At certain points in the lifetime of an element, it may
33611 - * need to be copied. The copy should include a copy of the
33612 - * client-allocated data, which can be copied into the 'to'
33613 - * element from the 'from' element.
33615 * @elt_free: When a tracing_map_elt is freed, this function is called
33616 * and allows client-allocated per-element data to be freed.
33618 @@ -233,8 +232,6 @@
33620 struct tracing_map_ops {
33621 int (*elt_alloc)(struct tracing_map_elt *elt);
33622 - void (*elt_copy)(struct tracing_map_elt *to,
33623 - struct tracing_map_elt *from);
33624 void (*elt_free)(struct tracing_map_elt *elt);
33625 void (*elt_clear)(struct tracing_map_elt *elt);
33626 void (*elt_init)(struct tracing_map_elt *elt);
33627 @@ -248,6 +245,7 @@
33628 extern int tracing_map_init(struct tracing_map *map);
33630 extern int tracing_map_add_sum_field(struct tracing_map *map);
33631 +extern int tracing_map_add_var(struct tracing_map *map);
33632 extern int tracing_map_add_key_field(struct tracing_map *map,
33633 unsigned int offset,
33634 tracing_map_cmp_fn_t cmp_fn);
33635 @@ -267,7 +265,13 @@
33637 extern void tracing_map_update_sum(struct tracing_map_elt *elt,
33638 unsigned int i, u64 n);
33639 +extern void tracing_map_set_var(struct tracing_map_elt *elt,
33640 + unsigned int i, u64 n);
33641 +extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
33642 extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
33643 +extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
33644 +extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
33646 extern void tracing_map_set_field_descr(struct tracing_map *map,
33648 unsigned int key_offset,
33649 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/user.c linux-4.14/kernel/user.c
33650 --- linux-4.14.orig/kernel/user.c 2017-11-12 19:46:13.000000000 +0100
33651 +++ linux-4.14/kernel/user.c 2018-09-05 11:05:07.000000000 +0200
33652 @@ -162,11 +162,11 @@
33656 - local_irq_save(flags);
33657 + local_irq_save_nort(flags);
33658 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
33659 free_user(up, flags);
33661 - local_irq_restore(flags);
33662 + local_irq_restore_nort(flags);
33665 struct user_struct *alloc_uid(kuid_t uid)
33666 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/watchdog.c linux-4.14/kernel/watchdog.c
33667 --- linux-4.14.orig/kernel/watchdog.c 2017-11-12 19:46:13.000000000 +0100
33668 +++ linux-4.14/kernel/watchdog.c 2018-09-05 11:05:07.000000000 +0200
33669 @@ -462,7 +462,7 @@
33670 * Start the timer first to prevent the NMI watchdog triggering
33671 * before the timer has a chance to fire.
33673 - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
33674 + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
33675 hrtimer->function = watchdog_timer_fn;
33676 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
33677 HRTIMER_MODE_REL_PINNED);
33678 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/watchdog_hld.c linux-4.14/kernel/watchdog_hld.c
33679 --- linux-4.14.orig/kernel/watchdog_hld.c 2017-11-12 19:46:13.000000000 +0100
33680 +++ linux-4.14/kernel/watchdog_hld.c 2018-09-05 11:05:07.000000000 +0200
33682 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
33683 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
33684 static DEFINE_PER_CPU(struct perf_event *, dead_event);
33685 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
33687 static struct cpumask dead_events_mask;
33689 static unsigned long hardlockup_allcpu_dumped;
33690 @@ -134,6 +136,13 @@
33691 /* only print hardlockups once */
33692 if (__this_cpu_read(hard_watchdog_warn) == true)
33695 + * If early-printk is enabled then make sure we do not
33696 + * lock up in printk() and kill console logging:
33700 + raw_spin_lock(&watchdog_output_lock);
33702 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
33704 @@ -151,6 +160,7 @@
33705 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
33706 trigger_allbutself_cpu_backtrace();
33708 + raw_spin_unlock(&watchdog_output_lock);
33709 if (hardlockup_panic)
33710 nmi_panic(regs, "Hard LOCKUP");
33712 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/workqueue.c linux-4.14/kernel/workqueue.c
33713 --- linux-4.14.orig/kernel/workqueue.c 2018-09-05 11:03:22.000000000 +0200
33714 +++ linux-4.14/kernel/workqueue.c 2018-09-05 11:05:07.000000000 +0200
33716 #include <linux/moduleparam.h>
33717 #include <linux/uaccess.h>
33718 #include <linux/nmi.h>
33719 +#include <linux/locallock.h>
33720 +#include <linux/delay.h>
33722 #include "workqueue_internal.h"
33724 @@ -123,11 +125,16 @@
33725 * cpu or grabbing pool->lock is enough for read access. If
33726 * POOL_DISASSOCIATED is set, it's identical to L.
33728 + * On RT we need the extra protection via rt_lock_idle_list() for
33729 + * the list manipulations against read access from
33730 + * wq_worker_sleeping(). All other places are nicely serialized via
33733 * A: pool->attach_mutex protected.
33735 * PL: wq_pool_mutex protected.
33737 - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
33738 + * PR: wq_pool_mutex protected for writes. RCU protected for reads.
33740 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
33742 @@ -136,7 +143,7 @@
33744 * WQ: wq->mutex protected.
33746 - * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
33747 + * WR: wq->mutex protected for writes. RCU protected for reads.
33749 * MD: wq_mayday_lock protected.
33751 @@ -186,7 +193,7 @@
33752 atomic_t nr_running ____cacheline_aligned_in_smp;
33755 - * Destruction of pool is sched-RCU protected to allow dereferences
33756 + * Destruction of pool is RCU protected to allow dereferences
33757 * from get_work_pool().
33759 struct rcu_head rcu;
33760 @@ -215,7 +222,7 @@
33762 * Release of unbound pwq is punted to system_wq. See put_pwq()
33763 * and pwq_unbound_release_workfn() for details. pool_workqueue
33764 - * itself is also sched-RCU protected so that the first pwq can be
33765 + * itself is also RCU protected so that the first pwq can be
33766 * determined without grabbing wq->mutex.
33768 struct work_struct unbound_release_work;
33769 @@ -352,6 +359,8 @@
33770 struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
33771 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
33773 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
33775 static int worker_thread(void *__worker);
33776 static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
33778 @@ -359,20 +368,20 @@
33779 #include <trace/events/workqueue.h>
33781 #define assert_rcu_or_pool_mutex() \
33782 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
33783 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
33784 !lockdep_is_held(&wq_pool_mutex), \
33785 - "sched RCU or wq_pool_mutex should be held")
33786 + "RCU or wq_pool_mutex should be held")
33788 #define assert_rcu_or_wq_mutex(wq) \
33789 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
33790 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
33791 !lockdep_is_held(&wq->mutex), \
33792 - "sched RCU or wq->mutex should be held")
33793 + "RCU or wq->mutex should be held")
33795 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
33796 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
33797 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
33798 !lockdep_is_held(&wq->mutex) && \
33799 !lockdep_is_held(&wq_pool_mutex), \
33800 - "sched RCU, wq->mutex or wq_pool_mutex should be held")
33801 + "RCU, wq->mutex or wq_pool_mutex should be held")
33803 #define for_each_cpu_worker_pool(pool, cpu) \
33804 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
33805 @@ -384,7 +393,7 @@
33806 * @pool: iteration cursor
33807 * @pi: integer used for iteration
33809 - * This must be called either with wq_pool_mutex held or sched RCU read
33810 + * This must be called either with wq_pool_mutex held or RCU read
33811 * locked. If the pool needs to be used beyond the locking in effect, the
33812 * caller is responsible for guaranteeing that the pool stays online.
33814 @@ -416,7 +425,7 @@
33815 * @pwq: iteration cursor
33816 * @wq: the target workqueue
33818 - * This must be called either with wq->mutex held or sched RCU read locked.
33819 + * This must be called either with wq->mutex held or RCU read locked.
33820 * If the pwq needs to be used beyond the locking in effect, the caller is
33821 * responsible for guaranteeing that the pwq stays online.
33823 @@ -428,6 +437,31 @@
33824 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
33827 +#ifdef CONFIG_PREEMPT_RT_BASE
33828 +static inline void rt_lock_idle_list(struct worker_pool *pool)
33830 + preempt_disable();
33832 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
33834 + preempt_enable();
33836 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
33837 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
33838 +#else
33839 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
33840 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
33841 +static inline void sched_lock_idle_list(struct worker_pool *pool)
33843 + spin_lock_irq(&pool->lock);
33845 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
33847 + spin_unlock_irq(&pool->lock);
33852 #ifdef CONFIG_DEBUG_OBJECTS_WORK
33854 static struct debug_obj_descr work_debug_descr;
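To make the intent of the helper pairs above explicit, here is a compact sketch of how writer and RT reader are expected to use them, assuming the CONFIG_PREEMPT_RT_BASE variants defined in this hunk; the function names are illustrative, the real users in the patch are worker_enter_idle() and wake_up_worker(). The writer manipulates pool->idle_list with pool->lock held and additionally wraps the list operation in rt_lock_idle_list(), while the RT reader only needs the preempt-disabled section to look at the first entry safely.

/* Writer side: called with pool->lock held, interrupts off. */
static void demo_enter_idle(struct worker_pool *pool, struct worker *worker)
{
	rt_lock_idle_list(pool);        /* preempt_disable() on RT */
	list_add(&worker->entry, &pool->idle_list);
	rt_unlock_idle_list(pool);
}

/* Reader side: RT wakeup path, pool->lock not taken. */
static void demo_kick_first_idle(struct worker_pool *pool)
{
	struct worker *first;

	rt_lock_idle_list(pool);
	first = list_first_entry_or_null(&pool->idle_list,
					 struct worker, entry);
	if (first)
		wake_up_process(first->task);
	rt_unlock_idle_list(pool);
}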
33855 @@ -552,7 +586,7 @@
33856 * @wq: the target workqueue
33857 * @node: the node ID
33859 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
33860 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
33862 * If the pwq needs to be used beyond the locking in effect, the caller is
33863 * responsible for guaranteeing that the pwq stays online.
33864 @@ -696,8 +730,8 @@
33865 * @work: the work item of interest
33867 * Pools are created and destroyed under wq_pool_mutex, and allows read
33868 - * access under sched-RCU read lock. As such, this function should be
33869 - * called under wq_pool_mutex or with preemption disabled.
33870 + * access under RCU read lock. As such, this function should be
33871 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
33873 * All fields of the returned pool are accessible as long as the above
33874 * mentioned locking is in effect. If the returned pool needs to be used
33875 @@ -834,50 +868,45 @@
33877 static void wake_up_worker(struct worker_pool *pool)
33879 - struct worker *worker = first_idle_worker(pool);
33880 + struct worker *worker;
33882 + rt_lock_idle_list(pool);
33884 + worker = first_idle_worker(pool);
33886 if (likely(worker))
33887 wake_up_process(worker->task);
33889 + rt_unlock_idle_list(pool);
33893 - * wq_worker_waking_up - a worker is waking up
33894 + * wq_worker_running - a worker is running again
33895 * @task: task waking up
33896 - * @cpu: CPU @task is waking up to
33898 - * This function is called during try_to_wake_up() when a worker is
33902 - * spin_lock_irq(rq->lock)
33903 + * This function is called when a worker returns from schedule()
33905 -void wq_worker_waking_up(struct task_struct *task, int cpu)
33906 +void wq_worker_running(struct task_struct *task)
33908 struct worker *worker = kthread_data(task);
33910 - if (!(worker->flags & WORKER_NOT_RUNNING)) {
33911 - WARN_ON_ONCE(worker->pool->cpu != cpu);
33912 + if (!worker->sleeping)
33913 + return;
33914 + if (!(worker->flags & WORKER_NOT_RUNNING))
33915 atomic_inc(&worker->pool->nr_running);
33917 + worker->sleeping = 0;
33921 * wq_worker_sleeping - a worker is going to sleep
33922 * @task: task going to sleep
33924 - * This function is called during schedule() when a busy worker is
33925 - * going to sleep. Worker on the same cpu can be woken up by
33926 - * returning pointer to its task.
33929 - * spin_lock_irq(rq->lock)
33932 - * Worker task on @cpu to wake up, %NULL if none.
33933 + * This function is called from schedule() when a busy worker is
33934 + * going to sleep.
33936 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
33937 +void wq_worker_sleeping(struct task_struct *task)
33939 - struct worker *worker = kthread_data(task), *to_wakeup = NULL;
33940 + struct worker *worker = kthread_data(task);
33941 struct worker_pool *pool;
33944 @@ -886,29 +915,26 @@
33945 * checking NOT_RUNNING.
33947 if (worker->flags & WORKER_NOT_RUNNING)
33951 pool = worker->pool;
33953 - /* this can only happen on the local cpu */
33954 - if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
33956 + if (WARN_ON_ONCE(worker->sleeping))
33957 + return;
33959 + worker->sleeping = 1;
33962 * The counterpart of the following dec_and_test, implied mb,
33963 * worklist not empty test sequence is in insert_work().
33964 * Please read comment there.
33966 - * NOT_RUNNING is clear. This means that we're bound to and
33967 - * running on the local cpu w/ rq lock held and preemption
33968 - * disabled, which in turn means that none else could be
33969 - * manipulating idle_list, so dereferencing idle_list without pool
33972 if (atomic_dec_and_test(&pool->nr_running) &&
33973 - !list_empty(&pool->worklist))
33974 - to_wakeup = first_idle_worker(pool);
33975 - return to_wakeup ? to_wakeup->task : NULL;
33976 + !list_empty(&pool->worklist)) {
33977 + sched_lock_idle_list(pool);
33978 + wake_up_worker(pool);
33979 + sched_unlock_idle_list(pool);
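Pieced together from the fragments of this hunk, the two reworked hooks boil down to the following condensed view (not standalone code, just the hunk's logic gathered in one place): worker->sleeping makes the pair idempotent, so a spurious extra call to either hook is harmless, and nr_running is only adjusted once per sleep/wake cycle.

void wq_worker_sleeping(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);

	if (worker->flags & WORKER_NOT_RUNNING)
		return;
	if (WARN_ON_ONCE(worker->sleeping))
		return;
	worker->sleeping = 1;

	if (atomic_dec_and_test(&worker->pool->nr_running) &&
	    !list_empty(&worker->pool->worklist)) {
		/* Last runnable worker went to sleep: kick an idle one. */
		sched_lock_idle_list(worker->pool);
		wake_up_worker(worker->pool);
		sched_unlock_idle_list(worker->pool);
	}
}

void wq_worker_running(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);

	if (!worker->sleeping)
		return;
	if (!(worker->flags & WORKER_NOT_RUNNING))
		atomic_inc(&worker->pool->nr_running);
	worker->sleeping = 0;
}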
33984 @@ -1102,12 +1128,14 @@
33988 - * As both pwqs and pools are sched-RCU protected, the
33989 + * As both pwqs and pools are RCU protected, the
33990 * following lock operations are safe.
33992 - spin_lock_irq(&pwq->pool->lock);
33994 + local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
33996 - spin_unlock_irq(&pwq->pool->lock);
33997 + local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
33998 + rcu_read_unlock();
34002 @@ -1211,7 +1239,7 @@
34003 struct worker_pool *pool;
34004 struct pool_workqueue *pwq;
34006 - local_irq_save(*flags);
34007 + local_lock_irqsave(pendingb_lock, *flags);
34009 /* try to steal the timer if it exists */
34011 @@ -1230,6 +1258,7 @@
34012 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
34017 * The queueing is in progress, or it is already queued. Try to
34018 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
34019 @@ -1268,14 +1297,16 @@
34020 set_work_pool_and_keep_pending(work, pool->id);
34022 spin_unlock(&pool->lock);
34023 + rcu_read_unlock();
34026 spin_unlock(&pool->lock);
34028 - local_irq_restore(*flags);
34029 + rcu_read_unlock();
34030 + local_unlock_irqrestore(pendingb_lock, *flags);
34031 if (work_is_canceling(work))
34038 @@ -1377,7 +1408,7 @@
34039 * queued or lose PENDING. Grabbing PENDING and queueing should
34040 * happen with IRQ disabled.
34042 - WARN_ON_ONCE(!irqs_disabled());
34043 + WARN_ON_ONCE_NONRT(!irqs_disabled());
34045 debug_work_activate(work);
34047 @@ -1385,6 +1416,7 @@
34048 if (unlikely(wq->flags & __WQ_DRAINING) &&
34049 WARN_ON_ONCE(!is_chained_work(wq)))
34053 if (req_cpu == WORK_CPU_UNBOUND)
34054 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
34055 @@ -1441,10 +1473,8 @@
34056 /* pwq determined, queue */
34057 trace_workqueue_queue_work(req_cpu, pwq, work);
34059 - if (WARN_ON(!list_empty(&work->entry))) {
34060 - spin_unlock(&pwq->pool->lock);
34063 + if (WARN_ON(!list_empty(&work->entry)))
34066 pwq->nr_in_flight[pwq->work_color]++;
34067 work_flags = work_color_to_flags(pwq->work_color);
34068 @@ -1462,7 +1492,9 @@
34070 insert_work(pwq, work, worklist, work_flags);
34073 spin_unlock(&pwq->pool->lock);
34074 + rcu_read_unlock();
34078 @@ -1482,14 +1514,14 @@
34080 unsigned long flags;
34082 - local_irq_save(flags);
34083 + local_lock_irqsave(pendingb_lock,flags);
34085 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
34086 __queue_work(cpu, wq, work);
34090 - local_irq_restore(flags);
34091 + local_unlock_irqrestore(pendingb_lock, flags);
34094 EXPORT_SYMBOL(queue_work_on);
34095 @@ -1498,8 +1530,11 @@
34097 struct delayed_work *dwork = (struct delayed_work *)__data;
34100 + /* local_lock(pendingb_lock); */
34101 /* should have been called from irqsafe timer with irq already off */
34102 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
34103 + /* local_unlock(pendingb_lock); */
34105 EXPORT_SYMBOL(delayed_work_timer_fn);
34107 @@ -1555,14 +1590,14 @@
34108 unsigned long flags;
34110 /* read the comment in __queue_work() */
34111 - local_irq_save(flags);
34112 + local_lock_irqsave(pendingb_lock, flags);
34114 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
34115 __queue_delayed_work(cpu, wq, dwork, delay);
34119 - local_irq_restore(flags);
34120 + local_unlock_irqrestore(pendingb_lock, flags);
34123 EXPORT_SYMBOL(queue_delayed_work_on);
34124 @@ -1597,7 +1632,7 @@
34126 if (likely(ret >= 0)) {
34127 __queue_delayed_work(cpu, wq, dwork, delay);
34128 - local_irq_restore(flags);
34129 + local_unlock_irqrestore(pendingb_lock, flags);
34132 /* -ENOENT from try_to_grab_pending() becomes %true */
34133 @@ -1630,7 +1665,9 @@
34134 worker->last_active = jiffies;
34136 /* idle_list is LIFO */
34137 + rt_lock_idle_list(pool);
34138 list_add(&worker->entry, &pool->idle_list);
34139 + rt_unlock_idle_list(pool);
34141 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
34142 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
34143 @@ -1663,7 +1700,9 @@
34145 worker_clr_flags(worker, WORKER_IDLE);
34147 + rt_lock_idle_list(pool);
34148 list_del_init(&worker->entry);
34149 + rt_unlock_idle_list(pool);
34152 static struct worker *alloc_worker(int node)
34153 @@ -1829,7 +1868,9 @@
34154 pool->nr_workers--;
34157 + rt_lock_idle_list(pool);
34158 list_del_init(&worker->entry);
34159 + rt_unlock_idle_list(pool);
34160 worker->flags |= WORKER_DIE;
34161 wake_up_process(worker->task);
34163 @@ -2815,14 +2856,14 @@
34167 - local_irq_disable();
34169 pool = get_work_pool(work);
34171 - local_irq_enable();
34172 + rcu_read_unlock();
34176 - spin_lock(&pool->lock);
34177 + spin_lock_irq(&pool->lock);
34178 /* see the comment in try_to_grab_pending() with the same code */
34179 pwq = get_work_pwq(work);
34181 @@ -2853,10 +2894,11 @@
34182 lock_map_acquire(&pwq->wq->lockdep_map);
34183 lock_map_release(&pwq->wq->lockdep_map);
34186 + rcu_read_unlock();
34189 spin_unlock_irq(&pool->lock);
34190 + rcu_read_unlock();
34194 @@ -2946,7 +2988,7 @@
34196 /* tell other tasks trying to grab @work to back off */
34197 mark_work_canceling(work);
34198 - local_irq_restore(flags);
34199 + local_unlock_irqrestore(pendingb_lock, flags);
34202 * This allows canceling during early boot. We know that @work
34203 @@ -3007,10 +3049,10 @@
34205 bool flush_delayed_work(struct delayed_work *dwork)
34207 - local_irq_disable();
34208 + local_lock_irq(pendingb_lock);
34209 if (del_timer_sync(&dwork->timer))
34210 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
34211 - local_irq_enable();
34212 + local_unlock_irq(pendingb_lock);
34213 return flush_work(&dwork->work);
34215 EXPORT_SYMBOL(flush_delayed_work);
34216 @@ -3028,7 +3070,7 @@
34219 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
34220 - local_irq_restore(flags);
34221 + local_unlock_irqrestore(pendingb_lock, flags);
34225 @@ -3284,7 +3326,7 @@
34226 * put_unbound_pool - put a worker_pool
34227 * @pool: worker_pool to put
34229 - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
34230 + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
34231 * safe manner. get_unbound_pool() calls this function on its failure path
34232 * and this function should be able to release pools which went through,
34233 * successfully or not, init_worker_pool().
34234 @@ -3338,8 +3380,8 @@
34235 del_timer_sync(&pool->idle_timer);
34236 del_timer_sync(&pool->mayday_timer);
34238 - /* sched-RCU protected to allow dereferences from get_work_pool() */
34239 - call_rcu_sched(&pool->rcu, rcu_free_pool);
34240 + /* RCU protected to allow dereferences from get_work_pool() */
34241 + call_rcu(&pool->rcu, rcu_free_pool);
34245 @@ -3446,14 +3488,14 @@
34246 put_unbound_pool(pool);
34247 mutex_unlock(&wq_pool_mutex);
34249 - call_rcu_sched(&pwq->rcu, rcu_free_pwq);
34250 + call_rcu(&pwq->rcu, rcu_free_pwq);
34253 * If we're the last pwq going away, @wq is already dead and no one
34254 * is gonna access it anymore. Schedule RCU free.
34257 - call_rcu_sched(&wq->rcu, rcu_free_wq);
34258 + call_rcu(&wq->rcu, rcu_free_wq);
34262 @@ -4128,7 +4170,7 @@
34263 * The base ref is never dropped on per-cpu pwqs. Directly
34264 * schedule RCU free.
34266 - call_rcu_sched(&wq->rcu, rcu_free_wq);
34267 + call_rcu(&wq->rcu, rcu_free_wq);
34270 * We're the sole accessor of @wq at this point. Directly
34271 @@ -4238,7 +4280,8 @@
34272 struct pool_workqueue *pwq;
34275 - rcu_read_lock_sched();
34276 + rcu_read_lock();
34277 + preempt_disable();
34279 if (cpu == WORK_CPU_UNBOUND)
34280 cpu = smp_processor_id();
34281 @@ -4249,7 +4292,8 @@
34282 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
34284 ret = !list_empty(&pwq->delayed_works);
34285 - rcu_read_unlock_sched();
34286 + preempt_enable();
34287 + rcu_read_unlock();
34291 @@ -4275,15 +4319,15 @@
34292 if (work_pending(work))
34293 ret |= WORK_BUSY_PENDING;
34295 - local_irq_save(flags);
34297 pool = get_work_pool(work);
34299 - spin_lock(&pool->lock);
34300 + spin_lock_irqsave(&pool->lock, flags);
34301 if (find_worker_executing_work(pool, work))
34302 ret |= WORK_BUSY_RUNNING;
34303 - spin_unlock(&pool->lock);
34304 + spin_unlock_irqrestore(&pool->lock, flags);
34306 - local_irq_restore(flags);
34307 + rcu_read_unlock();
34311 @@ -4472,7 +4516,7 @@
34312 unsigned long flags;
34315 - rcu_read_lock_sched();
34318 pr_info("Showing busy workqueues and worker pools:\n");
34320 @@ -4537,7 +4581,7 @@
34321 touch_nmi_watchdog();
34324 - rcu_read_unlock_sched();
34325 + rcu_read_unlock();
34329 @@ -4898,16 +4942,16 @@
34330 * nr_active is monotonically decreasing. It's safe
34331 * to peek without lock.
34333 - rcu_read_lock_sched();
34335 for_each_pwq(pwq, wq) {
34336 WARN_ON_ONCE(pwq->nr_active < 0);
34337 if (pwq->nr_active) {
34339 - rcu_read_unlock_sched();
34340 + rcu_read_unlock();
34344 - rcu_read_unlock_sched();
34345 + rcu_read_unlock();
34348 mutex_unlock(&wq_pool_mutex);
34349 @@ -5097,7 +5141,8 @@
34350 const char *delim = "";
34351 int node, written = 0;
34353 - rcu_read_lock_sched();
34354 + get_online_cpus();
34356 for_each_node(node) {
34357 written += scnprintf(buf + written, PAGE_SIZE - written,
34358 "%s%d:%d", delim, node,
34359 @@ -5105,7 +5150,8 @@
34362 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
34363 - rcu_read_unlock_sched();
34364 + rcu_read_unlock();
34365 + put_online_cpus();
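The recurring conversion throughout this file is local_irq_save()/restore() around the WORK_STRUCT_PENDING protocol becoming a local IRQ lock. pendingb_lock is defined with DEFINE_LOCAL_IRQ_LOCK() from <linux/locallock.h>, which this series adds: on !RT the local_lock_irqsave()/local_unlock_irqrestore() pair compiles down to plain local_irq_save()/restore(), on RT it becomes a per-CPU sleeping lock so the section stays preemptible. A minimal, self-contained user of the same primitive, with illustrative names (it protects a dummy per-CPU counter rather than the workqueue internals):

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(demo_lock);
static DEFINE_PER_CPU(unsigned long, demo_counter);

static void demo_account_event(void)
{
	unsigned long flags;

	/* !RT: plain local_irq_save(); RT: per-CPU lock, still preemptible. */
	local_lock_irqsave(demo_lock, flags);
	this_cpu_inc(demo_counter);
	local_unlock_irqrestore(demo_lock, flags);
}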
34369 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/workqueue_internal.h linux-4.14/kernel/workqueue_internal.h
34370 --- linux-4.14.orig/kernel/workqueue_internal.h 2017-11-12 19:46:13.000000000 +0100
34371 +++ linux-4.14/kernel/workqueue_internal.h 2018-09-05 11:05:07.000000000 +0200
34373 unsigned long last_active; /* L: last active timestamp */
34374 unsigned int flags; /* X: flags */
34375 int id; /* I: worker id */
34376 + int sleeping; /* None */
34379 * Opaque string set with work_set_desc(). Printed out with task
34381 * Scheduler hooks for concurrency managed workqueue. Only to be used from
34382 * sched/core.c and workqueue.c.
34384 -void wq_worker_waking_up(struct task_struct *task, int cpu);
34385 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
34386 +void wq_worker_running(struct task_struct *task);
34387 +void wq_worker_sleeping(struct task_struct *task);
34389 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
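For orientation, the two hooks declared above are paired around the actual call to schedule(). The sketch below is only loosely modelled on the sched/core.c changes made elsewhere in this series; the helper name is illustrative and the real call sites live inside the scheduler, not in driver code.

#include <linux/sched.h>
#include "workqueue_internal.h"

static void demo_block_in_worker(void)
{
	/* Tell the workqueue code this worker is about to sleep ... */
	if (current->flags & PF_WQ_WORKER)
		wq_worker_sleeping(current);

	schedule();

	/* ... and that it is running again once schedule() returns. */
	if (current->flags & PF_WQ_WORKER)
		wq_worker_running(current);
}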
34390 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/debugobjects.c linux-4.14/lib/debugobjects.c
34391 --- linux-4.14.orig/lib/debugobjects.c 2017-11-12 19:46:13.000000000 +0100
34392 +++ linux-4.14/lib/debugobjects.c 2018-09-05 11:05:07.000000000 +0200
34393 @@ -336,7 +336,10 @@
34394 struct debug_obj *obj;
34395 unsigned long flags;
34398 +#ifdef CONFIG_PREEMPT_RT_FULL
34399 + if (preempt_count() == 0 && !irqs_disabled())
34400 +#endif
34401 + fill_pool();
34403 db = get_bucket((unsigned long) addr);
34405 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/irq_poll.c linux-4.14/lib/irq_poll.c
34406 --- linux-4.14.orig/lib/irq_poll.c 2017-11-12 19:46:13.000000000 +0100
34407 +++ linux-4.14/lib/irq_poll.c 2018-09-05 11:05:07.000000000 +0200
34409 list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
34410 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
34411 local_irq_restore(flags);
34412 + preempt_check_resched_rt();
34414 EXPORT_SYMBOL(irq_poll_sched);
34417 local_irq_save(flags);
34418 __irq_poll_complete(iop);
34419 local_irq_restore(flags);
34420 + preempt_check_resched_rt();
34422 EXPORT_SYMBOL(irq_poll_complete);
34427 local_irq_enable();
34428 + preempt_check_resched_rt();
34430 /* Even though interrupts have been re-enabled, this
34431 * access is safe because interrupts can only add new
34432 @@ -133,6 +136,7 @@
34433 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
34435 local_irq_enable();
34436 + preempt_check_resched_rt();
34440 @@ -196,6 +200,7 @@
34441 this_cpu_ptr(&blk_cpu_iopoll));
34442 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
34443 local_irq_enable();
34444 + preempt_check_resched_rt();
34448 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/Kconfig linux-4.14/lib/Kconfig
34449 --- linux-4.14.orig/lib/Kconfig 2017-11-12 19:46:13.000000000 +0100
34450 +++ linux-4.14/lib/Kconfig 2018-09-05 11:05:07.000000000 +0200
34451 @@ -428,6 +428,7 @@
34453 config CPUMASK_OFFSTACK
34454 bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
34455 + depends on !PREEMPT_RT_FULL
34457 Use dynamic allocation for cpumask_var_t, instead of putting
34458 them on the stack. This is a bit more expensive, but avoids
34459 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/Kconfig.debug linux-4.14/lib/Kconfig.debug
34460 --- linux-4.14.orig/lib/Kconfig.debug 2018-09-05 11:03:22.000000000 +0200
34461 +++ linux-4.14/lib/Kconfig.debug 2018-09-05 11:05:07.000000000 +0200
34462 @@ -1197,7 +1197,7 @@
34464 config DEBUG_LOCKING_API_SELFTESTS
34465 bool "Locking API boot-time self-tests"
34466 - depends on DEBUG_KERNEL
34467 + depends on DEBUG_KERNEL && !PREEMPT_RT_FULL
34469 Say Y here if you want the kernel to run a short self-test during
34470 bootup. The self-test checks whether common types of locking bugs
34471 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/locking-selftest.c linux-4.14/lib/locking-selftest.c
34472 --- linux-4.14.orig/lib/locking-selftest.c 2017-11-12 19:46:13.000000000 +0100
34473 +++ linux-4.14/lib/locking-selftest.c 2018-09-05 11:05:07.000000000 +0200
34474 @@ -742,6 +742,8 @@
34475 #include "locking-selftest-spin-hardirq.h"
34476 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
34478 +#ifndef CONFIG_PREEMPT_RT_FULL
34480 #include "locking-selftest-rlock-hardirq.h"
34481 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
34483 @@ -757,9 +759,12 @@
34484 #include "locking-selftest-wlock-softirq.h"
34485 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
34492 +#ifndef CONFIG_PREEMPT_RT_FULL
34494 * Enabling hardirqs with a softirq-safe lock held:
34496 @@ -792,6 +797,8 @@
34503 * Enabling irqs with an irq-safe lock held:
34505 @@ -815,6 +822,8 @@
34506 #include "locking-selftest-spin-hardirq.h"
34507 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
34509 +#ifndef CONFIG_PREEMPT_RT_FULL
34511 #include "locking-selftest-rlock-hardirq.h"
34512 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
34514 @@ -830,6 +839,8 @@
34515 #include "locking-selftest-wlock-softirq.h"
34516 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
34523 @@ -861,6 +872,8 @@
34524 #include "locking-selftest-spin-hardirq.h"
34525 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
34527 +#ifndef CONFIG_PREEMPT_RT_FULL
34529 #include "locking-selftest-rlock-hardirq.h"
34530 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
34532 @@ -876,6 +889,8 @@
34533 #include "locking-selftest-wlock-softirq.h"
34534 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
34541 @@ -909,6 +924,8 @@
34542 #include "locking-selftest-spin-hardirq.h"
34543 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
34545 +#ifndef CONFIG_PREEMPT_RT_FULL
34547 #include "locking-selftest-rlock-hardirq.h"
34548 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
34550 @@ -924,10 +941,14 @@
34551 #include "locking-selftest-wlock-softirq.h"
34552 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
34560 +#ifndef CONFIG_PREEMPT_RT_FULL
34563 * read-lock / write-lock irq inversion.
34565 @@ -990,6 +1011,10 @@
34571 +#ifndef CONFIG_PREEMPT_RT_FULL
34574 * read-lock / write-lock recursion that is actually safe.
34576 @@ -1028,6 +1053,8 @@
34583 * read-lock / write-lock recursion that is unsafe.
34585 @@ -2057,6 +2084,7 @@
34587 printk(" --------------------------------------------------------------------------\n");
34589 +#ifndef CONFIG_PREEMPT_RT_FULL
34591 * irq-context testcases:
34593 @@ -2069,6 +2097,28 @@
34595 DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
34596 // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
34598 + /* On -rt, we only do hardirq context test for raw spinlock */
34599 + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
34600 + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
34602 + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
34603 + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
34605 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
34606 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
34607 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
34608 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
34609 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
34610 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
34612 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
34613 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
34614 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
34615 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
34616 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
34617 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
34622 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/percpu_ida.c linux-4.14/lib/percpu_ida.c
34623 --- linux-4.14.orig/lib/percpu_ida.c 2017-11-12 19:46:13.000000000 +0100
34624 +++ linux-4.14/lib/percpu_ida.c 2018-09-05 11:05:07.000000000 +0200
34626 #include <linux/string.h>
34627 #include <linux/spinlock.h>
34628 #include <linux/percpu_ida.h>
34629 +#include <linux/locallock.h>
34631 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
34633 struct percpu_ida_cpu {
34635 @@ -149,13 +152,13 @@
34636 unsigned long flags;
34639 - local_irq_save(flags);
34640 + local_lock_irqsave(irq_off_lock, flags);
34641 tags = this_cpu_ptr(pool->tag_cpu);
34644 tag = alloc_local_tag(tags);
34645 if (likely(tag >= 0)) {
34646 - local_irq_restore(flags);
34647 + local_unlock_irqrestore(irq_off_lock, flags);
34651 @@ -174,6 +177,7 @@
34653 if (!tags->nr_free)
34654 alloc_global_tags(pool, tags);
34656 if (!tags->nr_free)
34657 steal_tags(pool, tags);
34659 @@ -185,7 +189,7 @@
34662 spin_unlock(&pool->lock);
34663 - local_irq_restore(flags);
34664 + local_unlock_irqrestore(irq_off_lock, flags);
34666 if (tag >= 0 || state == TASK_RUNNING)
34668 @@ -197,7 +201,7 @@
34672 - local_irq_save(flags);
34673 + local_lock_irqsave(irq_off_lock, flags);
34674 tags = this_cpu_ptr(pool->tag_cpu);
34676 if (state != TASK_RUNNING)
34677 @@ -222,7 +226,7 @@
34679 BUG_ON(tag >= pool->nr_tags);
34681 - local_irq_save(flags);
34682 + local_lock_irqsave(irq_off_lock, flags);
34683 tags = this_cpu_ptr(pool->tag_cpu);
34685 spin_lock(&tags->lock);
34686 @@ -254,7 +258,7 @@
34687 spin_unlock(&pool->lock);
34690 - local_irq_restore(flags);
34691 + local_unlock_irqrestore(irq_off_lock, flags);
34693 EXPORT_SYMBOL_GPL(percpu_ida_free);
34695 @@ -346,7 +350,7 @@
34696 struct percpu_ida_cpu *remote;
34697 unsigned cpu, i, err = 0;
34699 - local_irq_save(flags);
34700 + local_lock_irqsave(irq_off_lock, flags);
34701 for_each_possible_cpu(cpu) {
34702 remote = per_cpu_ptr(pool->tag_cpu, cpu);
34703 spin_lock(&remote->lock);
34704 @@ -368,7 +372,7 @@
34706 spin_unlock(&pool->lock);
34708 - local_irq_restore(flags);
34709 + local_unlock_irqrestore(irq_off_lock, flags);
34712 EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
34713 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/radix-tree.c linux-4.14/lib/radix-tree.c
34714 --- linux-4.14.orig/lib/radix-tree.c 2018-09-05 11:03:25.000000000 +0200
34715 +++ linux-4.14/lib/radix-tree.c 2018-09-05 11:05:07.000000000 +0200
34717 #include <linux/rcupdate.h>
34718 #include <linux/slab.h>
34719 #include <linux/string.h>
34721 +#include <linux/locallock.h>
34723 /* Number of nodes in fully populated tree of given height */
34724 static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
34726 struct radix_tree_node *nodes;
34728 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
34729 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
34731 static inline struct radix_tree_node *entry_to_node(void *ptr)
34733 @@ -404,12 +405,13 @@
34734 * succeed in getting a node here (and never reach
34735 * kmem_cache_alloc)
34737 - rtp = this_cpu_ptr(&radix_tree_preloads);
34738 + rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
34741 rtp->nodes = ret->parent;
34744 + put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
34746 * Update the allocation stack trace as this is more useful
34748 @@ -475,14 +477,14 @@
34750 gfp_mask &= ~__GFP_ACCOUNT;
34752 - preempt_disable();
34753 + local_lock(radix_tree_preloads_lock);
34754 rtp = this_cpu_ptr(&radix_tree_preloads);
34755 while (rtp->nr < nr) {
34756 - preempt_enable();
34757 + local_unlock(radix_tree_preloads_lock);
34758 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
34761 - preempt_disable();
34762 + local_lock(radix_tree_preloads_lock);
34763 rtp = this_cpu_ptr(&radix_tree_preloads);
34764 if (rtp->nr < nr) {
34765 node->parent = rtp->nodes;
34766 @@ -524,7 +526,7 @@
34767 if (gfpflags_allow_blocking(gfp_mask))
34768 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
34769 /* Preloading doesn't help anything with this gfp mask, skip it */
34770 - preempt_disable();
34771 + local_lock(radix_tree_preloads_lock);
34774 EXPORT_SYMBOL(radix_tree_maybe_preload);
34775 @@ -562,7 +564,7 @@
34777 /* Preloading doesn't help anything with this gfp mask, skip it */
34778 if (!gfpflags_allow_blocking(gfp_mask)) {
34779 - preempt_disable();
34780 + local_lock(radix_tree_preloads_lock);
34784 @@ -596,6 +598,12 @@
34785 return __radix_tree_preload(gfp_mask, nr_nodes);
34788 +void radix_tree_preload_end(void)
34790 + local_unlock(radix_tree_preloads_lock);
34792 +EXPORT_SYMBOL(radix_tree_preload_end);
34794 static unsigned radix_tree_load_root(const struct radix_tree_root *root,
34795 struct radix_tree_node **nodep, unsigned long *maxindex)
34797 @@ -2105,10 +2113,16 @@
34798 void idr_preload(gfp_t gfp_mask)
34800 if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
34801 - preempt_disable();
34802 + local_lock(radix_tree_preloads_lock);
34804 EXPORT_SYMBOL(idr_preload);
34806 +void idr_preload_end(void)
34808 + local_unlock(radix_tree_preloads_lock);
34810 +EXPORT_SYMBOL(idr_preload_end);
34813 * ida_pre_get - reserve resources for ida allocation
34815 @@ -2125,7 +2139,7 @@
34816 * to return to the ida_pre_get() step.
34818 if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
34819 - preempt_enable();
34820 + local_unlock(radix_tree_preloads_lock);
34822 if (!this_cpu_read(ida_bitmap)) {
34823 struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
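Since radix_tree_preload_end() and idr_preload_end() turn from static inlines into exported functions here (they now release the new preload local lock instead of calling preempt_enable()), a quick reminder of how the preload API pairs up may help. The tree root, lock and helper below are illustrative.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

static RADIX_TREE(demo_tree, GFP_ATOMIC);
static DEFINE_SPINLOCK(demo_tree_lock);

static int demo_store(unsigned long index, void *item)
{
	int err;

	/* May sleep; fills the per-CPU node pool and pins it for this CPU. */
	err = radix_tree_preload(GFP_KERNEL);
	if (err)
		return err;

	spin_lock(&demo_tree_lock);
	err = radix_tree_insert(&demo_tree, index, item);
	spin_unlock(&demo_tree_lock);

	/* Used to be preempt_enable(); now drops the preload local lock. */
	radix_tree_preload_end();
	return err;
}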
34824 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/scatterlist.c linux-4.14/lib/scatterlist.c
34825 --- linux-4.14.orig/lib/scatterlist.c 2017-11-12 19:46:13.000000000 +0100
34826 +++ linux-4.14/lib/scatterlist.c 2018-09-05 11:05:07.000000000 +0200
34827 @@ -620,7 +620,7 @@
34828 flush_kernel_dcache_page(miter->page);
34830 if (miter->__flags & SG_MITER_ATOMIC) {
34831 - WARN_ON_ONCE(preemptible());
34832 + WARN_ON_ONCE(!pagefault_disabled());
34833 kunmap_atomic(miter->addr);
34835 kunmap(miter->page);
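The WARN_ON_ONCE change above reflects that kmap_atomic() on RT disables page faults (and migration) but no longer preemption, so preemptible() is not the right assertion any more. A small sketch of an SG_MITER_ATOMIC user, which is exactly the case the check guards; the copy helper and buffer names are illustrative.

#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/string.h>

static size_t demo_copy_from_sg(struct scatterlist *sgl, unsigned int nents,
				void *buf, size_t len)
{
	struct sg_mapping_iter miter;
	size_t copied = 0;

	/* SG_MITER_ATOMIC maps each page with kmap_atomic() under the hood. */
	sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC | SG_MITER_FROM_SG);
	while (copied < len && sg_miter_next(&miter)) {
		size_t n = min_t(size_t, miter.length, len - copied);

		memcpy(buf + copied, miter.addr, n);
		copied += n;
	}
	sg_miter_stop(&miter);
	return copied;
}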
34836 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/smp_processor_id.c linux-4.14/lib/smp_processor_id.c
34837 --- linux-4.14.orig/lib/smp_processor_id.c 2017-11-12 19:46:13.000000000 +0100
34838 +++ linux-4.14/lib/smp_processor_id.c 2018-09-05 11:05:07.000000000 +0200
34840 * Kernel threads bound to a single CPU can safely use
34841 * smp_processor_id():
34843 - if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
34844 + if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
34848 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/timerqueue.c linux-4.14/lib/timerqueue.c
34849 --- linux-4.14.orig/lib/timerqueue.c 2017-11-12 19:46:13.000000000 +0100
34850 +++ linux-4.14/lib/timerqueue.c 2018-09-05 11:05:07.000000000 +0200
34852 * @head: head of timerqueue
34853 * @node: timer node to be added
34855 - * Adds the timer node to the timerqueue, sorted by the
34856 - * node's expires value.
34857 + * Adds the timer node to the timerqueue, sorted by the node's expires
34858 + * value. Returns true if the newly added timer is the first expiring timer in
34859 + * the queue.
34861 bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
34864 * @head: head of timerqueue
34865 * @node: timer node to be removed
34867 - * Removes the timer node from the timerqueue.
34868 + * Removes the timer node from the timerqueue. Returns true if the queue is
34869 + * not empty after the remove.
34871 bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
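The comment updates above document the boolean return values; in practice callers use them to decide whether the underlying hardware timer has to be reprogrammed. A tiny illustrative caller (queue, node and reprogram callback are placeholders):

#include <linux/timerqueue.h>
#include <linux/ktime.h>

static struct timerqueue_head demo_queue;

static void demo_enqueue(struct timerqueue_node *node, ktime_t expires,
			 void (*reprogram)(ktime_t))
{
	node->expires = expires;
	/* true: the new node is now the earliest deadline in the queue. */
	if (timerqueue_add(&demo_queue, node))
		reprogram(expires);
}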
34873 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/localversion-rt linux-4.14/localversion-rt
34874 --- linux-4.14.orig/localversion-rt 1970-01-01 01:00:00.000000000 +0100
34875 +++ linux-4.14/localversion-rt 2018-09-05 11:05:07.000000000 +0200
34878 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/backing-dev.c linux-4.14/mm/backing-dev.c
34879 --- linux-4.14.orig/mm/backing-dev.c 2018-09-05 11:03:25.000000000 +0200
34880 +++ linux-4.14/mm/backing-dev.c 2018-09-05 11:05:07.000000000 +0200
34881 @@ -470,9 +470,9 @@
34883 unsigned long flags;
34885 - local_irq_save(flags);
34886 + local_irq_save_nort(flags);
34887 if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
34888 - local_irq_restore(flags);
34889 + local_irq_restore_nort(flags);
34893 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/compaction.c linux-4.14/mm/compaction.c
34894 --- linux-4.14.orig/mm/compaction.c 2017-11-12 19:46:13.000000000 +0100
34895 +++ linux-4.14/mm/compaction.c 2018-09-05 11:05:07.000000000 +0200
34896 @@ -1634,10 +1634,12 @@
34897 block_start_pfn(cc->migrate_pfn, cc->order);
34899 if (cc->last_migrated_pfn < current_block_start) {
34901 + cpu = get_cpu_light();
34902 + local_lock_irq(swapvec_lock);
34903 lru_add_drain_cpu(cpu);
34904 + local_unlock_irq(swapvec_lock);
34905 drain_local_pages(zone);
34908 /* No more flushing until we migrate again */
34909 cc->last_migrated_pfn = 0;
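The hunk above is truncated in this listing; in the full patch the balancing calls are get_cpu_light()/put_cpu_light() replacing get_cpu()/put_cpu(). Both helpers, like the swapvec_lock local lock, come from earlier in this series: on RT they only disable migration, so draining the per-CPU LRU pagevecs stays preemptible while the pagevecs themselves are protected by the local lock rather than by implicit preemption disabling. A minimal sketch of the combined pattern under those assumptions:

static void demo_drain_local_lru(struct zone *zone)
{
	int cpu;

	cpu = get_cpu_light();          /* RT: migrate_disable() only */
	local_lock_irq(swapvec_lock);   /* protects the per-CPU pagevecs */
	lru_add_drain_cpu(cpu);
	local_unlock_irq(swapvec_lock);
	drain_local_pages(zone);
	put_cpu_light();
}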
34911 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/filemap.c linux-4.14/mm/filemap.c
34912 --- linux-4.14.orig/mm/filemap.c 2018-09-05 11:03:28.000000000 +0200
34913 +++ linux-4.14/mm/filemap.c 2018-09-05 11:05:07.000000000 +0200
34914 @@ -110,6 +110,7 @@
34916 * ->tasklist_lock (memory_failure, collect_procs_ao)
34918 +DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
34920 static int page_cache_tree_insert(struct address_space *mapping,
34921 struct page *page, void **shadowp)
34922 @@ -133,8 +134,10 @@
34926 + local_lock(shadow_nodes_lock);
34927 __radix_tree_replace(&mapping->page_tree, node, slot, page,
34928 - workingset_update_node, mapping);
34929 + __workingset_update_node, mapping);
34930 + local_unlock(shadow_nodes_lock);
34931 mapping->nrpages++;
34934 @@ -151,6 +154,7 @@
34935 VM_BUG_ON_PAGE(PageTail(page), page);
34936 VM_BUG_ON_PAGE(nr != 1 && shadow, page);
34938 + local_lock(shadow_nodes_lock);
34939 for (i = 0; i < nr; i++) {
34940 struct radix_tree_node *node;
34942 @@ -162,8 +166,9 @@
34944 radix_tree_clear_tags(&mapping->page_tree, node, slot);
34945 __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
34946 - workingset_update_node, mapping);
34947 + __workingset_update_node, mapping);
34949 + local_unlock(shadow_nodes_lock);
34952 mapping->nrexceptional += nr;
34953 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/highmem.c linux-4.14/mm/highmem.c
34954 --- linux-4.14.orig/mm/highmem.c 2017-11-12 19:46:13.000000000 +0100
34955 +++ linux-4.14/mm/highmem.c 2018-09-05 11:05:07.000000000 +0200
34956 @@ -30,10 +30,11 @@
34957 #include <linux/kgdb.h>
34958 #include <asm/tlbflush.h>
34961 +#ifndef CONFIG_PREEMPT_RT_FULL
34962 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
34963 DEFINE_PER_CPU(int, __kmap_atomic_idx);
34968 * Virtual_count is not a pure "count".
34969 @@ -108,8 +109,9 @@
34970 unsigned long totalhigh_pages __read_mostly;
34971 EXPORT_SYMBOL(totalhigh_pages);
34974 +#ifndef CONFIG_PREEMPT_RT_FULL
34975 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
34978 unsigned int nr_free_highpages (void)
34980 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/Kconfig linux-4.14/mm/Kconfig
34981 --- linux-4.14.orig/mm/Kconfig 2018-09-05 11:03:25.000000000 +0200
34982 +++ linux-4.14/mm/Kconfig 2018-09-05 11:05:07.000000000 +0200
34983 @@ -385,7 +385,7 @@
34985 config TRANSPARENT_HUGEPAGE
34986 bool "Transparent Hugepage Support"
34987 - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
34988 + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
34990 select RADIX_TREE_MULTIORDER
34992 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/memcontrol.c linux-4.14/mm/memcontrol.c
34993 --- linux-4.14.orig/mm/memcontrol.c 2018-09-05 11:03:25.000000000 +0200
34994 +++ linux-4.14/mm/memcontrol.c 2018-09-05 11:05:07.000000000 +0200
34996 #include <net/sock.h>
34997 #include <net/ip.h>
34999 +#include <linux/locallock.h>
35001 #include <linux/uaccess.h>
35004 #define do_swap_account 0
35007 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
35009 /* Whether legacy memory+swap accounting is active */
35010 static bool do_memsw_account(void)
35012 @@ -1831,7 +1834,7 @@
35013 * as well as workers from this path always operate on the local
35014 * per-cpu data. CPU up doesn't touch memcg_stock at all.
35016 - curcpu = get_cpu();
35017 + curcpu = get_cpu_light();
35018 for_each_online_cpu(cpu) {
35019 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
35020 struct mem_cgroup *memcg;
35021 @@ -1851,7 +1854,7 @@
35023 css_put(&memcg->css);
35027 mutex_unlock(&percpu_charge_mutex);
35030 @@ -4624,12 +4627,12 @@
35034 - local_irq_disable();
35035 + local_lock_irq(event_lock);
35036 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
35037 memcg_check_events(to, page);
35038 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
35039 memcg_check_events(from, page);
35040 - local_irq_enable();
35041 + local_unlock_irq(event_lock);
35045 @@ -5572,10 +5575,10 @@
35047 commit_charge(page, memcg, lrucare);
35049 - local_irq_disable();
35050 + local_lock_irq(event_lock);
35051 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
35052 memcg_check_events(memcg, page);
35053 - local_irq_enable();
35054 + local_unlock_irq(event_lock);
35056 if (do_memsw_account() && PageSwapCache(page)) {
35057 swp_entry_t entry = { .val = page_private(page) };
35058 @@ -5644,7 +5647,7 @@
35059 memcg_oom_recover(ug->memcg);
35062 - local_irq_save(flags);
35063 + local_lock_irqsave(event_lock, flags);
35064 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
35065 __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
35066 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
35067 @@ -5652,7 +5655,7 @@
35068 __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
35069 __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
35070 memcg_check_events(ug->memcg, ug->dummy_page);
35071 - local_irq_restore(flags);
35072 + local_unlock_irqrestore(event_lock, flags);
35074 if (!mem_cgroup_is_root(ug->memcg))
35075 css_put_many(&ug->memcg->css, nr_pages);
35076 @@ -5815,10 +5818,10 @@
35078 commit_charge(newpage, memcg, false);
35080 - local_irq_save(flags);
35081 + local_lock_irqsave(event_lock, flags);
35082 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
35083 memcg_check_events(memcg, newpage);
35084 - local_irq_restore(flags);
35085 + local_unlock_irqrestore(event_lock, flags);
35088 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
35089 @@ -6010,6 +6013,7 @@
35090 struct mem_cgroup *memcg, *swap_memcg;
35091 unsigned int nr_entries;
35092 unsigned short oldid;
35093 + unsigned long flags;
35095 VM_BUG_ON_PAGE(PageLRU(page), page);
35096 VM_BUG_ON_PAGE(page_count(page), page);
35097 @@ -6055,13 +6059,17 @@
35098 * important here to have the interrupts disabled because it is the
35099 * only synchronisation we have for updating the per-CPU variables.
35101 + local_lock_irqsave(event_lock, flags);
35102 +#ifndef CONFIG_PREEMPT_RT_BASE
35103 VM_BUG_ON(!irqs_disabled());
35105 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
35107 memcg_check_events(memcg, page);
35109 if (!mem_cgroup_is_root(memcg))
35110 css_put_many(&memcg->css, nr_entries);
35111 + local_unlock_irqrestore(event_lock, flags);
35115 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/mmu_context.c linux-4.14/mm/mmu_context.c
35116 --- linux-4.14.orig/mm/mmu_context.c 2017-11-12 19:46:13.000000000 +0100
35117 +++ linux-4.14/mm/mmu_context.c 2018-09-05 11:05:07.000000000 +0200
35119 struct task_struct *tsk = current;
35122 + preempt_disable_rt();
35123 active_mm = tsk->active_mm;
35124 if (active_mm != mm) {
35129 switch_mm(active_mm, mm, tsk);
35130 + preempt_enable_rt();
35132 #ifdef finish_arch_post_lock_switch
35133 finish_arch_post_lock_switch();
35134 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/page_alloc.c linux-4.14/mm/page_alloc.c
35135 --- linux-4.14.orig/mm/page_alloc.c 2018-09-05 11:03:25.000000000 +0200
35136 +++ linux-4.14/mm/page_alloc.c 2018-09-05 11:05:07.000000000 +0200
35138 #include <linux/hugetlb.h>
35139 #include <linux/sched/rt.h>
35140 #include <linux/sched/mm.h>
35141 +#include <linux/locallock.h>
35142 #include <linux/page_owner.h>
35143 #include <linux/kthread.h>
35144 #include <linux/memcontrol.h>
35145 @@ -286,6 +287,18 @@
35146 EXPORT_SYMBOL(nr_online_nodes);
35149 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
35151 +#ifdef CONFIG_PREEMPT_RT_BASE
35152 +# define cpu_lock_irqsave(cpu, flags) \
35153 + local_lock_irqsave_on(pa_lock, flags, cpu)
35154 +# define cpu_unlock_irqrestore(cpu, flags) \
35155 + local_unlock_irqrestore_on(pa_lock, flags, cpu)
35157 +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
35158 +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
35161 int page_group_by_mobility_disabled __read_mostly;
35163 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
35164 @@ -1094,7 +1107,7 @@
35165 #endif /* CONFIG_DEBUG_VM */
35168 - * Frees a number of pages from the PCP lists
35169 + * Frees a number of pages which have been collected from the pcp lists.
35170 * Assumes all pages on list are in same zone, and of same order.
35171 * count is the number of pages to free.
35173 @@ -1105,15 +1118,53 @@
35174 * pinned" detection logic.
35176 static void free_pcppages_bulk(struct zone *zone, int count,
35177 - struct per_cpu_pages *pcp)
35178 + struct list_head *list)
35180 - int migratetype = 0;
35181 - int batch_free = 0;
35182 bool isolated_pageblocks;
35183 + unsigned long flags;
35185 - spin_lock(&zone->lock);
35186 + spin_lock_irqsave(&zone->lock, flags);
35187 isolated_pageblocks = has_isolate_pageblock(zone);
35189 + while (!list_empty(list)) {
35190 + struct page *page;
35191 + int mt; /* migratetype of the to-be-freed page */
35193 + page = list_first_entry(list, struct page, lru);
35194 + /* must delete as __free_one_page list manipulates */
35195 + list_del(&page->lru);
35197 + mt = get_pcppage_migratetype(page);
35198 + /* MIGRATE_ISOLATE page should not go to pcplists */
35199 + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
35200 + /* Pageblock could have been isolated meanwhile */
35201 + if (unlikely(isolated_pageblocks))
35202 + mt = get_pageblock_migratetype(page);
35204 + if (bulkfree_pcp_prepare(page))
35207 + __free_one_page(page, page_to_pfn(page), zone, 0, mt);
35208 + trace_mm_page_pcpu_drain(page, 0, mt);
35211 + WARN_ON(count != 0);
35212 + spin_unlock_irqrestore(&zone->lock, flags);
35216 + * Moves a number of pages from the PCP lists to a free list which
35217 + * is freed outside of the locked region.
35219 + * Assumes all pages on list are in same zone, and of same order.
35220 + * count is the number of pages to free.
35222 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
35223 + struct list_head *dst)
35225 + int migratetype = 0;
35226 + int batch_free = 0;
35230 struct list_head *list;
35231 @@ -1129,7 +1180,7 @@
35233 if (++migratetype == MIGRATE_PCPTYPES)
35235 - list = &pcp->lists[migratetype];
35236 + list = &src->lists[migratetype];
35237 } while (list_empty(list));
35239 /* This is the only non-empty list. Free them all. */
35240 @@ -1137,27 +1188,12 @@
35241 batch_free = count;
35244 - int mt; /* migratetype of the to-be-freed page */
35246 page = list_last_entry(list, struct page, lru);
35247 - /* must delete as __free_one_page list manipulates */
35248 list_del(&page->lru);
35250 - mt = get_pcppage_migratetype(page);
35251 - /* MIGRATE_ISOLATE page should not go to pcplists */
35252 - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
35253 - /* Pageblock could have been isolated meanwhile */
35254 - if (unlikely(isolated_pageblocks))
35255 - mt = get_pageblock_migratetype(page);
35257 - if (bulkfree_pcp_prepare(page))
35260 - __free_one_page(page, page_to_pfn(page), zone, 0, mt);
35261 - trace_mm_page_pcpu_drain(page, 0, mt);
35262 + list_add(&page->lru, dst);
35263 } while (--count && --batch_free && !list_empty(list));
35265 - spin_unlock(&zone->lock);
35268 static void free_one_page(struct zone *zone,
35269 @@ -1165,13 +1201,15 @@
35270 unsigned int order,
35273 - spin_lock(&zone->lock);
35274 + unsigned long flags;
35276 + spin_lock_irqsave(&zone->lock, flags);
35277 if (unlikely(has_isolate_pageblock(zone) ||
35278 is_migrate_isolate(migratetype))) {
35279 migratetype = get_pfnblock_migratetype(page, pfn);
35281 __free_one_page(page, pfn, zone, order, migratetype);
35282 - spin_unlock(&zone->lock);
35283 + spin_unlock_irqrestore(&zone->lock, flags);
35286 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
35287 @@ -1257,10 +1295,10 @@
35290 migratetype = get_pfnblock_migratetype(page, pfn);
35291 - local_irq_save(flags);
35292 + local_lock_irqsave(pa_lock, flags);
35293 __count_vm_events(PGFREE, 1 << order);
35294 free_one_page(page_zone(page), page, pfn, order, migratetype);
35295 - local_irq_restore(flags);
35296 + local_unlock_irqrestore(pa_lock, flags);
35299 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
35300 @@ -2378,16 +2416,18 @@
35301 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
35303 unsigned long flags;
35305 int to_drain, batch;
35307 - local_irq_save(flags);
35308 + local_lock_irqsave(pa_lock, flags);
35309 batch = READ_ONCE(pcp->batch);
35310 to_drain = min(pcp->count, batch);
35311 if (to_drain > 0) {
35312 - free_pcppages_bulk(zone, to_drain, pcp);
35313 + isolate_pcp_pages(to_drain, pcp, &dst);
35314 pcp->count -= to_drain;
35316 - local_irq_restore(flags);
35317 + local_unlock_irqrestore(pa_lock, flags);
35318 + free_pcppages_bulk(zone, to_drain, &dst);
35322 @@ -2403,16 +2443,21 @@
35323 unsigned long flags;
35324 struct per_cpu_pageset *pset;
35325 struct per_cpu_pages *pcp;
35329 - local_irq_save(flags);
35330 + cpu_lock_irqsave(cpu, flags);
35331 pset = per_cpu_ptr(zone->pageset, cpu);
35334 - if (pcp->count) {
35335 - free_pcppages_bulk(zone, pcp->count, pcp);
35336 + count = pcp->count;
35338 + isolate_pcp_pages(count, pcp, &dst);
35341 - local_irq_restore(flags);
35342 + cpu_unlock_irqrestore(cpu, flags);
35344 + free_pcppages_bulk(zone, count, &dst);
35348 @@ -2447,6 +2492,7 @@
35352 +#ifndef CONFIG_PREEMPT_RT_BASE
35353 static void drain_local_pages_wq(struct work_struct *work)
35356 @@ -2460,6 +2506,7 @@
35357 drain_local_pages(NULL);
35363 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
35364 @@ -2526,7 +2573,14 @@
35366 cpumask_clear_cpu(cpu, &cpus_with_pcps);
35369 +#ifdef CONFIG_PREEMPT_RT_BASE
35370 + for_each_cpu(cpu, &cpus_with_pcps) {
35372 + drain_pages_zone(cpu, zone);
35374 + drain_pages(cpu);
35377 for_each_cpu(cpu, &cpus_with_pcps) {
35378 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
35379 INIT_WORK(work, drain_local_pages_wq);
35380 @@ -2534,6 +2588,7 @@
35382 for_each_cpu(cpu, &cpus_with_pcps)
35383 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
35386 mutex_unlock(&pcpu_drain_mutex);
35388 @@ -2610,7 +2665,7 @@
35390 migratetype = get_pfnblock_migratetype(page, pfn);
35391 set_pcppage_migratetype(page, migratetype);
35392 - local_irq_save(flags);
35393 + local_lock_irqsave(pa_lock, flags);
35394 __count_vm_event(PGFREE);
35397 @@ -2636,12 +2691,17 @@
35399 if (pcp->count >= pcp->high) {
35400 unsigned long batch = READ_ONCE(pcp->batch);
35401 - free_pcppages_bulk(zone, batch, pcp);
35404 + isolate_pcp_pages(batch, pcp, &dst);
35405 pcp->count -= batch;
35406 + local_unlock_irqrestore(pa_lock, flags);
35407 + free_pcppages_bulk(zone, batch, &dst);
35412 - local_irq_restore(flags);
35413 + local_unlock_irqrestore(pa_lock, flags);
35417 @@ -2789,7 +2849,7 @@
35419 unsigned long flags;
35421 - local_irq_save(flags);
35422 + local_lock_irqsave(pa_lock, flags);
35423 pcp = &this_cpu_ptr(zone->pageset)->pcp;
35424 list = &pcp->lists[migratetype];
35425 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
35426 @@ -2797,7 +2857,7 @@
35427 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
35428 zone_statistics(preferred_zone, zone);
35430 - local_irq_restore(flags);
35431 + local_unlock_irqrestore(pa_lock, flags);
35435 @@ -2824,7 +2884,7 @@
35436 * allocate greater than order-1 page units with __GFP_NOFAIL.
35438 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
35439 - spin_lock_irqsave(&zone->lock, flags);
35440 + local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
35444 @@ -2844,14 +2904,14 @@
35446 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
35447 zone_statistics(preferred_zone, zone);
35448 - local_irq_restore(flags);
35449 + local_unlock_irqrestore(pa_lock, flags);
35452 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
35456 - local_irq_restore(flags);
35457 + local_unlock_irqrestore(pa_lock, flags);
35461 @@ -6778,8 +6838,9 @@
35463 static int page_alloc_cpu_dead(unsigned int cpu)
35466 + local_lock_irq_on(swapvec_lock, cpu);
35467 lru_add_drain_cpu(cpu);
35468 + local_unlock_irq_on(swapvec_lock, cpu);
35472 @@ -7683,7 +7744,7 @@
35473 struct per_cpu_pageset *pset;
35475 /* avoid races with drain_pages() */
35476 - local_irq_save(flags);
35477 + local_lock_irqsave(pa_lock, flags);
35478 if (zone->pageset != &boot_pageset) {
35479 for_each_online_cpu(cpu) {
35480 pset = per_cpu_ptr(zone->pageset, cpu);
35481 @@ -7692,7 +7753,7 @@
35482 free_percpu(zone->pageset);
35483 zone->pageset = &boot_pageset;
35485 - local_irq_restore(flags);
35486 + local_unlock_irqrestore(pa_lock, flags);
35489 #ifdef CONFIG_MEMORY_HOTREMOVE
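The central trick in this file shows up in the free_pcppages_bulk()/isolate_pcp_pages() split: pages are detached from the per-CPU lists while pa_lock (the new local IRQ lock) is held, but the actual freeing, which needs zone->lock, happens afterwards on a private list. That keeps both the irq/preempt-off region and the zone->lock hold time short on RT. A schematic caller using the two helpers introduced above (the wrapper name is illustrative; the real callers are drain_pages_zone() and free_hot_cold_page()):

static void demo_drain_pcp(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	LIST_HEAD(dst);
	int count;

	local_lock_irqsave(pa_lock, flags);     /* per-CPU list access */
	count = pcp->count;
	isolate_pcp_pages(count, pcp, &dst);    /* only moves pages onto dst */
	pcp->count = 0;
	local_unlock_irqrestore(pa_lock, flags);

	/* zone->lock is taken inside, with interrupts disabled there only. */
	free_pcppages_bulk(zone, count, &dst);
}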
35490 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/slab.h linux-4.14/mm/slab.h
35491 --- linux-4.14.orig/mm/slab.h 2018-09-05 11:03:25.000000000 +0200
35492 +++ linux-4.14/mm/slab.h 2018-09-05 11:05:07.000000000 +0200
35493 @@ -451,7 +451,11 @@
35494 * The slab lists for all objects.
35496 struct kmem_cache_node {
35497 +#ifdef CONFIG_SLUB
35498 + raw_spinlock_t list_lock;
35500 spinlock_t list_lock;
35504 struct list_head slabs_partial; /* partial list first, better asm code */
35505 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/slub.c linux-4.14/mm/slub.c
35506 --- linux-4.14.orig/mm/slub.c 2018-09-05 11:03:25.000000000 +0200
35507 +++ linux-4.14/mm/slub.c 2018-09-05 11:05:07.000000000 +0200
35508 @@ -1179,7 +1179,7 @@
35509 unsigned long uninitialized_var(flags);
35512 - spin_lock_irqsave(&n->list_lock, flags);
35513 + raw_spin_lock_irqsave(&n->list_lock, flags);
35516 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
35517 @@ -1214,7 +1214,7 @@
35521 - spin_unlock_irqrestore(&n->list_lock, flags);
35522 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
35524 slab_fix(s, "Object at 0x%p not freed", object);
35526 @@ -1342,6 +1342,12 @@
35528 #endif /* CONFIG_SLUB_DEBUG */
35530 +struct slub_free_list {
35531 + raw_spinlock_t lock;
35532 + struct list_head list;
35534 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
35537 * Hooks for other subsystems that check memory allocations. In a typical
35538 * production configuration these hooks all should produce no code at all.
35539 @@ -1561,10 +1567,17 @@
35543 + bool enableirqs = false;
35545 flags &= gfp_allowed_mask;
35547 if (gfpflags_allow_blocking(flags))
35548 + enableirqs = true;
35549 +#ifdef CONFIG_PREEMPT_RT_FULL
35550 + if (system_state > SYSTEM_BOOTING)
35551 + enableirqs = true;
35552 +#endif
35553 + if (enableirqs)
35554 local_irq_enable();
35556 flags |= s->allocflags;
35557 @@ -1623,7 +1636,7 @@
35561 - if (gfpflags_allow_blocking(flags))
35562 + if (enableirqs)
35563 local_irq_disable();
35566 @@ -1681,6 +1694,16 @@
35567 __free_pages(page, order);
35570 +static void free_delayed(struct list_head *h)
35572 + while(!list_empty(h)) {
35573 + struct page *page = list_first_entry(h, struct page, lru);
35575 + list_del(&page->lru);
35576 + __free_slab(page->slab_cache, page);
35580 #define need_reserve_slab_rcu \
35581 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
35583 @@ -1712,6 +1735,12 @@
35586 call_rcu(head, rcu_free_slab);
35587 + } else if (irqs_disabled()) {
35588 + struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
35590 + raw_spin_lock(&f->lock);
35591 + list_add(&page->lru, &f->list);
35592 + raw_spin_unlock(&f->lock);
35594 __free_slab(s, page);
35596 @@ -1819,7 +1848,7 @@
35597 if (!n || !n->nr_partial)
35600 - spin_lock(&n->list_lock);
35601 + raw_spin_lock(&n->list_lock);
35602 list_for_each_entry_safe(page, page2, &n->partial, lru) {
35605 @@ -1844,7 +1873,7 @@
35609 - spin_unlock(&n->list_lock);
35610 + raw_spin_unlock(&n->list_lock);
35614 @@ -2090,7 +2119,7 @@
35615 * that acquire_slab() will see a slab page that
35618 - spin_lock(&n->list_lock);
35619 + raw_spin_lock(&n->list_lock);
35623 @@ -2101,7 +2130,7 @@
35624 * slabs from diagnostic functions will not see
35625 * any frozen slabs.
35627 - spin_lock(&n->list_lock);
35628 + raw_spin_lock(&n->list_lock);
35632 @@ -2136,7 +2165,7 @@
35636 - spin_unlock(&n->list_lock);
35637 + raw_spin_unlock(&n->list_lock);
35640 stat(s, DEACTIVATE_EMPTY);
35641 @@ -2171,10 +2200,10 @@
35642 n2 = get_node(s, page_to_nid(page));
35645 - spin_unlock(&n->list_lock);
35646 + raw_spin_unlock(&n->list_lock);
35649 - spin_lock(&n->list_lock);
35650 + raw_spin_lock(&n->list_lock);
35654 @@ -2203,7 +2232,7 @@
35658 - spin_unlock(&n->list_lock);
35659 + raw_spin_unlock(&n->list_lock);
35661 while (discard_page) {
35662 page = discard_page;
35663 @@ -2242,14 +2271,21 @@
35664 pobjects = oldpage->pobjects;
35665 pages = oldpage->pages;
35666 if (drain && pobjects > s->cpu_partial) {
35667 + struct slub_free_list *f;
35668 unsigned long flags;
35669 + LIST_HEAD(tofree);
35671 * partial array is full. Move the existing
35672 * set to the per node partial list.
35674 local_irq_save(flags);
35675 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
35676 + f = this_cpu_ptr(&slub_free_list);
35677 + raw_spin_lock(&f->lock);
35678 + list_splice_init(&f->list, &tofree);
35679 + raw_spin_unlock(&f->lock);
35680 local_irq_restore(flags);
35681 + free_delayed(&tofree);
35685 @@ -2319,7 +2355,22 @@
35687 static void flush_all(struct kmem_cache *s)
35689 + LIST_HEAD(tofree);
35692 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
35693 + for_each_online_cpu(cpu) {
35694 + struct slub_free_list *f;
35696 + if (!has_cpu_slab(cpu, s))
35699 + f = &per_cpu(slub_free_list, cpu);
35700 + raw_spin_lock_irq(&f->lock);
35701 + list_splice_init(&f->list, &tofree);
35702 + raw_spin_unlock_irq(&f->lock);
35703 + free_delayed(&tofree);
35708 @@ -2374,10 +2425,10 @@
35709 unsigned long x = 0;
35712 - spin_lock_irqsave(&n->list_lock, flags);
35713 + raw_spin_lock_irqsave(&n->list_lock, flags);
35714 list_for_each_entry(page, &n->partial, lru)
35715 x += get_count(page);
35716 - spin_unlock_irqrestore(&n->list_lock, flags);
35717 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
35720 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
35721 @@ -2515,8 +2566,10 @@
35722 * already disabled (which is the case for bulk allocation).
35724 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
35725 - unsigned long addr, struct kmem_cache_cpu *c)
35726 + unsigned long addr, struct kmem_cache_cpu *c,
35727 + struct list_head *to_free)
35729 + struct slub_free_list *f;
35733 @@ -2572,6 +2625,13 @@
35734 VM_BUG_ON(!c->page->frozen);
35735 c->freelist = get_freepointer(s, freelist);
35736 c->tid = next_tid(c->tid);
35739 + f = this_cpu_ptr(&slub_free_list);
35740 + raw_spin_lock(&f->lock);
35741 + list_splice_init(&f->list, to_free);
35742 + raw_spin_unlock(&f->lock);
35747 @@ -2587,7 +2647,7 @@
35749 if (unlikely(!freelist)) {
35750 slab_out_of_memory(s, gfpflags, node);
35756 @@ -2600,7 +2660,7 @@
35757 goto new_slab; /* Slab failed checks. Next slab needed */
35759 deactivate_slab(s, page, get_freepointer(s, freelist), c);
35765 @@ -2612,6 +2672,7 @@
35768 unsigned long flags;
35769 + LIST_HEAD(tofree);
35771 local_irq_save(flags);
35772 #ifdef CONFIG_PREEMPT
35773 @@ -2623,8 +2684,9 @@
35774 c = this_cpu_ptr(s->cpu_slab);
35777 - p = ___slab_alloc(s, gfpflags, node, addr, c);
35778 + p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
35779 local_irq_restore(flags);
35780 + free_delayed(&tofree);
35784 @@ -2810,7 +2872,7 @@
35788 - spin_unlock_irqrestore(&n->list_lock, flags);
35789 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
35792 prior = page->freelist;
35793 @@ -2842,7 +2904,7 @@
35794 * Otherwise the list_lock will synchronize with
35795 * other processors updating the list of slabs.
35797 - spin_lock_irqsave(&n->list_lock, flags);
35798 + raw_spin_lock_irqsave(&n->list_lock, flags);
35802 @@ -2884,7 +2946,7 @@
35803 add_partial(n, page, DEACTIVATE_TO_TAIL);
35804 stat(s, FREE_ADD_PARTIAL);
35806 - spin_unlock_irqrestore(&n->list_lock, flags);
35807 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
35811 @@ -2899,7 +2961,7 @@
35812 remove_full(s, n, page);
35815 - spin_unlock_irqrestore(&n->list_lock, flags);
35816 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
35817 stat(s, FREE_SLAB);
35818 discard_slab(s, page);
35820 @@ -3104,6 +3166,7 @@
35823 struct kmem_cache_cpu *c;
35824 + LIST_HEAD(to_free);
35827 /* memcg and kmem_cache debug support */
35828 @@ -3127,7 +3190,7 @@
35829 * of re-populating per CPU c->freelist
35831 p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
35833 + _RET_IP_, c, &to_free);
35834 if (unlikely(!p[i]))
35837 @@ -3139,6 +3202,7 @@
35839 c->tid = next_tid(c->tid);
35840 local_irq_enable();
35841 + free_delayed(&to_free);
35843 /* Clear memory outside IRQ disabled fastpath loop */
35844 if (unlikely(flags & __GFP_ZERO)) {
35845 @@ -3153,6 +3217,7 @@
35848 local_irq_enable();
35849 + free_delayed(&to_free);
35850 slab_post_alloc_hook(s, flags, i, p);
35851 __kmem_cache_free_bulk(s, i, p);
35853 @@ -3286,7 +3351,7 @@
35854 init_kmem_cache_node(struct kmem_cache_node *n)
35857 - spin_lock_init(&n->list_lock);
35858 + raw_spin_lock_init(&n->list_lock);
35859 INIT_LIST_HEAD(&n->partial);
35860 #ifdef CONFIG_SLUB_DEBUG
35861 atomic_long_set(&n->nr_slabs, 0);
35862 @@ -3640,6 +3705,10 @@
35865 #ifdef CONFIG_SLUB_DEBUG
35866 +#ifdef CONFIG_PREEMPT_RT_BASE
35867 + /* XXX move out of irq-off section */
35868 + slab_err(s, page, text, s->name);
35870 void *addr = page_address(page);
35872 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
35873 @@ -3660,6 +3729,7 @@
35881 @@ -3673,7 +3743,7 @@
35882 struct page *page, *h;
35884 BUG_ON(irqs_disabled());
35885 - spin_lock_irq(&n->list_lock);
35886 + raw_spin_lock_irq(&n->list_lock);
35887 list_for_each_entry_safe(page, h, &n->partial, lru) {
35888 if (!page->inuse) {
35889 remove_partial(n, page);
35890 @@ -3683,7 +3753,7 @@
35891 "Objects remaining in %s on __kmem_cache_shutdown()");
35894 - spin_unlock_irq(&n->list_lock);
35895 + raw_spin_unlock_irq(&n->list_lock);
35897 list_for_each_entry_safe(page, h, &discard, lru)
35898 discard_slab(s, page);
35899 @@ -3927,7 +3997,7 @@
35900 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
35901 INIT_LIST_HEAD(promote + i);
35903 - spin_lock_irqsave(&n->list_lock, flags);
35904 + raw_spin_lock_irqsave(&n->list_lock, flags);
35907 * Build lists of slabs to discard or promote.
35908 @@ -3958,7 +4028,7 @@
35909 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
35910 list_splice(promote + i, &n->partial);
35912 - spin_unlock_irqrestore(&n->list_lock, flags);
35913 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
35915 /* Release empty slabs */
35916 list_for_each_entry_safe(page, t, &discard, lru)
35917 @@ -4171,6 +4241,12 @@
35919 static __initdata struct kmem_cache boot_kmem_cache,
35920 boot_kmem_cache_node;
35923 + for_each_possible_cpu(cpu) {
35924 + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
35925 + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
35928 if (debug_guardpage_minorder())
35929 slub_max_order = 0;
35930 @@ -4379,7 +4455,7 @@
35932 unsigned long flags;
35934 - spin_lock_irqsave(&n->list_lock, flags);
35935 + raw_spin_lock_irqsave(&n->list_lock, flags);
35937 list_for_each_entry(page, &n->partial, lru) {
35938 validate_slab_slab(s, page, map);
35939 @@ -4401,7 +4477,7 @@
35940 s->name, count, atomic_long_read(&n->nr_slabs));
35943 - spin_unlock_irqrestore(&n->list_lock, flags);
35944 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
35948 @@ -4589,12 +4665,12 @@
35949 if (!atomic_long_read(&n->nr_slabs))
35952 - spin_lock_irqsave(&n->list_lock, flags);
35953 + raw_spin_lock_irqsave(&n->list_lock, flags);
35954 list_for_each_entry(page, &n->partial, lru)
35955 process_slab(&t, s, page, alloc, map);
35956 list_for_each_entry(page, &n->full, lru)
35957 process_slab(&t, s, page, alloc, map);
35958 - spin_unlock_irqrestore(&n->list_lock, flags);
35959 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
35962 for (i = 0; i < t.count; i++) {
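Note on the mm/slub.c hunks above: kmem_cache_node->list_lock is converted to a raw_spinlock_t so it keeps spinning behaviour on RT, and slab pages that would otherwise be freed with interrupts disabled are queued on a per-CPU slub_free_list and released later through free_delayed() once interrupts are back on. The structure itself is defined earlier in the patch and is not shown in this listing; the sketch below is a minimal illustration of the deferral pattern, with the layout inferred from how the hunks use it and slub_drain_delayed() being a purely illustrative helper, not part of the patch.

#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

/* Sketch only -- layout inferred from the hunks above, not the literal definition. */
struct slub_free_list {
	raw_spinlock_t	 lock;
	struct list_head list;		/* slab pages queued for deferred freeing */
};
static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);

/* Splice the per-CPU queue out while interrupts are still off ... */
static void slub_drain_delayed(struct list_head *tofree)
{
	struct slub_free_list *f = this_cpu_ptr(&slub_free_list);

	raw_spin_lock(&f->lock);
	list_splice_init(&f->list, tofree);
	raw_spin_unlock(&f->lock);
}
/* ... and hand the spliced list to free_delayed() only after local_irq_restore(). */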
35963 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/swap.c linux-4.14/mm/swap.c
35964 --- linux-4.14.orig/mm/swap.c 2017-11-12 19:46:13.000000000 +0100
35965 +++ linux-4.14/mm/swap.c 2018-09-05 11:05:07.000000000 +0200
35967 #include <linux/memcontrol.h>
35968 #include <linux/gfp.h>
35969 #include <linux/uio.h>
35970 +#include <linux/locallock.h>
35971 #include <linux/hugetlb.h>
35972 #include <linux/page_idle.h>
35976 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
35978 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
35979 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
35982 * This path almost never happens for VM activity - pages are normally
35983 @@ -252,11 +255,11 @@
35984 unsigned long flags;
35987 - local_irq_save(flags);
35988 + local_lock_irqsave(rotate_lock, flags);
35989 pvec = this_cpu_ptr(&lru_rotate_pvecs);
35990 if (!pagevec_add(pvec, page) || PageCompound(page))
35991 pagevec_move_tail(pvec);
35992 - local_irq_restore(flags);
35993 + local_unlock_irqrestore(rotate_lock, flags);
35997 @@ -306,12 +309,13 @@
35999 page = compound_head(page);
36000 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
36001 - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
36002 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
36003 + activate_page_pvecs);
36006 if (!pagevec_add(pvec, page) || PageCompound(page))
36007 pagevec_lru_move_fn(pvec, __activate_page, NULL);
36008 - put_cpu_var(activate_page_pvecs);
36009 + put_locked_var(swapvec_lock, activate_page_pvecs);
36013 @@ -338,7 +342,7 @@
36015 static void __lru_cache_activate_page(struct page *page)
36017 - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
36018 + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
36022 @@ -360,7 +364,7 @@
36026 - put_cpu_var(lru_add_pvec);
36027 + put_locked_var(swapvec_lock, lru_add_pvec);
36031 @@ -402,12 +406,12 @@
36033 static void __lru_cache_add(struct page *page)
36035 - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
36036 + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
36039 if (!pagevec_add(pvec, page) || PageCompound(page))
36040 __pagevec_lru_add(pvec);
36041 - put_cpu_var(lru_add_pvec);
36042 + put_locked_var(swapvec_lock, lru_add_pvec);
36046 @@ -613,9 +617,15 @@
36047 unsigned long flags;
36049 /* No harm done if a racing interrupt already did this */
36050 - local_irq_save(flags);
36051 +#ifdef CONFIG_PREEMPT_RT_BASE
36052 + local_lock_irqsave_on(rotate_lock, flags, cpu);
36053 pagevec_move_tail(pvec);
36054 - local_irq_restore(flags);
36055 + local_unlock_irqrestore_on(rotate_lock, flags, cpu);
36057 + local_lock_irqsave(rotate_lock, flags);
36058 + pagevec_move_tail(pvec);
36059 + local_unlock_irqrestore(rotate_lock, flags);
36063 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
36064 @@ -647,11 +657,12 @@
36067 if (likely(get_page_unless_zero(page))) {
36068 - struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
36069 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
36070 + lru_deactivate_file_pvecs);
36072 if (!pagevec_add(pvec, page) || PageCompound(page))
36073 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
36074 - put_cpu_var(lru_deactivate_file_pvecs);
36075 + put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
36079 @@ -666,21 +677,32 @@
36081 if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
36082 !PageSwapCache(page) && !PageUnevictable(page)) {
36083 - struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
36084 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
36085 + lru_lazyfree_pvecs);
36088 if (!pagevec_add(pvec, page) || PageCompound(page))
36089 pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
36090 - put_cpu_var(lru_lazyfree_pvecs);
36091 + put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
36095 void lru_add_drain(void)
36097 - lru_add_drain_cpu(get_cpu());
36099 + lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
36100 + local_unlock_cpu(swapvec_lock);
36103 +#ifdef CONFIG_PREEMPT_RT_BASE
36104 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
36106 + local_lock_on(swapvec_lock, cpu);
36107 + lru_add_drain_cpu(cpu);
36108 + local_unlock_on(swapvec_lock, cpu);
36113 static void lru_add_drain_per_cpu(struct work_struct *dummy)
36116 @@ -688,6 +710,16 @@
36118 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
36120 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
36122 + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
36124 + INIT_WORK(work, lru_add_drain_per_cpu);
36125 + queue_work_on(cpu, mm_percpu_wq, work);
36126 + cpumask_set_cpu(cpu, has_work);
36130 void lru_add_drain_all_cpuslocked(void)
36132 static DEFINE_MUTEX(lock);
36133 @@ -705,21 +737,19 @@
36134 cpumask_clear(&has_work);
36136 for_each_online_cpu(cpu) {
36137 - struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
36139 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
36140 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
36141 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
36142 pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
36143 - need_activate_page_drain(cpu)) {
36144 - INIT_WORK(work, lru_add_drain_per_cpu);
36145 - queue_work_on(cpu, mm_percpu_wq, work);
36146 - cpumask_set_cpu(cpu, &has_work);
36148 + need_activate_page_drain(cpu))
36149 + remote_lru_add_drain(cpu, &has_work);
36152 +#ifndef CONFIG_PREEMPT_RT_BASE
36153 for_each_cpu(cpu, &has_work)
36154 flush_work(&per_cpu(lru_add_drain_work, cpu));
36157 mutex_unlock(&lock);
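Note on the mm/swap.c hunks above: every get_cpu_var()/put_cpu_var() pair around the per-CPU pagevecs is replaced by a local lock (swapvec_lock or rotate_lock). On non-RT kernels the local lock degenerates to the old preemption/interrupt disabling; on RT it is a per-CPU sleeping lock, so the section stays preemptible while access to the pagevec remains serialized. A minimal sketch of the conversion, using the locallock helpers exactly as the hunks do; the example_* names are illustrative only.

#include <linux/locallock.h>	/* RT-patch header, included by the hunk above */
#include <linux/mm.h>
#include <linux/pagevec.h>

static DEFINE_LOCAL_IRQ_LOCK(example_lock);
static DEFINE_PER_CPU(struct pagevec, example_pvec);

static void example_cache_add(struct page *page)
{
	/* was: struct pagevec *pvec = &get_cpu_var(example_pvec); */
	struct pagevec *pvec = &get_locked_var(example_lock, example_pvec);

	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);

	/* was: put_cpu_var(example_pvec); */
	put_locked_var(example_lock, example_pvec);
}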
36159 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/truncate.c linux-4.14/mm/truncate.c
36160 --- linux-4.14.orig/mm/truncate.c 2017-11-12 19:46:13.000000000 +0100
36161 +++ linux-4.14/mm/truncate.c 2018-09-05 11:05:07.000000000 +0200
36164 if (*slot != entry)
36166 + local_lock(shadow_nodes_lock);
36167 __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
36168 - workingset_update_node, mapping);
36169 + __workingset_update_node, mapping);
36170 + local_unlock(shadow_nodes_lock);
36171 mapping->nrexceptional--;
36173 spin_unlock_irq(&mapping->tree_lock);
36174 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/vmalloc.c linux-4.14/mm/vmalloc.c
36175 --- linux-4.14.orig/mm/vmalloc.c 2018-09-05 11:03:25.000000000 +0200
36176 +++ linux-4.14/mm/vmalloc.c 2018-09-05 11:05:07.000000000 +0200
36177 @@ -865,7 +865,7 @@
36178 struct vmap_block *vb;
36179 struct vmap_area *va;
36180 unsigned long vb_idx;
36182 + int node, err, cpu;
36185 node = numa_node_id();
36186 @@ -908,11 +908,12 @@
36188 radix_tree_preload_end();
36190 - vbq = &get_cpu_var(vmap_block_queue);
36191 + cpu = get_cpu_light();
36192 + vbq = this_cpu_ptr(&vmap_block_queue);
36193 spin_lock(&vbq->lock);
36194 list_add_tail_rcu(&vb->free_list, &vbq->free);
36195 spin_unlock(&vbq->lock);
36196 - put_cpu_var(vmap_block_queue);
36201 @@ -981,6 +982,7 @@
36202 struct vmap_block *vb;
36203 void *vaddr = NULL;
36204 unsigned int order;
36207 BUG_ON(offset_in_page(size));
36208 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
36209 @@ -995,7 +997,8 @@
36210 order = get_order(size);
36213 - vbq = &get_cpu_var(vmap_block_queue);
36214 + cpu = get_cpu_light();
36215 + vbq = this_cpu_ptr(&vmap_block_queue);
36216 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
36217 unsigned long pages_off;
36219 @@ -1018,7 +1021,7 @@
36223 - put_cpu_var(vmap_block_queue);
36227 /* Allocate new block if nothing was found */
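Note on the mm/vmalloc.c hunks above: the per-CPU vmap_block_queue is already protected by vbq->lock, so RT only needs to prevent migration to another CPU, not preemption. get_cpu_var() is therefore replaced by get_cpu_light() plus this_cpu_ptr(); the matching put_cpu_light() calls fall in lines elided from this listing. A sketch of the pattern under that assumption, with illustrative example_* names and the boot-time init of lock/list omitted.

#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct example_queue {
	spinlock_t	 lock;
	struct list_head free;
};
static DEFINE_PER_CPU(struct example_queue, example_queue);

static void example_queue_add(struct list_head *entry)
{
	struct example_queue *q;

	/* was: q = &get_cpu_var(example_queue);  -- disables preemption */
	get_cpu_light();		/* RT: only prevents migration off this CPU */
	q = this_cpu_ptr(&example_queue);

	spin_lock(&q->lock);		/* the real serialization for the list */
	list_add_tail(entry, &q->free);
	spin_unlock(&q->lock);

	put_cpu_light();		/* was: put_cpu_var(example_queue); */
}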
36228 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/vmstat.c linux-4.14/mm/vmstat.c
36229 --- linux-4.14.orig/mm/vmstat.c 2017-11-12 19:46:13.000000000 +0100
36230 +++ linux-4.14/mm/vmstat.c 2018-09-05 11:05:07.000000000 +0200
36231 @@ -249,6 +249,7 @@
36235 + preempt_disable_rt();
36236 x = delta + __this_cpu_read(*p);
36238 t = __this_cpu_read(pcp->stat_threshold);
36239 @@ -258,6 +259,7 @@
36242 __this_cpu_write(*p, x);
36243 + preempt_enable_rt();
36245 EXPORT_SYMBOL(__mod_zone_page_state);
36247 @@ -269,6 +271,7 @@
36251 + preempt_disable_rt();
36252 x = delta + __this_cpu_read(*p);
36254 t = __this_cpu_read(pcp->stat_threshold);
36255 @@ -278,6 +281,7 @@
36258 __this_cpu_write(*p, x);
36259 + preempt_enable_rt();
36261 EXPORT_SYMBOL(__mod_node_page_state);
36263 @@ -310,6 +314,7 @@
36264 s8 __percpu *p = pcp->vm_stat_diff + item;
36267 + preempt_disable_rt();
36268 v = __this_cpu_inc_return(*p);
36269 t = __this_cpu_read(pcp->stat_threshold);
36270 if (unlikely(v > t)) {
36271 @@ -318,6 +323,7 @@
36272 zone_page_state_add(v + overstep, zone, item);
36273 __this_cpu_write(*p, -overstep);
36275 + preempt_enable_rt();
36278 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
36279 @@ -326,6 +332,7 @@
36280 s8 __percpu *p = pcp->vm_node_stat_diff + item;
36283 + preempt_disable_rt();
36284 v = __this_cpu_inc_return(*p);
36285 t = __this_cpu_read(pcp->stat_threshold);
36286 if (unlikely(v > t)) {
36287 @@ -334,6 +341,7 @@
36288 node_page_state_add(v + overstep, pgdat, item);
36289 __this_cpu_write(*p, -overstep);
36291 + preempt_enable_rt();
36294 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
36295 @@ -354,6 +362,7 @@
36296 s8 __percpu *p = pcp->vm_stat_diff + item;
36299 + preempt_disable_rt();
36300 v = __this_cpu_dec_return(*p);
36301 t = __this_cpu_read(pcp->stat_threshold);
36302 if (unlikely(v < - t)) {
36303 @@ -362,6 +371,7 @@
36304 zone_page_state_add(v - overstep, zone, item);
36305 __this_cpu_write(*p, overstep);
36307 + preempt_enable_rt();
36310 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
36311 @@ -370,6 +380,7 @@
36312 s8 __percpu *p = pcp->vm_node_stat_diff + item;
36315 + preempt_disable_rt();
36316 v = __this_cpu_dec_return(*p);
36317 t = __this_cpu_read(pcp->stat_threshold);
36318 if (unlikely(v < - t)) {
36319 @@ -378,6 +389,7 @@
36320 node_page_state_add(v - overstep, pgdat, item);
36321 __this_cpu_write(*p, overstep);
36323 + preempt_enable_rt();
36326 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
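Note on the mm/vmstat.c hunks above: the __mod/__inc/__dec counter helpers expect to run with preemption off, but on RT their usual callers (threaded interrupt handlers, code under sleeping spinlocks) may no longer guarantee that, so each read-modify-write of the per-CPU diff is bracketed with preempt_disable_rt()/preempt_enable_rt(), which compile away on non-RT kernels. A sketch of the bracketing around a generic per-CPU counter; example_* names are illustrative.

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(long, example_diff);

static void example_mod_state(long delta, long threshold)
{
	long x;

	preempt_disable_rt();		/* no-op unless PREEMPT_RT_BASE is set */
	x = delta + __this_cpu_read(example_diff);
	if (unlikely(x > threshold || x < -threshold))
		x = 0;			/* stand-in for folding into the global counter */
	__this_cpu_write(example_diff, x);
	preempt_enable_rt();
}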
36327 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/workingset.c linux-4.14/mm/workingset.c
36328 --- linux-4.14.orig/mm/workingset.c 2017-11-12 19:46:13.000000000 +0100
36329 +++ linux-4.14/mm/workingset.c 2018-09-05 11:05:07.000000000 +0200
36330 @@ -338,9 +338,10 @@
36331 * point where they would still be useful.
36334 -static struct list_lru shadow_nodes;
36335 +static struct list_lru __shadow_nodes;
36336 +DEFINE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
36338 -void workingset_update_node(struct radix_tree_node *node, void *private)
36339 +void __workingset_update_node(struct radix_tree_node *node, void *private)
36341 struct address_space *mapping = private;
36343 @@ -358,10 +359,10 @@
36345 if (node->count && node->count == node->exceptional) {
36346 if (list_empty(&node->private_list))
36347 - list_lru_add(&shadow_nodes, &node->private_list);
36348 + list_lru_add(&__shadow_nodes, &node->private_list);
36350 if (!list_empty(&node->private_list))
36351 - list_lru_del(&shadow_nodes, &node->private_list);
36352 + list_lru_del(&__shadow_nodes, &node->private_list);
36356 @@ -373,9 +374,9 @@
36357 unsigned long cache;
36359 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
36360 - local_irq_disable();
36361 - nodes = list_lru_shrink_count(&shadow_nodes, sc);
36362 - local_irq_enable();
36363 + local_lock_irq(shadow_nodes_lock);
36364 + nodes = list_lru_shrink_count(&__shadow_nodes, sc);
36365 + local_unlock_irq(shadow_nodes_lock);
36368 * Approximate a reasonable limit for the radix tree nodes
36369 @@ -475,15 +476,15 @@
36371 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
36372 __radix_tree_delete_node(&mapping->page_tree, node,
36373 - workingset_update_node, mapping);
36374 + __workingset_update_node, mapping);
36377 spin_unlock(&mapping->tree_lock);
36378 ret = LRU_REMOVED_RETRY;
36380 - local_irq_enable();
36381 + local_unlock_irq(shadow_nodes_lock);
36383 - local_irq_disable();
36384 + local_lock_irq(shadow_nodes_lock);
36385 spin_lock(lru_lock);
36388 @@ -494,9 +495,9 @@
36391 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
36392 - local_irq_disable();
36393 - ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
36394 - local_irq_enable();
36395 + local_lock_irq(shadow_nodes_lock);
36396 + ret = list_lru_shrink_walk(&__shadow_nodes, sc, shadow_lru_isolate, NULL);
36397 + local_unlock_irq(shadow_nodes_lock);
36401 @@ -534,7 +535,7 @@
36402 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
36403 timestamp_bits, max_order, bucket_order);
36405 - ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
36406 + ret = __list_lru_init(&__shadow_nodes, true, &shadow_nodes_key);
36409 ret = register_shrinker(&workingset_shadow_shrinker);
36410 @@ -542,7 +543,7 @@
36414 - list_lru_destroy(&shadow_nodes);
36415 + list_lru_destroy(&__shadow_nodes);
36419 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/zsmalloc.c linux-4.14/mm/zsmalloc.c
36420 --- linux-4.14.orig/mm/zsmalloc.c 2018-09-05 11:03:25.000000000 +0200
36421 +++ linux-4.14/mm/zsmalloc.c 2018-09-05 11:05:07.000000000 +0200
36423 #include <linux/mount.h>
36424 #include <linux/migrate.h>
36425 #include <linux/pagemap.h>
36426 +#include <linux/locallock.h>
36428 #define ZSPAGE_MAGIC 0x58
36432 #define ZS_MAX_ZSPAGE_ORDER 2
36433 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
36435 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
36437 +#ifdef CONFIG_PREEMPT_RT_FULL
36439 +struct zsmalloc_handle {
36440 + unsigned long addr;
36441 + struct mutex lock;
36444 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
36448 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
36452 * Object location (<PFN>, <obj_idx>) is encoded as
36453 * as single (unsigned long) handle value.
36454 @@ -320,7 +334,7 @@
36456 static int create_cache(struct zs_pool *pool)
36458 - pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
36459 + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
36461 if (!pool->handle_cachep)
36463 @@ -344,9 +358,26 @@
36465 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
36467 - return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
36468 - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
36471 + p = kmem_cache_alloc(pool->handle_cachep,
36472 + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
36473 +#ifdef CONFIG_PREEMPT_RT_FULL
36475 + struct zsmalloc_handle *zh = p;
36477 + mutex_init(&zh->lock);
36480 + return (unsigned long)p;
36483 +#ifdef CONFIG_PREEMPT_RT_FULL
36484 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
36486 + return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
36490 static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
36492 @@ -366,12 +397,18 @@
36494 static void record_obj(unsigned long handle, unsigned long obj)
36496 +#ifdef CONFIG_PREEMPT_RT_FULL
36497 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36499 + WRITE_ONCE(zh->addr, obj);
36502 * lsb of @obj represents handle lock while other bits
36503 * represent object value the handle is pointing so
36504 * updating shouldn't do store tearing.
36506 WRITE_ONCE(*(unsigned long *)handle, obj);
36511 @@ -460,6 +497,7 @@
36513 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
36514 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
36515 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
36517 static bool is_zspage_isolated(struct zspage *zspage)
36519 @@ -898,7 +936,13 @@
36521 static unsigned long handle_to_obj(unsigned long handle)
36523 +#ifdef CONFIG_PREEMPT_RT_FULL
36524 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36528 return *(unsigned long *)handle;
36532 static unsigned long obj_to_head(struct page *page, void *obj)
36533 @@ -912,22 +956,46 @@
36535 static inline int testpin_tag(unsigned long handle)
36537 +#ifdef CONFIG_PREEMPT_RT_FULL
36538 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36540 + return mutex_is_locked(&zh->lock);
36542 return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
36546 static inline int trypin_tag(unsigned long handle)
36548 +#ifdef CONFIG_PREEMPT_RT_FULL
36549 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36551 + return mutex_trylock(&zh->lock);
36553 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
36557 static void pin_tag(unsigned long handle)
36559 +#ifdef CONFIG_PREEMPT_RT_FULL
36560 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36562 + return mutex_lock(&zh->lock);
36564 bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
36568 static void unpin_tag(unsigned long handle)
36570 +#ifdef CONFIG_PREEMPT_RT_FULL
36571 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36573 + return mutex_unlock(&zh->lock);
36575 bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
36579 static void reset_page(struct page *page)
36580 @@ -1365,7 +1433,7 @@
36581 class = pool->size_class[class_idx];
36582 off = (class->size * obj_idx) & ~PAGE_MASK;
36584 - area = &get_cpu_var(zs_map_area);
36585 + area = &get_locked_var(zs_map_area_lock, zs_map_area);
36587 if (off + class->size <= PAGE_SIZE) {
36588 /* this object is contained entirely within a page */
36589 @@ -1419,7 +1487,7 @@
36591 __zs_unmap_object(area, pages, off, class->size);
36593 - put_cpu_var(zs_map_area);
36594 + put_locked_var(zs_map_area_lock, zs_map_area);
36596 migrate_read_unlock(zspage);
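Note on the mm/zsmalloc.c hunks above: pin_tag()/unpin_tag() normally take a bit spinlock in the low bit of the handle, which cannot sleep and therefore cannot be made preemptible on RT. With PREEMPT_RT_FULL the handle allocation grows to a small struct carrying the object value plus a mutex, and the pin/unpin/trypin helpers become mutex operations. The fragment below condenses that pattern into one helper mirroring trypin_tag(); example_trypin is an illustrative name, the struct and zs_get_pure_handle() are as shown in the hunks.

#ifdef CONFIG_PREEMPT_RT_FULL
struct zsmalloc_handle {
	unsigned long	addr;	/* the encoded (<PFN>, <obj_idx>) value */
	struct mutex	lock;	/* replaces the HANDLE_PIN_BIT bit spinlock */
};

static inline int example_trypin(unsigned long handle)
{
	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);

	return mutex_trylock(&zh->lock);	/* sleeping lock: RT-safe */
}
#else
static inline int example_trypin(unsigned long handle)
{
	return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
#endif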
36598 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/9p/trans_xen.c linux-4.14/net/9p/trans_xen.c
36599 --- linux-4.14.orig/net/9p/trans_xen.c 2018-09-05 11:03:25.000000000 +0200
36600 +++ linux-4.14/net/9p/trans_xen.c 2018-09-05 11:05:07.000000000 +0200
36603 #include <linux/module.h>
36604 #include <linux/spinlock.h>
36605 -#include <linux/rwlock.h>
36606 #include <net/9p/9p.h>
36607 #include <net/9p/client.h>
36608 #include <net/9p/transport.h>
36609 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/bluetooth/hci_sock.c linux-4.14/net/bluetooth/hci_sock.c
36610 --- linux-4.14.orig/net/bluetooth/hci_sock.c 2017-11-12 19:46:13.000000000 +0100
36611 +++ linux-4.14/net/bluetooth/hci_sock.c 2018-09-05 11:05:07.000000000 +0200
36612 @@ -251,15 +251,13 @@
36615 /* Send frame to sockets with specific channel */
36616 -void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
36617 - int flag, struct sock *skip_sk)
36618 +static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
36619 + int flag, struct sock *skip_sk)
36623 BT_DBG("channel %u len %d", channel, skb->len);
36625 - read_lock(&hci_sk_list.lock);
36627 sk_for_each(sk, &hci_sk_list.head) {
36628 struct sk_buff *nskb;
36630 @@ -285,6 +283,13 @@
36636 +void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
36637 + int flag, struct sock *skip_sk)
36639 + read_lock(&hci_sk_list.lock);
36640 + __hci_send_to_channel(channel, skb, flag, skip_sk);
36641 read_unlock(&hci_sk_list.lock);
36644 @@ -388,8 +393,8 @@
36645 hdr->index = index;
36646 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
36648 - hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
36649 - HCI_SOCK_TRUSTED, NULL);
36650 + __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
36651 + HCI_SOCK_TRUSTED, NULL);
36655 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/can/bcm.c linux-4.14/net/can/bcm.c
36656 --- linux-4.14.orig/net/can/bcm.c 2017-11-12 19:46:13.000000000 +0100
36657 +++ linux-4.14/net/can/bcm.c 2018-09-05 11:05:07.000000000 +0200
36658 @@ -102,7 +102,6 @@
36659 unsigned long frames_abs, frames_filtered;
36660 struct bcm_timeval ival1, ival2;
36661 struct hrtimer timer, thrtimer;
36662 - struct tasklet_struct tsklet, thrtsklet;
36663 ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
36666 @@ -364,25 +363,34 @@
36670 -static void bcm_tx_start_timer(struct bcm_op *op)
36671 +static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
36675 if (op->kt_ival1 && op->count)
36676 - hrtimer_start(&op->timer,
36677 - ktime_add(ktime_get(), op->kt_ival1),
36678 - HRTIMER_MODE_ABS);
36679 + ival = op->kt_ival1;
36680 else if (op->kt_ival2)
36681 - hrtimer_start(&op->timer,
36682 - ktime_add(ktime_get(), op->kt_ival2),
36683 - HRTIMER_MODE_ABS);
36684 + ival = op->kt_ival2;
36688 + hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
36692 -static void bcm_tx_timeout_tsklet(unsigned long data)
36693 +static void bcm_tx_start_timer(struct bcm_op *op)
36695 - struct bcm_op *op = (struct bcm_op *)data;
36696 + if (bcm_tx_set_expiry(op, &op->timer))
36697 + hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
36700 +/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
36701 +static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
36703 + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36704 struct bcm_msg_head msg_head;
36706 if (op->kt_ival1 && (op->count > 0)) {
36709 if (!op->count && (op->flags & TX_COUNTEVT)) {
36711 @@ -399,22 +407,12 @@
36715 - } else if (op->kt_ival2)
36716 + } else if (op->kt_ival2) {
36720 - bcm_tx_start_timer(op);
36724 - * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
36726 -static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
36728 - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36730 - tasklet_schedule(&op->tsklet);
36732 - return HRTIMER_NORESTART;
36733 + return bcm_tx_set_expiry(op, &op->timer) ?
36734 + HRTIMER_RESTART : HRTIMER_NORESTART;
36738 @@ -480,7 +478,7 @@
36739 /* do not send the saved data - only start throttle timer */
36740 hrtimer_start(&op->thrtimer,
36741 ktime_add(op->kt_lastmsg, op->kt_ival2),
36742 - HRTIMER_MODE_ABS);
36743 + HRTIMER_MODE_ABS_SOFT);
36747 @@ -539,14 +537,21 @@
36751 - hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
36752 + hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
36755 -static void bcm_rx_timeout_tsklet(unsigned long data)
36756 +/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
36757 +static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
36759 - struct bcm_op *op = (struct bcm_op *)data;
36760 + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36761 struct bcm_msg_head msg_head;
36763 + /* if user wants to be informed, when cyclic CAN-Messages come back */
36764 + if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
36765 + /* clear received CAN frames to indicate 'nothing received' */
36766 + memset(op->last_frames, 0, op->nframes * op->cfsiz);
36769 /* create notification to user */
36770 msg_head.opcode = RX_TIMEOUT;
36771 msg_head.flags = op->flags;
36772 @@ -557,25 +562,6 @@
36773 msg_head.nframes = 0;
36775 bcm_send_to_user(op, &msg_head, NULL, 0);
36779 - * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
36781 -static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
36783 - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36785 - /* schedule before NET_RX_SOFTIRQ */
36786 - tasklet_hi_schedule(&op->tsklet);
36788 - /* no restart of the timer is done here! */
36790 - /* if user wants to be informed, when cyclic CAN-Messages come back */
36791 - if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
36792 - /* clear received CAN frames to indicate 'nothing received' */
36793 - memset(op->last_frames, 0, op->nframes * op->cfsiz);
36796 return HRTIMER_NORESTART;
36798 @@ -583,14 +569,12 @@
36800 * bcm_rx_do_flush - helper for bcm_rx_thr_flush
36802 -static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
36803 - unsigned int index)
36804 +static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
36806 struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
36808 if ((op->last_frames) && (lcf->flags & RX_THR)) {
36810 - bcm_rx_changed(op, lcf);
36811 + bcm_rx_changed(op, lcf);
36815 @@ -598,11 +582,8 @@
36818 * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
36820 - * update == 0 : just check if throttled data is available (any irq context)
36821 - * update == 1 : check and send throttled data to userspace (soft_irq context)
36823 -static int bcm_rx_thr_flush(struct bcm_op *op, int update)
36824 +static int bcm_rx_thr_flush(struct bcm_op *op)
36828 @@ -611,24 +592,16 @@
36830 /* for MUX filter we start at index 1 */
36831 for (i = 1; i < op->nframes; i++)
36832 - updated += bcm_rx_do_flush(op, update, i);
36833 + updated += bcm_rx_do_flush(op, i);
36836 /* for RX_FILTER_ID and simple filter */
36837 - updated += bcm_rx_do_flush(op, update, 0);
36838 + updated += bcm_rx_do_flush(op, 0);
36844 -static void bcm_rx_thr_tsklet(unsigned long data)
36846 - struct bcm_op *op = (struct bcm_op *)data;
36848 - /* push the changed data to the userspace */
36849 - bcm_rx_thr_flush(op, 1);
36853 * bcm_rx_thr_handler - the time for blocked content updates is over now:
36854 * Check for throttled data and send it to the userspace
36855 @@ -637,9 +610,7 @@
36857 struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
36859 - tasklet_schedule(&op->thrtsklet);
36861 - if (bcm_rx_thr_flush(op, 0)) {
36862 + if (bcm_rx_thr_flush(op)) {
36863 hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
36864 return HRTIMER_RESTART;
36866 @@ -735,23 +706,8 @@
36868 static void bcm_remove_op(struct bcm_op *op)
36870 - if (op->tsklet.func) {
36871 - while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
36872 - test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
36873 - hrtimer_active(&op->timer)) {
36874 - hrtimer_cancel(&op->timer);
36875 - tasklet_kill(&op->tsklet);
36879 - if (op->thrtsklet.func) {
36880 - while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
36881 - test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
36882 - hrtimer_active(&op->thrtimer)) {
36883 - hrtimer_cancel(&op->thrtimer);
36884 - tasklet_kill(&op->thrtsklet);
36887 + hrtimer_cancel(&op->timer);
36888 + hrtimer_cancel(&op->thrtimer);
36890 if ((op->frames) && (op->frames != &op->sframe))
36892 @@ -979,15 +935,13 @@
36893 op->ifindex = ifindex;
36895 /* initialize uninitialized (kzalloc) structure */
36896 - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36897 + hrtimer_init(&op->timer, CLOCK_MONOTONIC,
36898 + HRTIMER_MODE_REL_SOFT);
36899 op->timer.function = bcm_tx_timeout_handler;
36901 - /* initialize tasklet for tx countevent notification */
36902 - tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
36903 - (unsigned long) op);
36905 /* currently unused in tx_ops */
36906 - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36907 + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
36908 + HRTIMER_MODE_REL_SOFT);
36910 /* add this bcm_op to the list of the tx_ops */
36911 list_add(&op->list, &bo->tx_ops);
36912 @@ -1150,20 +1104,14 @@
36913 op->rx_ifindex = ifindex;
36915 /* initialize uninitialized (kzalloc) structure */
36916 - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36917 + hrtimer_init(&op->timer, CLOCK_MONOTONIC,
36918 + HRTIMER_MODE_REL_SOFT);
36919 op->timer.function = bcm_rx_timeout_handler;
36921 - /* initialize tasklet for rx timeout notification */
36922 - tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
36923 - (unsigned long) op);
36925 - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36926 + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
36927 + HRTIMER_MODE_REL_SOFT);
36928 op->thrtimer.function = bcm_rx_thr_handler;
36930 - /* initialize tasklet for rx throttle handling */
36931 - tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
36932 - (unsigned long) op);
36934 /* add this bcm_op to the list of the rx_ops */
36935 list_add(&op->list, &bo->rx_ops);
36937 @@ -1209,12 +1157,12 @@
36939 op->kt_lastmsg = 0;
36940 hrtimer_cancel(&op->thrtimer);
36941 - bcm_rx_thr_flush(op, 1);
36942 + bcm_rx_thr_flush(op);
36945 if ((op->flags & STARTTIMER) && op->kt_ival1)
36946 hrtimer_start(&op->timer, op->kt_ival1,
36947 - HRTIMER_MODE_REL);
36948 + HRTIMER_MODE_REL_SOFT);
36951 /* now we can register for can_ids, if we added a new bcm_op */
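Note on the net/can/bcm.c hunks above: the tx/rx/throttle tasklets are removed entirely. The hrtimers are switched to the *_SOFT modes introduced earlier in this patch, so their callbacks already run in softirq context; that lets the former tasklet bodies move straight into the timer handlers and lets bcm_remove_op() shrink to two hrtimer_cancel() calls. A sketch of the shape of the conversion with the bcm-specific details stripped; example_* names are illustrative.

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

/* Former tasklet body: now runs directly from the softirq-context timer. */
static enum hrtimer_restart example_timeout_handler(struct hrtimer *hrtimer)
{
	/* ... do the work the tasklet used to do ... */
	return HRTIMER_NORESTART;	/* or HRTIMER_RESTART after hrtimer_forward() */
}

static void example_setup(void)
{
	/* was: HRTIMER_MODE_REL plus a tasklet_init() for the deferred work */
	hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
	example_timer.function = example_timeout_handler;
	hrtimer_start(&example_timer, ms_to_ktime(100), HRTIMER_MODE_REL_SOFT);
}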
36952 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/dev.c linux-4.14/net/core/dev.c
36953 --- linux-4.14.orig/net/core/dev.c 2018-09-05 11:03:25.000000000 +0200
36954 +++ linux-4.14/net/core/dev.c 2018-09-05 11:05:07.000000000 +0200
36955 @@ -195,6 +195,7 @@
36956 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
36958 static seqcount_t devnet_rename_seq;
36959 +static DEFINE_MUTEX(devnet_rename_mutex);
36961 static inline void dev_base_seq_inc(struct net *net)
36963 @@ -217,14 +218,14 @@
36964 static inline void rps_lock(struct softnet_data *sd)
36967 - spin_lock(&sd->input_pkt_queue.lock);
36968 + raw_spin_lock(&sd->input_pkt_queue.raw_lock);
36972 static inline void rps_unlock(struct softnet_data *sd)
36975 - spin_unlock(&sd->input_pkt_queue.lock);
36976 + raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
36980 @@ -920,7 +921,8 @@
36981 strcpy(name, dev->name);
36983 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
36985 + mutex_lock(&devnet_rename_mutex);
36986 + mutex_unlock(&devnet_rename_mutex);
36990 @@ -1189,20 +1191,17 @@
36991 if (dev->flags & IFF_UP)
36994 - write_seqcount_begin(&devnet_rename_seq);
36995 + mutex_lock(&devnet_rename_mutex);
36996 + __raw_write_seqcount_begin(&devnet_rename_seq);
36998 - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
36999 - write_seqcount_end(&devnet_rename_seq);
37002 + if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
37005 memcpy(oldname, dev->name, IFNAMSIZ);
37007 err = dev_get_valid_name(net, dev, newname);
37009 - write_seqcount_end(&devnet_rename_seq);
37015 if (oldname[0] && !strchr(oldname, '%'))
37016 netdev_info(dev, "renamed from %s\n", oldname);
37017 @@ -1215,11 +1214,12 @@
37019 memcpy(dev->name, oldname, IFNAMSIZ);
37020 dev->name_assign_type = old_assign_type;
37021 - write_seqcount_end(&devnet_rename_seq);
37027 - write_seqcount_end(&devnet_rename_seq);
37028 + __raw_write_seqcount_end(&devnet_rename_seq);
37029 + mutex_unlock(&devnet_rename_mutex);
37031 netdev_adjacent_rename_links(dev, oldname);
37033 @@ -1240,7 +1240,8 @@
37034 /* err >= 0 after dev_alloc_name() or stores the first errno */
37037 - write_seqcount_begin(&devnet_rename_seq);
37038 + mutex_lock(&devnet_rename_mutex);
37039 + __raw_write_seqcount_begin(&devnet_rename_seq);
37040 memcpy(dev->name, oldname, IFNAMSIZ);
37041 memcpy(oldname, newname, IFNAMSIZ);
37042 dev->name_assign_type = old_assign_type;
37043 @@ -1253,6 +1254,11 @@
37049 + __raw_write_seqcount_end(&devnet_rename_seq);
37050 + mutex_unlock(&devnet_rename_mutex);
37055 @@ -2438,6 +2444,7 @@
37056 sd->output_queue_tailp = &q->next_sched;
37057 raise_softirq_irqoff(NET_TX_SOFTIRQ);
37058 local_irq_restore(flags);
37059 + preempt_check_resched_rt();
37062 void __netif_schedule(struct Qdisc *q)
37063 @@ -2500,6 +2507,7 @@
37064 __this_cpu_write(softnet_data.completion_queue, skb);
37065 raise_softirq_irqoff(NET_TX_SOFTIRQ);
37066 local_irq_restore(flags);
37067 + preempt_check_resched_rt();
37069 EXPORT_SYMBOL(__dev_kfree_skb_irq);
37071 @@ -3175,7 +3183,11 @@
37072 * This permits qdisc->running owner to get the lock more
37073 * often and dequeue packets faster.
37075 +#ifdef CONFIG_PREEMPT_RT_FULL
37076 + contended = true;
37078 contended = qdisc_is_running(q);
37080 if (unlikely(contended))
37081 spin_lock(&q->busylock);
37083 @@ -3246,8 +3258,10 @@
37084 #define skb_update_prio(skb)
37087 +#ifndef CONFIG_PREEMPT_RT_FULL
37088 DEFINE_PER_CPU(int, xmit_recursion);
37089 EXPORT_SYMBOL(xmit_recursion);
37093 * dev_loopback_xmit - loop back @skb
37094 @@ -3487,9 +3501,12 @@
37095 if (dev->flags & IFF_UP) {
37096 int cpu = smp_processor_id(); /* ok because BHs are off */
37098 +#ifdef CONFIG_PREEMPT_RT_FULL
37099 + if (txq->xmit_lock_owner != current) {
37101 if (txq->xmit_lock_owner != cpu) {
37102 - if (unlikely(__this_cpu_read(xmit_recursion) >
37103 - XMIT_RECURSION_LIMIT))
37105 + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
37106 goto recursion_alert;
37108 skb = validate_xmit_skb(skb, dev);
37109 @@ -3499,9 +3516,9 @@
37110 HARD_TX_LOCK(dev, txq, cpu);
37112 if (!netif_xmit_stopped(txq)) {
37113 - __this_cpu_inc(xmit_recursion);
37115 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
37116 - __this_cpu_dec(xmit_recursion);
37118 if (dev_xmit_complete(rc)) {
37119 HARD_TX_UNLOCK(dev, txq);
37121 @@ -3882,6 +3899,7 @@
37124 local_irq_restore(flags);
37125 + preempt_check_resched_rt();
37127 atomic_long_inc(&skb->dev->rx_dropped);
37129 @@ -4034,7 +4052,7 @@
37130 struct rps_dev_flow voidflow, *rflow = &voidflow;
37133 - preempt_disable();
37134 + migrate_disable();
37137 cpu = get_rps_cpu(skb->dev, skb, &rflow);
37138 @@ -4044,14 +4062,14 @@
37139 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
37142 - preempt_enable();
37143 + migrate_enable();
37147 unsigned int qtail;
37149 - ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
37151 + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
37156 @@ -4085,11 +4103,9 @@
37158 trace_netif_rx_ni_entry(skb);
37160 - preempt_disable();
37161 + local_bh_disable();
37162 err = netif_rx_internal(skb);
37163 - if (local_softirq_pending())
37165 - preempt_enable();
37166 + local_bh_enable();
37170 @@ -4607,7 +4623,7 @@
37171 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
37172 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
37173 __skb_unlink(skb, &sd->input_pkt_queue);
37175 + __skb_queue_tail(&sd->tofree_queue, skb);
37176 input_queue_head_incr(sd);
37179 @@ -4617,11 +4633,14 @@
37180 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
37181 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
37182 __skb_unlink(skb, &sd->process_queue);
37184 + __skb_queue_tail(&sd->tofree_queue, skb);
37185 input_queue_head_incr(sd);
37188 + if (!skb_queue_empty(&sd->tofree_queue))
37189 + raise_softirq_irqoff(NET_RX_SOFTIRQ);
37194 static void flush_all_backlogs(void)
37195 @@ -5131,12 +5150,14 @@
37196 sd->rps_ipi_list = NULL;
37198 local_irq_enable();
37199 + preempt_check_resched_rt();
37201 /* Send pending IPI's to kick RPS processing on remote cpus. */
37202 net_rps_send_ipi(remsd);
37205 local_irq_enable();
37206 + preempt_check_resched_rt();
37209 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
37210 @@ -5166,7 +5187,9 @@
37212 struct sk_buff *skb;
37214 + local_irq_disable();
37215 while ((skb = __skb_dequeue(&sd->process_queue))) {
37216 + local_irq_enable();
37218 __netif_receive_skb(skb);
37220 @@ -5174,9 +5197,9 @@
37221 if (++work >= quota)
37224 + local_irq_disable();
37227 - local_irq_disable();
37229 if (skb_queue_empty(&sd->input_pkt_queue)) {
37231 @@ -5214,6 +5237,7 @@
37232 local_irq_save(flags);
37233 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
37234 local_irq_restore(flags);
37235 + preempt_check_resched_rt();
37237 EXPORT_SYMBOL(__napi_schedule);
37239 @@ -5250,6 +5274,7 @@
37241 EXPORT_SYMBOL(napi_schedule_prep);
37243 +#ifndef CONFIG_PREEMPT_RT_FULL
37245 * __napi_schedule_irqoff - schedule for receive
37246 * @n: entry to schedule
37247 @@ -5261,6 +5286,7 @@
37248 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
37250 EXPORT_SYMBOL(__napi_schedule_irqoff);
37253 bool napi_complete_done(struct napi_struct *n, int work_done)
37255 @@ -5615,13 +5641,21 @@
37256 unsigned long time_limit = jiffies +
37257 usecs_to_jiffies(netdev_budget_usecs);
37258 int budget = netdev_budget;
37259 + struct sk_buff_head tofree_q;
37260 + struct sk_buff *skb;
37264 + __skb_queue_head_init(&tofree_q);
37266 local_irq_disable();
37267 + skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
37268 list_splice_init(&sd->poll_list, &list);
37269 local_irq_enable();
37271 + while ((skb = __skb_dequeue(&tofree_q)))
37275 struct napi_struct *n;
37277 @@ -5651,7 +5685,7 @@
37278 list_splice_tail(&repoll, &list);
37279 list_splice(&list, &sd->poll_list);
37280 if (!list_empty(&sd->poll_list))
37281 - __raise_softirq_irqoff(NET_RX_SOFTIRQ);
37282 + __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
37284 net_rps_action_and_irq_enable(sd);
37286 @@ -7478,7 +7512,7 @@
37287 /* Initialize queue lock */
37288 spin_lock_init(&queue->_xmit_lock);
37289 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
37290 - queue->xmit_lock_owner = -1;
37291 + netdev_queue_clear_owner(queue);
37292 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
37295 @@ -8418,6 +8452,7 @@
37297 raise_softirq_irqoff(NET_TX_SOFTIRQ);
37298 local_irq_enable();
37299 + preempt_check_resched_rt();
37302 remsd = oldsd->rps_ipi_list;
37303 @@ -8431,10 +8466,13 @@
37305 input_queue_head_incr(oldsd);
37307 - while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
37308 + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
37310 input_queue_head_incr(oldsd);
37312 + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
37318 @@ -8738,8 +8776,9 @@
37320 INIT_WORK(flush, flush_backlog);
37322 - skb_queue_head_init(&sd->input_pkt_queue);
37323 - skb_queue_head_init(&sd->process_queue);
37324 + skb_queue_head_init_raw(&sd->input_pkt_queue);
37325 + skb_queue_head_init_raw(&sd->process_queue);
37326 + skb_queue_head_init_raw(&sd->tofree_queue);
37327 INIT_LIST_HEAD(&sd->poll_list);
37328 sd->output_queue_tailp = &sd->output_queue;
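Note on the net/core/dev.c hunks above: several independent RT conversions are folded together here (raw locks for the backlog queues, preempt_check_resched_rt() after each irq-restore, per-task instead of per-CPU xmit recursion accounting, a tofree_queue drained from net_rx_action). The least obvious one is the device-rename path: the seqcount write side moves under devnet_rename_mutex and switches to the __raw_ variants, so a retrying reader can take and drop the mutex to wait for the writer instead of live-spinning against a preempted writer. A sketch of that pattern with illustrative names.

#include <linux/mutex.h>
#include <linux/seqlock.h>

static seqcount_t example_seq = SEQCNT_ZERO(example_seq);
static DEFINE_MUTEX(example_mutex);

static void example_write(void)
{
	mutex_lock(&example_mutex);
	__raw_write_seqcount_begin(&example_seq);	/* writers already serialized */
	/* ... update the data protected by example_seq ... */
	__raw_write_seqcount_end(&example_seq);
	mutex_unlock(&example_mutex);
}

static void example_read(void)
{
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&example_seq);
		/* ... copy out the protected data ... */
		if (!read_seqcount_retry(&example_seq, seq))
			break;
		/* writer active: sleep on the mutex instead of spinning */
		mutex_lock(&example_mutex);
		mutex_unlock(&example_mutex);
	} while (1);
}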
37330 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/filter.c linux-4.14/net/core/filter.c
37331 --- linux-4.14.orig/net/core/filter.c 2018-09-05 11:03:25.000000000 +0200
37332 +++ linux-4.14/net/core/filter.c 2018-09-05 11:05:07.000000000 +0200
37333 @@ -1696,7 +1696,7 @@
37337 - if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
37338 + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
37339 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
37342 @@ -1704,9 +1704,9 @@
37346 - __this_cpu_inc(xmit_recursion);
37348 ret = dev_queue_xmit(skb);
37349 - __this_cpu_dec(xmit_recursion);
37354 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/gen_estimator.c linux-4.14/net/core/gen_estimator.c
37355 --- linux-4.14.orig/net/core/gen_estimator.c 2018-09-05 11:03:25.000000000 +0200
37356 +++ linux-4.14/net/core/gen_estimator.c 2018-09-05 11:05:07.000000000 +0200
37358 struct net_rate_estimator {
37359 struct gnet_stats_basic_packed *bstats;
37360 spinlock_t *stats_lock;
37361 - seqcount_t *running;
37362 + net_seqlock_t *running;
37363 struct gnet_stats_basic_cpu __percpu *cpu_bstats;
37365 u8 intvl_log; /* period : (250ms << intvl_log) */
37366 @@ -129,7 +129,7 @@
37367 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
37368 struct net_rate_estimator __rcu **rate_est,
37369 spinlock_t *stats_lock,
37370 - seqcount_t *running,
37371 + net_seqlock_t *running,
37372 struct nlattr *opt)
37374 struct gnet_estimator *parm = nla_data(opt);
37375 @@ -222,7 +222,7 @@
37376 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
37377 struct net_rate_estimator __rcu **rate_est,
37378 spinlock_t *stats_lock,
37379 - seqcount_t *running, struct nlattr *opt)
37380 + net_seqlock_t *running, struct nlattr *opt)
37382 return gen_new_estimator(bstats, cpu_bstats, rate_est,
37383 stats_lock, running, opt);
37384 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/gen_stats.c linux-4.14/net/core/gen_stats.c
37385 --- linux-4.14.orig/net/core/gen_stats.c 2018-09-05 11:03:25.000000000 +0200
37386 +++ linux-4.14/net/core/gen_stats.c 2018-09-05 11:05:07.000000000 +0200
37387 @@ -142,7 +142,7 @@
37391 -__gnet_stats_copy_basic(const seqcount_t *running,
37392 +__gnet_stats_copy_basic(net_seqlock_t *running,
37393 struct gnet_stats_basic_packed *bstats,
37394 struct gnet_stats_basic_cpu __percpu *cpu,
37395 struct gnet_stats_basic_packed *b)
37396 @@ -155,10 +155,10 @@
37400 - seq = read_seqcount_begin(running);
37401 + seq = net_seq_begin(running);
37402 bstats->bytes = b->bytes;
37403 bstats->packets = b->packets;
37404 - } while (running && read_seqcount_retry(running, seq));
37405 + } while (running && net_seq_retry(running, seq));
37407 EXPORT_SYMBOL(__gnet_stats_copy_basic);
37409 @@ -176,7 +176,7 @@
37410 * if the room in the socket buffer was not sufficient.
37413 -gnet_stats_copy_basic(const seqcount_t *running,
37414 +gnet_stats_copy_basic(net_seqlock_t *running,
37415 struct gnet_dump *d,
37416 struct gnet_stats_basic_cpu __percpu *cpu,
37417 struct gnet_stats_basic_packed *b)
37418 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/pktgen.c linux-4.14/net/core/pktgen.c
37419 --- linux-4.14.orig/net/core/pktgen.c 2017-11-12 19:46:13.000000000 +0100
37420 +++ linux-4.14/net/core/pktgen.c 2018-09-05 11:05:07.000000000 +0200
37421 @@ -2252,7 +2252,8 @@
37423 struct hrtimer_sleeper t;
37425 - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
37426 + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS,
37428 hrtimer_set_expires(&t.timer, spin_until);
37430 remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
37431 @@ -2267,7 +2268,6 @@
37432 } while (ktime_compare(end_time, spin_until) < 0);
37434 /* see do_nanosleep */
37435 - hrtimer_init_sleeper(&t, current);
37437 set_current_state(TASK_INTERRUPTIBLE);
37438 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
37439 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/skbuff.c linux-4.14/net/core/skbuff.c
37440 --- linux-4.14.orig/net/core/skbuff.c 2018-09-05 11:03:25.000000000 +0200
37441 +++ linux-4.14/net/core/skbuff.c 2018-09-05 11:05:07.000000000 +0200
37443 #include <linux/errqueue.h>
37444 #include <linux/prefetch.h>
37445 #include <linux/if_vlan.h>
37446 +#include <linux/locallock.h>
37448 #include <net/protocol.h>
37449 #include <net/dst.h>
37450 @@ -330,6 +331,8 @@
37452 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
37453 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
37454 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
37455 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
37457 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
37459 @@ -337,10 +340,10 @@
37460 unsigned long flags;
37463 - local_irq_save(flags);
37464 + local_lock_irqsave(netdev_alloc_lock, flags);
37465 nc = this_cpu_ptr(&netdev_alloc_cache);
37466 data = page_frag_alloc(nc, fragsz, gfp_mask);
37467 - local_irq_restore(flags);
37468 + local_unlock_irqrestore(netdev_alloc_lock, flags);
37472 @@ -359,9 +362,13 @@
37474 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
37476 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37477 + struct napi_alloc_cache *nc;
37480 - return page_frag_alloc(&nc->page, fragsz, gfp_mask);
37481 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37482 + data = page_frag_alloc(&nc->page, fragsz, gfp_mask);
37483 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37487 void *napi_alloc_frag(unsigned int fragsz)
37488 @@ -408,13 +415,13 @@
37489 if (sk_memalloc_socks())
37490 gfp_mask |= __GFP_MEMALLOC;
37492 - local_irq_save(flags);
37493 + local_lock_irqsave(netdev_alloc_lock, flags);
37495 nc = this_cpu_ptr(&netdev_alloc_cache);
37496 data = page_frag_alloc(nc, len, gfp_mask);
37497 pfmemalloc = nc->pfmemalloc;
37499 - local_irq_restore(flags);
37500 + local_unlock_irqrestore(netdev_alloc_lock, flags);
37502 if (unlikely(!data))
37504 @@ -455,9 +462,10 @@
37505 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
37508 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37509 + struct napi_alloc_cache *nc;
37510 struct sk_buff *skb;
37514 len += NET_SKB_PAD + NET_IP_ALIGN;
37516 @@ -475,7 +483,10 @@
37517 if (sk_memalloc_socks())
37518 gfp_mask |= __GFP_MEMALLOC;
37520 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37521 data = page_frag_alloc(&nc->page, len, gfp_mask);
37522 + pfmemalloc = nc->page.pfmemalloc;
37523 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37524 if (unlikely(!data))
37527 @@ -486,7 +497,7 @@
37530 /* use OR instead of assignment to avoid clearing of bits in mask */
37531 - if (nc->page.pfmemalloc)
37533 skb->pfmemalloc = 1;
37534 skb->head_frag = 1;
37536 @@ -718,23 +729,26 @@
37538 void __kfree_skb_flush(void)
37540 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37541 + struct napi_alloc_cache *nc;
37543 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37544 /* flush skb_cache if containing objects */
37545 if (nc->skb_count) {
37546 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
37550 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37553 static inline void _kfree_skb_defer(struct sk_buff *skb)
37555 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37556 + struct napi_alloc_cache *nc;
37558 /* drop skb->head and call any destructors for packet */
37559 skb_release_all(skb);
37561 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37562 /* record skb to CPU local list */
37563 nc->skb_cache[nc->skb_count++] = skb;
37565 @@ -749,6 +763,7 @@
37569 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37571 void __kfree_skb_defer(struct sk_buff *skb)
37573 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/sock.c linux-4.14/net/core/sock.c
37574 --- linux-4.14.orig/net/core/sock.c 2018-09-05 11:03:25.000000000 +0200
37575 +++ linux-4.14/net/core/sock.c 2018-09-05 11:05:07.000000000 +0200
37576 @@ -2757,12 +2757,11 @@
37577 if (sk->sk_lock.owned)
37579 sk->sk_lock.owned = 1;
37580 - spin_unlock(&sk->sk_lock.slock);
37581 + spin_unlock_bh(&sk->sk_lock.slock);
37583 * The sk_lock has mutex_lock() semantics here:
37585 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
37586 - local_bh_enable();
37588 EXPORT_SYMBOL(lock_sock_nested);
37590 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/ipv4/icmp.c linux-4.14/net/ipv4/icmp.c
37591 --- linux-4.14.orig/net/ipv4/icmp.c 2018-09-05 11:03:25.000000000 +0200
37592 +++ linux-4.14/net/ipv4/icmp.c 2018-09-05 11:05:07.000000000 +0200
37594 #include <linux/string.h>
37595 #include <linux/netfilter_ipv4.h>
37596 #include <linux/slab.h>
37597 +#include <linux/locallock.h>
37598 #include <net/snmp.h>
37599 #include <net/ip.h>
37600 #include <net/route.h>
37601 @@ -204,6 +205,8 @@
37603 * On SMP we have one ICMP socket per-cpu.
37605 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
37607 static struct sock *icmp_sk(struct net *net)
37609 return *this_cpu_ptr(net->ipv4.icmp_sk);
37610 @@ -214,12 +217,16 @@
37614 + if (!local_trylock(icmp_sk_lock))
37619 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
37620 /* This can happen if the output path signals a
37621 * dst_link_failure() for an outgoing ICMP packet.
37623 + local_unlock(icmp_sk_lock);
37627 @@ -228,6 +235,7 @@
37628 static inline void icmp_xmit_unlock(struct sock *sk)
37630 spin_unlock(&sk->sk_lock.slock);
37631 + local_unlock(icmp_sk_lock);
37634 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
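Note on the net/ipv4/icmp.c hunks above: on RT, disabling BH no longer guarantees exclusive access to the per-CPU ICMP socket, so icmp_xmit_lock() additionally takes icmp_sk_lock (backing off if it is contended) and icmp_xmit_unlock() drops it again; the net/ipv4/tcp_ipv4.c hunks below apply the same idea around ip_send_unicast_reply(). A condensed view of the lock/unlock pairing, with illustrative example_* names and the socket passed in directly for brevity.

#include <linux/locallock.h>
#include <net/sock.h>

static DEFINE_LOCAL_IRQ_LOCK(example_sk_lock);

static struct sock *example_xmit_lock(struct sock *sk)
{
	/* RT: serialize users of the per-CPU socket explicitly, BH-off is not enough */
	if (!local_trylock(example_sk_lock))
		return NULL;

	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
		/* can happen when the output path fails an outgoing ICMP packet */
		local_unlock(example_sk_lock);
		return NULL;
	}
	return sk;
}

static void example_xmit_unlock(struct sock *sk)
{
	spin_unlock(&sk->sk_lock.slock);
	local_unlock(example_sk_lock);
}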
37635 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/ipv4/tcp_ipv4.c linux-4.14/net/ipv4/tcp_ipv4.c
37636 --- linux-4.14.orig/net/ipv4/tcp_ipv4.c 2018-09-05 11:03:25.000000000 +0200
37637 +++ linux-4.14/net/ipv4/tcp_ipv4.c 2018-09-05 11:05:07.000000000 +0200
37639 #include <linux/init.h>
37640 #include <linux/times.h>
37641 #include <linux/slab.h>
37642 +#include <linux/locallock.h>
37644 #include <net/net_namespace.h>
37645 #include <net/icmp.h>
37646 @@ -580,6 +581,7 @@
37648 EXPORT_SYMBOL(tcp_v4_send_check);
37650 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
37652 * This routine will send an RST to the other tcp.
37654 @@ -710,6 +712,7 @@
37655 arg.tos = ip_hdr(skb)->tos;
37656 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
37657 local_bh_disable();
37658 + local_lock(tcp_sk_lock);
37659 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
37660 skb, &TCP_SKB_CB(skb)->header.h4.opt,
37661 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
37662 @@ -717,6 +720,7 @@
37664 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
37665 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
37666 + local_unlock(tcp_sk_lock);
37669 #ifdef CONFIG_TCP_MD5SIG
37670 @@ -796,12 +800,14 @@
37672 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
37673 local_bh_disable();
37674 + local_lock(tcp_sk_lock);
37675 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
37676 skb, &TCP_SKB_CB(skb)->header.h4.opt,
37677 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
37678 &arg, arg.iov[0].iov_len);
37680 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
37681 + local_unlock(tcp_sk_lock);
37685 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/Kconfig linux-4.14/net/Kconfig
37686 --- linux-4.14.orig/net/Kconfig 2017-11-12 19:46:13.000000000 +0100
37687 +++ linux-4.14/net/Kconfig 2018-09-05 11:05:07.000000000 +0200
37688 @@ -272,7 +272,7 @@
37690 config NET_RX_BUSY_POLL
37693 + default y if !PREEMPT_RT_FULL
37697 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/mac80211/rx.c linux-4.14/net/mac80211/rx.c
37698 --- linux-4.14.orig/net/mac80211/rx.c 2018-09-05 11:03:25.000000000 +0200
37699 +++ linux-4.14/net/mac80211/rx.c 2018-09-05 11:05:07.000000000 +0200
37700 @@ -4252,7 +4252,7 @@
37701 struct ieee80211_supported_band *sband;
37702 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
37704 - WARN_ON_ONCE(softirq_count() == 0);
37705 + WARN_ON_ONCE_NONRT(softirq_count() == 0);
37707 if (WARN_ON(status->band >= NUM_NL80211_BANDS))
37709 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/netfilter/core.c linux-4.14/net/netfilter/core.c
37710 --- linux-4.14.orig/net/netfilter/core.c 2017-11-12 19:46:13.000000000 +0100
37711 +++ linux-4.14/net/netfilter/core.c 2018-09-05 11:05:07.000000000 +0200
37713 #include <linux/inetdevice.h>
37714 #include <linux/proc_fs.h>
37715 #include <linux/mutex.h>
37716 +#include <linux/locallock.h>
37717 #include <linux/mm.h>
37718 #include <linux/rcupdate.h>
37719 #include <net/net_namespace.h>
37722 #include "nf_internals.h"
37724 +#ifdef CONFIG_PREEMPT_RT_BASE
37725 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
37726 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
37729 static DEFINE_MUTEX(afinfo_mutex);
37731 const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
37732 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/packet/af_packet.c linux-4.14/net/packet/af_packet.c
37733 --- linux-4.14.orig/net/packet/af_packet.c 2018-09-05 11:03:25.000000000 +0200
37734 +++ linux-4.14/net/packet/af_packet.c 2018-09-05 11:05:07.000000000 +0200
37736 #include <linux/if_packet.h>
37737 #include <linux/wireless.h>
37738 #include <linux/kernel.h>
37739 +#include <linux/delay.h>
37740 #include <linux/kmod.h>
37741 #include <linux/slab.h>
37742 #include <linux/vmalloc.h>
37743 @@ -707,7 +708,7 @@
37744 if (BLOCK_NUM_PKTS(pbd)) {
37745 while (atomic_read(&pkc->blk_fill_in_prog)) {
37746 /* Waiting for skb_copy_bits to finish... */
37752 @@ -969,7 +970,7 @@
37753 if (!(status & TP_STATUS_BLK_TMO)) {
37754 while (atomic_read(&pkc->blk_fill_in_prog)) {
37755 /* Waiting for skb_copy_bits to finish... */
37760 prb_close_block(pkc, pbd, po, status);
37761 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/rds/ib_rdma.c linux-4.14/net/rds/ib_rdma.c
37762 --- linux-4.14.orig/net/rds/ib_rdma.c 2017-11-12 19:46:13.000000000 +0100
37763 +++ linux-4.14/net/rds/ib_rdma.c 2018-09-05 11:05:07.000000000 +0200
37765 #include <linux/slab.h>
37766 #include <linux/rculist.h>
37767 #include <linux/llist.h>
37768 +#include <linux/delay.h>
37770 #include "rds_single_path.h"
37772 @@ -210,7 +211,7 @@
37773 for_each_online_cpu(cpu) {
37774 flag = &per_cpu(clean_list_grace, cpu);
37775 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
37781 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/rxrpc/security.c linux-4.14/net/rxrpc/security.c
37782 --- linux-4.14.orig/net/rxrpc/security.c 2017-11-12 19:46:13.000000000 +0100
37783 +++ linux-4.14/net/rxrpc/security.c 2018-09-05 11:05:07.000000000 +0200
37785 #include <keys/rxrpc-type.h>
37786 #include "ar-internal.h"
37788 -static LIST_HEAD(rxrpc_security_methods);
37789 -static DECLARE_RWSEM(rxrpc_security_sem);
37791 static const struct rxrpc_security *rxrpc_security_types[] = {
37792 [RXRPC_SECURITY_NONE] = &rxrpc_no_security,
37793 #ifdef CONFIG_RXKAD
37794 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/sched/sch_api.c linux-4.14/net/sched/sch_api.c
37795 --- linux-4.14.orig/net/sched/sch_api.c 2017-11-12 19:46:13.000000000 +0100
37796 +++ linux-4.14/net/sched/sch_api.c 2018-09-05 11:05:07.000000000 +0200
37797 @@ -1081,7 +1081,7 @@
37798 rcu_assign_pointer(sch->stab, stab);
37800 if (tca[TCA_RATE]) {
37801 - seqcount_t *running;
37802 + net_seqlock_t *running;
37805 if (sch->flags & TCQ_F_MQROOT)
37806 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/sched/sch_generic.c linux-4.14/net/sched/sch_generic.c
37807 --- linux-4.14.orig/net/sched/sch_generic.c 2018-09-05 11:03:25.000000000 +0200
37808 +++ linux-4.14/net/sched/sch_generic.c 2018-09-05 11:05:07.000000000 +0200
37809 @@ -429,7 +429,11 @@
37810 .ops = &noop_qdisc_ops,
37811 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
37812 .dev_queue = &noop_netdev_queue,
37813 +#ifdef CONFIG_PREEMPT_RT_BASE
37814 + .running = __SEQLOCK_UNLOCKED(noop_qdisc.running),
37816 .running = SEQCNT_ZERO(noop_qdisc.running),
37818 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
37820 EXPORT_SYMBOL(noop_qdisc);
37821 @@ -628,9 +632,17 @@
37822 lockdep_set_class(&sch->busylock,
37823 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
37825 +#ifdef CONFIG_PREEMPT_RT_BASE
37826 + seqlock_init(&sch->running);
37827 + lockdep_set_class(&sch->running.seqcount,
37828 + dev->qdisc_running_key ?: &qdisc_running_key);
37829 + lockdep_set_class(&sch->running.lock,
37830 + dev->qdisc_running_key ?: &qdisc_running_key);
37832 seqcount_init(&sch->running);
37833 lockdep_set_class(&sch->running,
37834 dev->qdisc_running_key ?: &qdisc_running_key);
37838 sch->enqueue = ops->enqueue;
37839 @@ -933,7 +945,7 @@
37840 /* Wait for outstanding qdisc_run calls. */
37841 list_for_each_entry(dev, head, close_list) {
37842 while (some_qdisc_is_busy(dev))
37845 /* The new qdisc is assigned at this point so we can safely
37846 * unwind stale skb lists and qdisc statistics
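The qdisc hunks above switch the "running" marker to a real seqlock when PREEMPT_RT_BASE is set, and sch_api.c refers to it through a net_seqlock_t type provided elsewhere in this patch. A sketch of the shape such a wrapper takes (the reader-side helper names are illustrative):

	#include <linux/seqlock.h>

	/* On RT the write side of the qdisc 'running' marker can be preempted,
	 * so a seqlock (whose writer holds a lock) replaces the bare seqcount;
	 * non-RT keeps the lighter seqcount. */
	#ifdef CONFIG_PREEMPT_RT_BASE
	# define net_seqlock_t			seqlock_t
	# define net_seq_begin(r)		read_seqbegin(r)
	# define net_seq_retry(r, s)		read_seqretry((r), (s))
	#else
	# define net_seqlock_t			seqcount_t
	# define net_seq_begin(r)		read_seqcount_begin(r)
	# define net_seq_retry(r, s)		read_seqcount_retry((r), (s))
	#endif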
37848 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/sunrpc/svc_xprt.c linux-4.14/net/sunrpc/svc_xprt.c
37849 --- linux-4.14.orig/net/sunrpc/svc_xprt.c 2017-11-12 19:46:13.000000000 +0100
37850 +++ linux-4.14/net/sunrpc/svc_xprt.c 2018-09-05 11:05:07.000000000 +0200
37851 @@ -396,7 +396,7 @@
37856 + cpu = get_cpu_light();
37857 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
37859 atomic_long_inc(&pool->sp_stats.packets);
37860 @@ -432,7 +432,7 @@
37862 atomic_long_inc(&pool->sp_stats.threads_woken);
37863 wake_up_process(rqstp->rq_task);
37869 @@ -453,7 +453,7 @@
37876 trace_svc_xprt_do_enqueue(xprt, rqstp);
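get_cpu_light() is introduced elsewhere in this patch; unlike get_cpu() it does not disable preemption, it only pins the task to the current CPU via migrate_disable(). A minimal sketch with hypothetical names:

	#include <linux/smp.h>	/* get_cpu_light()/put_cpu_light() per this patch */

	/* pick a per-CPU pool without creating a non-preemptible section */
	static unsigned int pick_pool_cpu(void)
	{
		unsigned int cpu;

		cpu = get_cpu_light();	/* migrate_disable() + smp_processor_id() */
		/* ... look up this CPU's pool, take its lock, enqueue ... */
		put_cpu_light();	/* migrate_enable() */
		return cpu;
	}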
37878 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/xfrm/xfrm_state.c linux-4.14/net/xfrm/xfrm_state.c
37879 --- linux-4.14.orig/net/xfrm/xfrm_state.c 2018-09-05 11:03:25.000000000 +0200
37880 +++ linux-4.14/net/xfrm/xfrm_state.c 2018-09-05 11:05:07.000000000 +0200
37881 @@ -427,7 +427,7 @@
37883 static void xfrm_state_gc_destroy(struct xfrm_state *x)
37885 - tasklet_hrtimer_cancel(&x->mtimer);
37886 + hrtimer_cancel(&x->mtimer);
37887 del_timer_sync(&x->rtimer);
37890 @@ -472,8 +472,8 @@
37892 static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
37894 - struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
37895 - struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
37896 + struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer);
37897 + enum hrtimer_restart ret = HRTIMER_NORESTART;
37898 unsigned long now = get_seconds();
37899 long next = LONG_MAX;
37901 @@ -537,7 +537,8 @@
37902 km_state_expired(x, 0, 0);
37904 if (next != LONG_MAX) {
37905 - tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
37906 + hrtimer_forward_now(&x->mtimer, ktime_set(next, 0));
37907 + ret = HRTIMER_RESTART;
37911 @@ -554,7 +555,7 @@
37914 spin_unlock(&x->lock);
37915 - return HRTIMER_NORESTART;
37919 static void xfrm_replay_timer_handler(unsigned long data);
37920 @@ -573,8 +574,8 @@
37921 INIT_HLIST_NODE(&x->bydst);
37922 INIT_HLIST_NODE(&x->bysrc);
37923 INIT_HLIST_NODE(&x->byspi);
37924 - tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
37925 - CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
37926 + hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT);
37927 + x->mtimer.function = xfrm_timer_handler;
37928 setup_timer(&x->rtimer, xfrm_replay_timer_handler,
37930 x->curlft.add_time = get_seconds();
37931 @@ -1031,7 +1032,9 @@
37932 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
37934 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
37935 - tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
37936 + hrtimer_start(&x->mtimer,
37937 + ktime_set(net->xfrm.sysctl_acq_expires, 0),
37938 + HRTIMER_MODE_REL_SOFT);
37939 net->xfrm.state_num++;
37940 xfrm_hash_grow_check(net, x->bydst.next != NULL);
37941 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
37942 @@ -1142,7 +1145,7 @@
37943 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
37946 - tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
37947 + hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
37948 if (x->replay_maxage)
37949 mod_timer(&x->rtimer, jiffies + x->replay_maxage);
37951 @@ -1246,7 +1249,9 @@
37953 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
37954 xfrm_state_hold(x);
37955 - tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
37956 + hrtimer_start(&x->mtimer,
37957 + ktime_set(net->xfrm.sysctl_acq_expires, 0),
37958 + HRTIMER_MODE_REL_SOFT);
37959 list_add(&x->km.all, &net->xfrm.state_all);
37960 hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
37961 h = xfrm_src_hash(net, daddr, saddr, family);
37962 @@ -1546,7 +1551,8 @@
37963 memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
37966 - tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
37967 + hrtimer_start(&x1->mtimer, ktime_set(1, 0),
37968 + HRTIMER_MODE_REL_SOFT);
37969 if (x1->curlft.use_time)
37970 xfrm_state_check_expire(x1);
37972 @@ -1570,7 +1576,7 @@
37973 if (x->curlft.bytes >= x->lft.hard_byte_limit ||
37974 x->curlft.packets >= x->lft.hard_packet_limit) {
37975 x->km.state = XFRM_STATE_EXPIRED;
37976 - tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
37977 + hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT);
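The xfrm conversion above replaces a tasklet_hrtimer with an ordinary hrtimer armed in a _SOFT mode, so the callback still runs in softirq context and may rearm itself via hrtimer_forward_now(). A self-contained sketch of that pattern (the timer, flag and period below are illustrative, not from the patch):

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static struct hrtimer sample_timer;	/* hypothetical */
	static bool sample_rearm;		/* hypothetical condition */

	static enum hrtimer_restart sample_timer_fn(struct hrtimer *t)
	{
		/* runs in softirq context because the timer was started _SOFT */
		if (sample_rearm) {
			hrtimer_forward_now(t, ms_to_ktime(100));
			return HRTIMER_RESTART;
		}
		return HRTIMER_NORESTART;
	}

	static void sample_timer_setup(void)
	{
		hrtimer_init(&sample_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
		sample_timer.function = sample_timer_fn;
		hrtimer_start(&sample_timer, ms_to_ktime(100), HRTIMER_MODE_REL_SOFT);
	}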
37981 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/samples/trace_events/trace-events-sample.c linux-4.14/samples/trace_events/trace-events-sample.c
37982 --- linux-4.14.orig/samples/trace_events/trace-events-sample.c 2017-11-12 19:46:13.000000000 +0100
37983 +++ linux-4.14/samples/trace_events/trace-events-sample.c 2018-09-05 11:05:07.000000000 +0200
37986 /* Silly tracepoints */
37987 trace_foo_bar("hello", cnt, array, random_strings[len],
37988 - &current->cpus_allowed);
37989 + current->cpus_ptr);
37991 trace_foo_with_template_simple("HELLO", cnt);
37993 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/scripts/mkcompile_h linux-4.14/scripts/mkcompile_h
37994 --- linux-4.14.orig/scripts/mkcompile_h 2017-11-12 19:46:13.000000000 +0100
37995 +++ linux-4.14/scripts/mkcompile_h 2018-09-05 11:05:07.000000000 +0200
38004 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
38008 if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
38009 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
38010 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
38011 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
38013 # Truncate to maximum length
38014 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/security/apparmor/include/path.h linux-4.14/security/apparmor/include/path.h
38015 --- linux-4.14.orig/security/apparmor/include/path.h 2017-11-12 19:46:13.000000000 +0100
38016 +++ linux-4.14/security/apparmor/include/path.h 2018-09-05 11:05:07.000000000 +0200
38020 #include <linux/percpu.h>
38021 -#include <linux/preempt.h>
38022 +#include <linux/locallock.h>
38024 DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
38025 +DECLARE_LOCAL_IRQ_LOCK(aa_buffers_lock);
38027 #define COUNT_ARGS(X...) COUNT_ARGS_HELPER(, ##X, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
38028 #define COUNT_ARGS_HELPER(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, X...) n
38029 @@ -55,12 +56,24 @@
38031 #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++)
38033 -#ifdef CONFIG_DEBUG_PREEMPT
38034 +#ifdef CONFIG_PREEMPT_RT_BASE
38036 +static inline void AA_BUG_PREEMPT_ENABLED(const char *s)
38038 + struct local_irq_lock *lv;
38040 + lv = this_cpu_ptr(&aa_buffers_lock);
38041 + WARN_ONCE(lv->owner != current,
38042 + "__get_buffer without aa_buffers_lock\n");
38045 +#elif defined(CONFIG_DEBUG_PREEMPT)
38046 #define AA_BUG_PREEMPT_ENABLED(X) AA_BUG(preempt_count() <= 0, X)
38048 #define AA_BUG_PREEMPT_ENABLED(X) /* nop */
38052 #define __get_buffer(N) ({ \
38053 struct aa_buffers *__cpu_var; \
38054 AA_BUG_PREEMPT_ENABLED("__get_buffer without preempt disabled"); \
38055 @@ -73,14 +86,14 @@
38057 #define get_buffers(X...) \
38059 - preempt_disable(); \
38060 + local_lock(aa_buffers_lock); \
38061 __get_buffers(X); \
38064 #define put_buffers(X, Y...) \
38066 __put_buffers(X, Y); \
38067 - preempt_enable(); \
38068 + local_unlock(aa_buffers_lock); \
38071 #endif /* __AA_PATH_H */
38072 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/security/apparmor/lsm.c linux-4.14/security/apparmor/lsm.c
38073 --- linux-4.14.orig/security/apparmor/lsm.c 2017-11-12 19:46:13.000000000 +0100
38074 +++ linux-4.14/security/apparmor/lsm.c 2018-09-05 11:05:07.000000000 +0200
38076 int apparmor_initialized;
38078 DEFINE_PER_CPU(struct aa_buffers, aa_buffers);
38080 +DEFINE_LOCAL_IRQ_LOCK(aa_buffers_lock);
38083 * LSM hook functions
38084 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/sound/core/pcm_native.c linux-4.14/sound/core/pcm_native.c
38085 --- linux-4.14.orig/sound/core/pcm_native.c 2018-09-05 11:03:25.000000000 +0200
38086 +++ linux-4.14/sound/core/pcm_native.c 2018-09-05 11:05:07.000000000 +0200
38087 @@ -148,7 +148,7 @@
38088 void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
38090 if (!substream->pcm->nonatomic)
38091 - local_irq_disable();
38092 + local_irq_disable_nort();
38093 snd_pcm_stream_lock(substream);
38095 EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
38096 @@ -163,7 +163,7 @@
38098 snd_pcm_stream_unlock(substream);
38099 if (!substream->pcm->nonatomic)
38100 - local_irq_enable();
38101 + local_irq_enable_nort();
38103 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
38105 @@ -171,7 +171,7 @@
38107 unsigned long flags = 0;
38108 if (!substream->pcm->nonatomic)
38109 - local_irq_save(flags);
38110 + local_irq_save_nort(flags);
38111 snd_pcm_stream_lock(substream);
38114 @@ -189,7 +189,7 @@
38116 snd_pcm_stream_unlock(substream);
38117 if (!substream->pcm->nonatomic)
38118 - local_irq_restore(flags);
38119 + local_irq_restore_nort(flags);
38121 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
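The _nort() variants used above are defined elsewhere in this patch; they keep the interrupts-off section on mainline kernels but become no-ops on RT, where the stream lock taken next is a sleeping lock. Roughly (a sketch, not the patch's exact definitions):

	#ifdef CONFIG_PREEMPT_RT_FULL
	# define local_irq_disable_nort()	do { } while (0)
	# define local_irq_enable_nort()	do { } while (0)
	#else
	# define local_irq_disable_nort()	local_irq_disable()
	# define local_irq_enable_nort()	local_irq_enable()
	#endif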
38123 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/sound/drivers/dummy.c linux-4.14/sound/drivers/dummy.c
38124 --- linux-4.14.orig/sound/drivers/dummy.c 2017-11-12 19:46:13.000000000 +0100
38125 +++ linux-4.14/sound/drivers/dummy.c 2018-09-05 11:05:07.000000000 +0200
38126 @@ -376,17 +376,9 @@
38127 ktime_t period_time;
38129 struct hrtimer timer;
38130 - struct tasklet_struct tasklet;
38131 struct snd_pcm_substream *substream;
38134 -static void dummy_hrtimer_pcm_elapsed(unsigned long priv)
38136 - struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv;
38137 - if (atomic_read(&dpcm->running))
38138 - snd_pcm_period_elapsed(dpcm->substream);
38141 static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
38143 struct dummy_hrtimer_pcm *dpcm;
38144 @@ -394,7 +386,14 @@
38145 dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer);
38146 if (!atomic_read(&dpcm->running))
38147 return HRTIMER_NORESTART;
38148 - tasklet_schedule(&dpcm->tasklet);
38150 + * In cases of XRUN and draining, this calls .trigger to stop PCM
38153 + snd_pcm_period_elapsed(dpcm->substream);
38154 + if (!atomic_read(&dpcm->running))
38155 + return HRTIMER_NORESTART;
38157 hrtimer_forward_now(timer, dpcm->period_time);
38158 return HRTIMER_RESTART;
38160 @@ -404,7 +403,7 @@
38161 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
38163 dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer);
38164 - hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL);
38165 + hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT);
38166 atomic_set(&dpcm->running, 1);
38169 @@ -414,14 +413,14 @@
38170 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
38172 atomic_set(&dpcm->running, 0);
38173 - hrtimer_cancel(&dpcm->timer);
38174 + if (!hrtimer_callback_running(&dpcm->timer))
38175 + hrtimer_cancel(&dpcm->timer);
38179 static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm)
38181 hrtimer_cancel(&dpcm->timer);
38182 - tasklet_kill(&dpcm->tasklet);
38185 static snd_pcm_uframes_t
38186 @@ -466,12 +465,10 @@
38189 substream->runtime->private_data = dpcm;
38190 - hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
38191 + hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
38192 dpcm->timer.function = dummy_hrtimer_callback;
38193 dpcm->substream = substream;
38194 atomic_set(&dpcm->running, 0);
38195 - tasklet_init(&dpcm->tasklet, dummy_hrtimer_pcm_elapsed,
38196 - (unsigned long)dpcm);
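With the tasklet gone, snd_pcm_period_elapsed() is called straight from the hrtimer callback, which can in turn invoke the STOP trigger; cancelling the timer from inside its own callback would deadlock, hence the hrtimer_callback_running() check above. The guard pattern, as a sketch with illustrative names:

	#include <linux/atomic.h>
	#include <linux/hrtimer.h>

	/* stop a timer that may be stopping itself from within its callback */
	static void sample_timer_stop(struct hrtimer *t, atomic_t *running)
	{
		atomic_set(running, 0);
		if (!hrtimer_callback_running(t))
			hrtimer_cancel(t);
		/* if the callback is running, it sees 'running' == 0 and
		 * returns HRTIMER_NORESTART instead of rearming */
	}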
38200 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/functions linux-4.14/tools/testing/selftests/ftrace/test.d/functions
38201 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/functions 2018-09-05 11:03:25.000000000 +0200
38202 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/functions 2018-09-05 11:05:07.000000000 +0200
38204 echo 0 > events/enable
38207 +clear_synthetic_events() { # reset all current synthetic events
38208 + grep -v ^# synthetic_events |
38209 + while read line; do
38210 + echo "!$line" >> synthetic_events
38214 initialize_ftrace() { # Reset ftrace to initial-state
38215 # As the initial state, ftrace will be set to nop tracer,
38216 # no events, no triggers, no filters, no function filters,
38217 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
38218 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc 1970-01-01 01:00:00.000000000 +0100
38219 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc 2018-09-05 11:05:07.000000000 +0200
38222 +# description: event trigger - test extended error support
38237 +if [ ! -f set_event ]; then
38238 + echo "event tracing is not supported"
38242 +if [ ! -f synthetic_events ]; then
38243 + echo "synthetic event is not supported"
38250 +echo "Test extended error support"
38251 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38252 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger &>/dev/null
38253 +if ! grep -q "ERROR:" events/sched/sched_wakeup/hist; then
38254 + fail "Failed to generate extended error in histogram"
38260 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
38261 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc 1970-01-01 01:00:00.000000000 +0100
38262 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc 2018-09-05 11:05:07.000000000 +0200
38265 +# description: event trigger - test field variable support
38279 +if [ ! -f set_event ]; then
38280 + echo "event tracing is not supported"
38284 +if [ ! -f synthetic_events ]; then
38285 + echo "synthetic event is not supported"
38289 +clear_synthetic_events
38293 +echo "Test field variable support"
38295 +echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events
38296 +echo 'hist:keys=comm:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
38297 +echo 'hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
38298 +echo 'hist:keys=pid,prio,comm:vals=lat:sort=pid,prio' > events/synthetic/wakeup_latency/trigger
38300 +ping localhost -c 3
38301 +if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
38302 + fail "Failed to create inter-event histogram"
38305 +if ! grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
38306 + fail "Failed to create histogram with field variable"
38309 +echo '!hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
38311 +if grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
38312 + fail "Failed to remove histogram with field variable"
38318 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
38319 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc 1970-01-01 01:00:00.000000000 +0100
38320 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc 2018-09-05 11:05:07.000000000 +0200
38323 +# description: event trigger - test inter-event combined histogram trigger
38337 +if [ ! -f set_event ]; then
38338 + echo "event tracing is not supported"
38342 +if [ ! -f synthetic_events ]; then
38343 + echo "synthetic event is not supported"
38349 +clear_synthetic_events
38351 +echo "Test create synthetic event"
38353 +echo 'waking_latency u64 lat pid_t pid' > synthetic_events
38354 +if [ ! -d events/synthetic/waking_latency ]; then
38355 + fail "Failed to create waking_latency synthetic event"
38358 +echo "Test combined histogram"
38360 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
38361 +echo 'hist:keys=pid:waking_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).waking_latency($waking_lat,pid) if comm=="ping"' > events/sched/sched_wakeup/trigger
38362 +echo 'hist:keys=pid,lat:sort=pid,lat' > events/synthetic/waking_latency/trigger
38364 +echo 'wakeup_latency u64 lat pid_t pid' >> synthetic_events
38365 +echo 'hist:keys=pid:ts1=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger
38366 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts1:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid) if next_comm=="ping"' > events/sched/sched_switch/trigger
38368 +echo 'waking+wakeup_latency u64 lat; pid_t pid' >> synthetic_events
38369 +echo 'hist:keys=pid,lat:sort=pid,lat:ww_lat=$waking_lat+$wakeup_lat:onmatch(synthetic.wakeup_latency).waking+wakeup_latency($ww_lat,pid)' >> events/synthetic/wakeup_latency/trigger
38370 +echo 'hist:keys=pid,lat:sort=pid,lat' >> events/synthetic/waking+wakeup_latency/trigger
38372 +ping localhost -c 3
38373 +if ! grep -q "pid:" events/synthetic/waking+wakeup_latency/hist; then
38374 + fail "Failed to create combined histogram"
38380 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
38381 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc 1970-01-01 01:00:00.000000000 +0100
38382 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc 2018-09-05 11:05:07.000000000 +0200
38385 +# description: event trigger - test inter-event histogram trigger onmatch action
38399 +if [ ! -f set_event ]; then
38400 + echo "event tracing is not supported"
38404 +if [ ! -f synthetic_events ]; then
38405 + echo "synthetic event is not supported"
38409 +clear_synthetic_events
38413 +echo "Test create synthetic event"
38415 +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38416 +if [ ! -d events/synthetic/wakeup_latency ]; then
38417 + fail "Failed to create wakeup_latency synthetic event"
38420 +echo "Test create histogram for synthetic event"
38421 +echo "Test histogram variables,simple expression support and onmatch action"
38423 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38424 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
38425 +echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
38426 +ping localhost -c 5
38427 +if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
38428 + fail "Failed to create onmatch action inter-event histogram"
38434 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
38435 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc 1970-01-01 01:00:00.000000000 +0100
38436 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc 2018-09-05 11:05:07.000000000 +0200
38439 +# description: event trigger - test inter-event histogram trigger onmatch-onmax action
38453 +if [ ! -f set_event ]; then
38454 + echo "event tracing is not supported"
38458 +if [ ! -f synthetic_events ]; then
38459 + echo "synthetic event is not supported"
38463 +clear_synthetic_events
38467 +echo "Test create synthetic event"
38469 +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38470 +if [ ! -d events/synthetic/wakeup_latency ]; then
38471 + fail "Failed to create wakeup_latency synthetic event"
38474 +echo "Test create histogram for synthetic event"
38475 +echo "Test histogram variables,simple expression support and onmatch-onmax action"
38477 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38478 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm):onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
38479 +echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
38480 +ping localhost -c 5
38481 +if ! grep -q "ping" events/synthetic/wakeup_latency/hist || ! grep -q "max:" events/sched/sched_switch/hist; then
38482 + fail "Failed to create onmatch-onmax action inter-event histogram"
38488 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
38489 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc 1970-01-01 01:00:00.000000000 +0100
38490 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc 2018-09-05 11:05:07.000000000 +0200
38493 +# description: event trigger - test inter-event histogram trigger onmax action
38507 +if [ ! -f set_event ]; then
38508 + echo "event tracing is not supported"
38512 +if [ ! -f synthetic_events ]; then
38513 + echo "synthetic event is not supported"
38517 +clear_synthetic_events
38521 +echo "Test create synthetic event"
38523 +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38524 +if [ ! -d events/synthetic/wakeup_latency ]; then
38525 + fail "Failed to create wakeup_latency synthetic event"
38528 +echo "Test onmax action"
38530 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_waking/trigger
38531 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
38532 +ping localhost -c 3
38533 +if ! grep -q "max:" events/sched/sched_switch/hist; then
38534 + fail "Failed to create onmax action inter-event histogram"
38540 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
38541 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc 1970-01-01 01:00:00.000000000 +0100
38542 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc 2018-09-05 11:05:07.000000000 +0200
38545 +# description: event trigger - test synthetic event create remove
38558 +if [ ! -f set_event ]; then
38559 + echo "event tracing is not supported"
38563 +if [ ! -f synthetic_events ]; then
38564 + echo "synthetic event is not supported"
38568 +clear_synthetic_events
38572 +echo "Test create synthetic event"
38574 +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38575 +if [ ! -d events/synthetic/wakeup_latency ]; then
38576 + fail "Failed to create wakeup_latency synthetic event"
38581 +echo "Test create synthetic event with an error"
38582 +echo 'wakeup_latency u64 lat pid_t pid char' > synthetic_events > /dev/null
38583 +if [ -d events/synthetic/wakeup_latency ]; then
38584 + fail "Created wakeup_latency synthetic event with an invalid format"
38589 +echo "Test remove synthetic event"
38590 +echo '!wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38591 +if [ -d events/synthetic/wakeup_latency ]; then
38592 + fail "Failed to delete wakeup_latency synthetic event"
38598 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/virt/kvm/arm/arm.c linux-4.14/virt/kvm/arm/arm.c
38599 --- linux-4.14.orig/virt/kvm/arm/arm.c 2018-09-05 11:03:25.000000000 +0200
38600 +++ linux-4.14/virt/kvm/arm/arm.c 2018-09-05 11:05:07.000000000 +0200
38603 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
38605 - BUG_ON(preemptible());
38606 __this_cpu_write(kvm_arm_running_vcpu, vcpu);
38611 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
38613 - BUG_ON(preemptible());
38614 return __this_cpu_read(kvm_arm_running_vcpu);
38617 @@ -653,7 +651,7 @@
38618 * involves poking the GIC, which must be done in a
38619 * non-preemptible context.
38621 - preempt_disable();
38622 + migrate_disable();
38624 kvm_pmu_flush_hwstate(vcpu);
38626 @@ -690,7 +688,7 @@
38627 kvm_pmu_sync_hwstate(vcpu);
38628 kvm_timer_sync_hwstate(vcpu);
38629 kvm_vgic_sync_hwstate(vcpu);
38630 - preempt_enable();
38631 + migrate_enable();
38635 @@ -745,7 +743,7 @@
38637 kvm_vgic_sync_hwstate(vcpu);
38639 - preempt_enable();
38640 + migrate_enable();
38642 ret = handle_exit(vcpu, run, ret);
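The KVM/arm hunks swap preempt_disable() for migrate_disable(): the vcpu entry path must stay on one CPU while poking the GIC, but on RT it must remain preemptible. A sketch of the idiom (the function name is illustrative; on non-RT configurations migrate_disable() is equivalent to disabling preemption):

	#include <linux/preempt.h>	/* migrate_disable()/migrate_enable() per this patch */

	static void touch_per_cpu_vgic_state(void)
	{
		migrate_disable();	/* pinned to this CPU, still preemptible on RT */
		/* ... flush/sync per-CPU interrupt controller state ... */
		migrate_enable();
	}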